diff --git a/.gitignore b/.gitignore index 6fa0865d..1aff930e 100644 --- a/.gitignore +++ b/.gitignore @@ -52,6 +52,8 @@ training/wordlist2dawg *.patch # ignore compilation files +build/* +/bin */.deps/* */.libs/* *.lo @@ -63,8 +65,6 @@ training/wordlist2dawg *.jar # tessdata -*.cube.* -*.tesseract_cube.* *.traineddata # OpenCL @@ -73,5 +73,10 @@ kernel*.bin # build dirs /build* +/.cppan /cppan -/win* \ No newline at end of file +/*.dll +/*.lib +/*.exe +/*.lnk +/win* diff --git a/.travis.yml b/.travis.yml index fdd70463..e28239a7 100644 --- a/.travis.yml +++ b/.travis.yml @@ -2,16 +2,16 @@ language: cpp notifications: email: false - + sudo: required os: - linux - #- osx + - osx -branches: - only: - - master +#branches: + #only: + #- master addons: apt: @@ -24,17 +24,17 @@ addons: before_install: - if [[ $TRAVIS_OS_NAME == linux ]]; then LINUX=true; fi - if [[ $TRAVIS_OS_NAME == osx ]]; then OSX=true; fi - - - if [[ $OSX ]]; then brew update; fi - - export LEPT_VER=1.73 + #- if [[ $OSX ]]; then brew update; fi + + - export LEPT_VER=1.74.1 install: - - if [[ $OSX ]]; then brew install icu4c pango; brew link --force gettext; fi - - if [[ $OSX ]]; then export ICU_ROOT=/usr/local/opt/icu4c ; fi - - wget https://www.cmake.org/files/v3.6/cmake-3.6.1-Linux-x86_64.sh - - sudo sh cmake-3.6.1-Linux-x86_64.sh --skip-license --prefix=/usr - - wget -O leptonica.zip https://github.com/DanBloomberg/leptonica/archive/v$LEPT_VER.zip + #- if [[ $OSX ]]; then brew install icu4c pango; brew link --force gettext; fi + #- if [[ $OSX ]]; then export ICU_ROOT=/usr/local/opt/icu4c ; fi + - if [[ $LINUX ]]; then wget https://www.cmake.org/files/v3.7/cmake-3.7.2-Linux-x86_64.sh; fi + - if [[ $LINUX ]]; then sudo sh cmake-3.7.2-Linux-x86_64.sh --skip-license --prefix=/usr; fi + - wget -O leptonica.zip https://github.com/DanBloomberg/leptonica/archive/$LEPT_VER.zip - unzip leptonica.zip -d . 
- cmake -Hleptonica-$LEPT_VER -Bleptonica-$LEPT_VER/build - make -C leptonica-$LEPT_VER/build diff --git a/AUTHORS b/AUTHORS index 4252027d..4d9c75c4 100644 --- a/AUTHORS +++ b/AUTHORS @@ -2,12 +2,14 @@ Ray Smith (lead developer) Ahmad Abdulkader Rika Antonova Nicholas Beato +Jeff Breidenbach Samuel Charron Phil Cheatle Simon Crouch David Eger Sheelagh Huddleston Dan Johnson +Rajesh Katikam Thomas Kielbus Dar-Shyang Lee Zongyi (Joe) Liu @@ -26,3 +28,15 @@ Joern Wanke Ping Ping Xiu Andrew Ziem Oscar Zuniga + +Community Contributors: +Zdenko Podobný (Maintainer) +Jim Regan (Maintainer) +James R Barlow +Amit Dovev +Martin Ettl +Tom Morris +Tobias Müller +Egor Pugin +Sundar M. Vaidya +Stefan Weil diff --git a/CMakeLists.txt b/CMakeLists.txt index e4bc5cbb..6e7d5c2f 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -10,6 +10,12 @@ cmake_minimum_required(VERSION 2.8.11) +if (NOT APPVEYOR) +if (WIN32 AND (CMAKE_VERSION VERSION_EQUAL 3.6 OR (CMAKE_VERSION VERSION_GREATER 3.6 AND CMAKE_VERSION VERSION_LESS 3.7))) + message(FATAL_ERROR "You have bugged CMake version 3.6 which is known to not work with tesseract. Please, upgrade CMake.") +endif() +endif() + # In-source builds are disabled. 
if (${CMAKE_SOURCE_DIR} STREQUAL ${CMAKE_BINARY_DIR}) message(FATAL_ERROR @@ -40,25 +46,31 @@ set_property(GLOBAL PROPERTY PREDEFINED_TARGETS_FOLDER "CMake Targets") project(tesseract C CXX) -set(VERSION_MAJOR 3) -set(VERSION_MINOR 05) +set(VERSION_MAJOR 4) +set(VERSION_MINOR 00) set(VERSION_PLAIN ${VERSION_MAJOR}.${VERSION_MINOR}) -set(MINIMUM_LEPTONICA_VERSION 1.71) +set(MINIMUM_LEPTONICA_VERSION 1.74) -if(NOT EXISTS ${PROJECT_SOURCE_DIR}/cppan) +if(NOT EXISTS ${PROJECT_SOURCE_DIR}/.cppan) if (NOT Leptonica_DIR AND NOT MSVC) find_package(PkgConfig REQUIRED) - pkg_check_modules(Leptonica REQUIRED lept) + pkg_check_modules(Leptonica REQUIRED lept>=${MINIMUM_LEPTONICA_VERSION}) else() find_package(Leptonica ${MINIMUM_LEPTONICA_VERSION} REQUIRED CONFIG) endif() else() - add_subdirectory(cppan) + if (STATIC) + set(CPPAN_BUILD_SHARED_LIBS 0) + else() + set(CPPAN_BUILD_SHARED_LIBS 1) + endif() + add_subdirectory(.cppan) endif() find_package(OpenCL QUIET) -find_package(PkgConfig) + +option(BUILD_TRAINING_TOOLS "Build training tools" ON) ############################################################################### # @@ -76,6 +88,9 @@ if (WIN32) add_definitions(-D_CRT_SECURE_NO_WARNINGS) set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /MP") + if (APPVEYOR) + set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} /W0") + endif() endif() set(LIB_Ws2_32 Ws2_32) @@ -104,12 +119,7 @@ include(Configure) configure_file(${AUTOCONFIG_SRC} ${AUTOCONFIG} @ONLY) -set(INCLUDE_DIR - ${CMAKE_SOURCE_DIR}/api - ${CMAKE_SOURCE_DIR}/ccmain - ${CMAKE_SOURCE_DIR}/ccstruct - ${CMAKE_SOURCE_DIR}/ccutil -) +set(INCLUDE_DIR "${CMAKE_INSTALL_PREFIX}/include" "${CMAKE_INSTALL_PREFIX}/include/tesseract") configure_file( ${CMAKE_SOURCE_DIR}/cmake/templates/TesseractConfig-version.cmake.in @@ -137,14 +147,14 @@ include_directories(${Leptonica_INCLUDE_DIRS}) include_directories(${CMAKE_BINARY_DIR}) include_directories(api) +include_directories(arch) include_directories(ccmain) include_directories(ccstruct) 
include_directories(ccutil) include_directories(classify) -include_directories(cube) include_directories(cutil) include_directories(dict) -include_directories(neural_networks/runtime) +include_directories(lstm) include_directories(opencl) include_directories(textord) include_directories(vs2010/port) @@ -159,14 +169,14 @@ string(SUBSTRING ${VERSION_MINOR} 0 1 VERSION_MINOR_0) string(SUBSTRING ${VERSION_MINOR} 1 1 VERSION_MINOR_1) file(GLOB tesseract_src + arch/*.cpp ccmain/*.cpp ccstruct/*.cpp ccutil/*.cpp classify/*.cpp - cube/*.cpp cutil/*.cpp dict/*.cpp - neural_networks/runtime/*.cpp + lstm/*.cpp opencl/*.cpp textord/*.cpp viewer/*.cpp @@ -174,14 +184,14 @@ file(GLOB tesseract_src ) file(GLOB tesseract_hdr api/*.h + arch/*.h ccmain/*.h ccstruct/*.h ccutil/*.h classify/*.h - cube/*.h cutil/*.h dict/*.h - neural_networks/runtime/*.h + lstm/*.h opencl/*.h textord/*.h viewer/*.h @@ -201,25 +211,40 @@ set(tesseract_src ${tesseract_src} api/pdfrenderer.cpp ) -add_library (tesseract ${LIBRARY_TYPE} ${tesseract_src} ${tesseract_hdr}) -if (NOT STATIC) -target_compile_definitions (tesseract PUBLIC -DTESS_EXPORTS) -endif() -target_link_libraries (tesseract ${LIB_Ws2_32} ${LIB_pthread}) -set_target_properties (tesseract PROPERTIES VERSION ${VERSION_MAJOR}.${VERSION_MINOR_0}.${VERSION_MINOR_1}) -set_target_properties (tesseract PROPERTIES SOVERSION ${VERSION_MAJOR}.${VERSION_MINOR_0}.${VERSION_MINOR_1}) if (WIN32) -set_target_properties (tesseract PROPERTIES OUTPUT_NAME tesseract${VERSION_MAJOR}${VERSION_MINOR}) -set_target_properties (tesseract PROPERTIES DEBUG_OUTPUT_NAME tesseract${VERSION_MAJOR}${VERSION_MINOR}d) + set_source_files_properties( + ${CMAKE_CURRENT_SOURCE_DIR}/arch/dotproductsse.cpp + PROPERTIES COMPILE_DEFINITIONS __SSE4_1__) + if (MSVC) + set_source_files_properties( + ${CMAKE_CURRENT_SOURCE_DIR}/arch/dotproductavx.cpp + PROPERTIES COMPILE_FLAGS "/arch:AVX") + endif() +endif() + +add_library (libtesseract ${LIBRARY_TYPE} ${tesseract_src} ${tesseract_hdr}) 
+if (NOT STATIC) +target_compile_definitions (libtesseract + PRIVATE -DTESS_EXPORTS + INTERFACE -DTESS_IMPORTS +) +set_target_properties (libtesseract PROPERTIES WINDOWS_EXPORT_ALL_SYMBOLS True) +endif() +target_link_libraries (libtesseract ${LIB_Ws2_32} ${LIB_pthread}) +set_target_properties (libtesseract PROPERTIES VERSION ${VERSION_MAJOR}.${VERSION_MINOR_0}.${VERSION_MINOR_1}) +set_target_properties (libtesseract PROPERTIES SOVERSION ${VERSION_MAJOR}.${VERSION_MINOR_0}.${VERSION_MINOR_1}) +if (WIN32) +set_target_properties (libtesseract PROPERTIES OUTPUT_NAME tesseract${VERSION_MAJOR}${VERSION_MINOR}) +set_target_properties (libtesseract PROPERTIES DEBUG_OUTPUT_NAME tesseract${VERSION_MAJOR}${VERSION_MINOR}d) endif() if (NOT CPPAN_BUILD) - target_link_libraries (tesseract ${Leptonica_LIBRARIES}) - export(TARGETS tesseract FILE ${CMAKE_BINARY_DIR}/TesseractTargets.cmake) + target_link_libraries (libtesseract ${Leptonica_LIBRARIES}) + export(TARGETS libtesseract FILE ${CMAKE_BINARY_DIR}/TesseractTargets.cmake) else() - target_link_libraries (tesseract cppan) + target_link_libraries (libtesseract pvt.cppan.demo.danbloomberg.leptonica) file(WRITE ${CMAKE_BINARY_DIR}/TesseractTargets.cmake "include(${CMAKE_BINARY_DIR}/cppan.cmake)\n") - export(TARGETS tesseract APPEND FILE ${CMAKE_BINARY_DIR}/TesseractTargets.cmake) + export(TARGETS libtesseract APPEND FILE ${CMAKE_BINARY_DIR}/TesseractTargets.cmake) endif() ######################################## @@ -231,12 +256,97 @@ set(tesseractmain_src vs2010/tesseract/resource.h vs2010/tesseract/tesseract.rc ) -add_executable (tesseractmain ${tesseractmain_src}) -target_link_libraries (tesseractmain tesseract) -set_target_properties (tesseractmain PROPERTIES OUTPUT_NAME tesseract) +add_executable (tesseract ${tesseractmain_src}) +target_link_libraries (tesseract libtesseract) ######################################## +if (BUILD_TRAINING_TOOLS) add_subdirectory(training) +endif() + +get_target_property(tesseract_NAME 
libtesseract NAME) +get_target_property(tesseract_VERSION libtesseract VERSION) +get_target_property(tesseract_OUTPUT_NAME libtesseract OUTPUT_NAME) +configure_file(tesseract.pc.cmake ${CMAKE_CURRENT_BINARY_DIR}/tesseract.pc @ONLY) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/tesseract.pc DESTINATION lib/pkgconfig) +install(TARGETS tesseract RUNTIME DESTINATION bin LIBRARY DESTINATION lib ARCHIVE DESTINATION lib) +install(TARGETS libtesseract EXPORT TesseractTargets RUNTIME DESTINATION bin LIBRARY DESTINATION lib ARCHIVE DESTINATION lib) +install(EXPORT TesseractTargets DESTINATION cmake) +install(FILES + ${CMAKE_BINARY_DIR}/TesseractConfig.cmake + ${CMAKE_BINARY_DIR}/TesseractConfig-version.cmake + DESTINATION cmake) + +install(FILES + # from api/makefile.am + api/apitypes.h + api/baseapi.h + api/capi.h + api/renderer.h + + #from arch/makefile.am + arch/dotproductavx.h + arch/dotproductsse.h + arch/simddetect.h + + #from ccmain/makefile.am + ccmain/thresholder.h + ccmain/ltrresultiterator.h + ccmain/pageiterator.h + ccmain/resultiterator.h + ccmain/osdetect.h + + #from ccstruct/makefile.am + ccstruct/publictypes.h + + #from ccutil/makefile.am + ccutil/basedir.h + ccutil/errcode.h + ccutil/fileerr.h + ccutil/genericvector.h + ccutil/helpers.h + ccutil/host.h + ccutil/memry.h + ccutil/ndminx.h + ccutil/params.h + ccutil/ocrclass.h + ccutil/platform.h + ccutil/serialis.h + ccutil/strngs.h + ccutil/tesscallback.h + ccutil/unichar.h + ccutil/unicharcompress.h + ccutil/unicharmap.h + ccutil/unicharset.h + + #from lstm/makefile.am + lstm/convolve.h + lstm/ctc.h + lstm/fullyconnected.h + lstm/functions.h + lstm/input.h + lstm/lstm.h + lstm/lstmrecognizer.h + lstm/lstmtrainer.h + lstm/maxpool.h + lstm/networkbuilder.h + lstm/network.h + lstm/networkio.h + lstm/networkscratch.h + lstm/parallel.h + lstm/plumbing.h + lstm/recodebeam.h + lstm/reconfig.h + lstm/reversed.h + lstm/series.h + lstm/static_shape.h + lstm/stridemap.h + lstm/tfnetwork.h + lstm/weightmatrix.h + + 
#${CMAKE_BINARY_DIR}/src/endianness.h + DESTINATION include/tesseract) + ############################################################################### diff --git a/CONTRIBUTING.md b/CONTRIBUTING.md index 0d29fc45..417f6062 100644 --- a/CONTRIBUTING.md +++ b/CONTRIBUTING.md @@ -24,7 +24,7 @@ When creating an issue, please report your operating system, including its speci Search through open and closed issues to see if similar issue has been reported already (and sometimes also has been solved). -Similary, before you post your question in the forum, search through past threads to see if similar question has been asked already. +Similarly, before you post your question in the forum, search through past threads to see if similar question has been asked already. Read the [wiki](https://github.com/tesseract-ocr/tesseract/wiki) before you report your issue or ask a question in the forum. @@ -44,7 +44,7 @@ When attaching a file to the issue report / forum ... Do not attach programs or libraries to your issues/posts. -For large files or for programs, add a link to a iocation where they can be downloaded (your site, Git repo, Google Drive, Dropbox etc.) +For large files or for programs, add a link to a location where they can be downloaded (your site, Git repo, Google Drive, Dropbox etc.) Attaching a multi-page TIFF image is useful only if you have problem with multi-page functionality, otherwise attach only one or a few single page images. diff --git a/COPYING b/COPYING index eb5b7546..be5ebaed 100644 --- a/COPYING +++ b/COPYING @@ -17,5 +17,5 @@ in this distribution is now licensed under the Apache License: Other Dependencies and Licenses: ================================ -Tesseract uses Leptonica library (http://leptonica.com/) with a very weakly -restricted copyright license (http://leptonica.com/about-the-license.html) +Tesseract uses Leptonica library (http://leptonica.com/) which essentially +uses a BSD 2-clause license. 
(http://leptonica.com/about-the-license.html) diff --git a/ChangeLog b/ChangeLog index 492d6984..1e634e3b 100644 --- a/ChangeLog +++ b/ChangeLog @@ -1,3 +1,46 @@ +2017-03-24 - V4.00.00-alpha + * Added new neural network system based on LSTMs, with major accuracy gains. + * Improvements to PDF rendering. + * Fixes to trainingdata rendering. + * Added LSTM models+lang models to 101 languages. (tessdata repository) + * Improved multi-page TIFF handling. + * Fixed damage to binary images when processing PDFs. + * Fixes to training process to allow incremental training from a recognition model. + * Made LSTM the default engine, pushed cube out. + * Deleted cube code. + * Changed OEModes --oem 0 for legacy tesseract engine, --oem 1 for LSTM, --oem 2 for both, --oem 3 for default. + * Avoid use of Leptonica debug parameters or functions. + * Fixed multi-language mode. + * Removed support for VS2010. + * Added Support for VS2015 and VS2017 with CPPAN. + * Implemented invisible text only for PDF. + * Added AVX / SSE support for Windows + * Enabled OpenMP support. + * Miscellaneous Fixes. + +2017-02-16 - V3.05.00 + * Made some fine tuning to the hOCR output. + * Added TSV as another optional output format. + * Fixed ABI break introduced in 3.04.00 with the AnalyseLayout() method. + * text2image tool - Enable all OpenType ligatures available in a font. This feature requires Pango 1.38 or newer. + * Training tools - Replaced asserts with tprintf() and exit(1). + * Fixed Cygwin compatibility. + * Improved multipage tiff processing. + * Improved the embedded pdf font (pdf.ttf). + * Enable selection of OCR engine mode from command line. + * Changed tesseract command line parameter '-psm' to '--psm'. + * Write output of tesseract --help, --version and --list-langs to stdout instead of stderr. + * Added new C API for orientation and script detection, removed the old one. + * Increased minimum autoconf version to 2.59. + * Removed dead code. + * Require Leptonica 1.74 or higher. 
+ * Fixed many compiler warning. + * Fixed memory and resource leaks. + * Fixed some issues with the 'Cube' OCR engine. + * Fixed some openCL issues. + * Added option to build Tesseract with CMake build system. + * Implemented CPPAN support for easy Windows building. + 2016-02-17 - V3.04.01 * Added OSD renderer for psm 0. Works for single page and multi-page images. * Improve tesstrain.sh script. diff --git a/INSTALL b/INSTALL index f2f22037..22aa2e0a 100644 --- a/INSTALL +++ b/INSTALL @@ -45,7 +45,7 @@ The simplest way to compile this package is: `sh ./configure' instead to prevent `csh' from trying to execute `configure' itself. - Running `configure' takes awhile. While running, it prints some + Running `configure' takes a while. While running, it prints some messages telling which features it is checking for. 2. Type `make' to compile the package. diff --git a/INSTALL.GIT.md b/INSTALL.GIT.md index 07acbb0b..f0f4999d 100644 --- a/INSTALL.GIT.md +++ b/INSTALL.GIT.md @@ -3,11 +3,12 @@ If you have cloned Tesseract from GitHub, you must generate the configure script. -If you have tesseract 3.0x installation in your system, please remove it +If you have tesseract 4.0x installation in your system, please remove it before new build. Known dependencies for training tools (excluding leptonica): - * compiler with c++ support + * compiler with c++11 support + * autoconf-archive * pango-devel * cairo-devel * icu-devel @@ -24,7 +25,7 @@ So, the steps for making Tesseract are: You need to install at least English language and OSD data files to TESSDATA_PREFIX directory. You can retrieve single file with tools like [wget](https://www.gnu.org/software/wget/), [curl](https://curl.haxx.se/), [GithubDownloader](https://github.com/intezer/GithubDownloader) or browser. 
-All language data files can be retrieved from git repository (usefull only for packagers!): +All language data files can be retrieved from git repository (useful only for packagers!): $ git clone https://github.com/tesseract-ocr/tessdata.git tesseract-ocr.tessdata diff --git a/LICENSE b/LICENSE new file mode 100644 index 00000000..d6456956 --- /dev/null +++ b/LICENSE @@ -0,0 +1,202 @@ + + Apache License + Version 2.0, January 2004 + http://www.apache.org/licenses/ + + TERMS AND CONDITIONS FOR USE, REPRODUCTION, AND DISTRIBUTION + + 1. Definitions. + + "License" shall mean the terms and conditions for use, reproduction, + and distribution as defined by Sections 1 through 9 of this document. + + "Licensor" shall mean the copyright owner or entity authorized by + the copyright owner that is granting the License. + + "Legal Entity" shall mean the union of the acting entity and all + other entities that control, are controlled by, or are under common + control with that entity. For the purposes of this definition, + "control" means (i) the power, direct or indirect, to cause the + direction or management of such entity, whether by contract or + otherwise, or (ii) ownership of fifty percent (50%) or more of the + outstanding shares, or (iii) beneficial ownership of such entity. + + "You" (or "Your") shall mean an individual or Legal Entity + exercising permissions granted by this License. + + "Source" form shall mean the preferred form for making modifications, + including but not limited to software source code, documentation + source, and configuration files. + + "Object" form shall mean any form resulting from mechanical + transformation or translation of a Source form, including but + not limited to compiled object code, generated documentation, + and conversions to other media types. 
+ + "Work" shall mean the work of authorship, whether in Source or + Object form, made available under the License, as indicated by a + copyright notice that is included in or attached to the work + (an example is provided in the Appendix below). + + "Derivative Works" shall mean any work, whether in Source or Object + form, that is based on (or derived from) the Work and for which the + editorial revisions, annotations, elaborations, or other modifications + represent, as a whole, an original work of authorship. For the purposes + of this License, Derivative Works shall not include works that remain + separable from, or merely link (or bind by name) to the interfaces of, + the Work and Derivative Works thereof. + + "Contribution" shall mean any work of authorship, including + the original version of the Work and any modifications or additions + to that Work or Derivative Works thereof, that is intentionally + submitted to Licensor for inclusion in the Work by the copyright owner + or by an individual or Legal Entity authorized to submit on behalf of + the copyright owner. For the purposes of this definition, "submitted" + means any form of electronic, verbal, or written communication sent + to the Licensor or its representatives, including but not limited to + communication on electronic mailing lists, source code control systems, + and issue tracking systems that are managed by, or on behalf of, the + Licensor for the purpose of discussing and improving the Work, but + excluding communication that is conspicuously marked or otherwise + designated in writing by the copyright owner as "Not a Contribution." + + "Contributor" shall mean Licensor and any individual or Legal Entity + on behalf of whom a Contribution has been received by Licensor and + subsequently incorporated within the Work. + + 2. Grant of Copyright License. 
Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + copyright license to reproduce, prepare Derivative Works of, + publicly display, publicly perform, sublicense, and distribute the + Work and such Derivative Works in Source or Object form. + + 3. Grant of Patent License. Subject to the terms and conditions of + this License, each Contributor hereby grants to You a perpetual, + worldwide, non-exclusive, no-charge, royalty-free, irrevocable + (except as stated in this section) patent license to make, have made, + use, offer to sell, sell, import, and otherwise transfer the Work, + where such license applies only to those patent claims licensable + by such Contributor that are necessarily infringed by their + Contribution(s) alone or by combination of their Contribution(s) + with the Work to which such Contribution(s) was submitted. If You + institute patent litigation against any entity (including a + cross-claim or counterclaim in a lawsuit) alleging that the Work + or a Contribution incorporated within the Work constitutes direct + or contributory patent infringement, then any patent licenses + granted to You under this License for that Work shall terminate + as of the date such litigation is filed. + + 4. Redistribution. 
You may reproduce and distribute copies of the + Work or Derivative Works thereof in any medium, with or without + modifications, and in Source or Object form, provided that You + meet the following conditions: + + (a) You must give any other recipients of the Work or + Derivative Works a copy of this License; and + + (b) You must cause any modified files to carry prominent notices + stating that You changed the files; and + + (c) You must retain, in the Source form of any Derivative Works + that You distribute, all copyright, patent, trademark, and + attribution notices from the Source form of the Work, + excluding those notices that do not pertain to any part of + the Derivative Works; and + + (d) If the Work includes a "NOTICE" text file as part of its + distribution, then any Derivative Works that You distribute must + include a readable copy of the attribution notices contained + within such NOTICE file, excluding those notices that do not + pertain to any part of the Derivative Works, in at least one + of the following places: within a NOTICE text file distributed + as part of the Derivative Works; within the Source form or + documentation, if provided along with the Derivative Works; or, + within a display generated by the Derivative Works, if and + wherever such third-party notices normally appear. The contents + of the NOTICE file are for informational purposes only and + do not modify the License. You may add Your own attribution + notices within Derivative Works that You distribute, alongside + or as an addendum to the NOTICE text from the Work, provided + that such additional attribution notices cannot be construed + as modifying the License. 
+ + You may add Your own copyright statement to Your modifications and + may provide additional or different license terms and conditions + for use, reproduction, or distribution of Your modifications, or + for any such Derivative Works as a whole, provided Your use, + reproduction, and distribution of the Work otherwise complies with + the conditions stated in this License. + + 5. Submission of Contributions. Unless You explicitly state otherwise, + any Contribution intentionally submitted for inclusion in the Work + by You to the Licensor shall be under the terms and conditions of + this License, without any additional terms or conditions. + Notwithstanding the above, nothing herein shall supersede or modify + the terms of any separate license agreement you may have executed + with Licensor regarding such Contributions. + + 6. Trademarks. This License does not grant permission to use the trade + names, trademarks, service marks, or product names of the Licensor, + except as required for reasonable and customary use in describing the + origin of the Work and reproducing the content of the NOTICE file. + + 7. Disclaimer of Warranty. Unless required by applicable law or + agreed to in writing, Licensor provides the Work (and each + Contributor provides its Contributions) on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or + implied, including, without limitation, any warranties or conditions + of TITLE, NON-INFRINGEMENT, MERCHANTABILITY, or FITNESS FOR A + PARTICULAR PURPOSE. You are solely responsible for determining the + appropriateness of using or redistributing the Work and assume any + risks associated with Your exercise of permissions under this License. + + 8. Limitation of Liability. 
In no event and under no legal theory, + whether in tort (including negligence), contract, or otherwise, + unless required by applicable law (such as deliberate and grossly + negligent acts) or agreed to in writing, shall any Contributor be + liable to You for damages, including any direct, indirect, special, + incidental, or consequential damages of any character arising as a + result of this License or out of the use or inability to use the + Work (including but not limited to damages for loss of goodwill, + work stoppage, computer failure or malfunction, or any and all + other commercial damages or losses), even if such Contributor + has been advised of the possibility of such damages. + + 9. Accepting Warranty or Additional Liability. While redistributing + the Work or Derivative Works thereof, You may choose to offer, + and charge a fee for, acceptance of support, warranty, indemnity, + or other liability obligations and/or rights consistent with this + License. However, in accepting such obligations, You may act only + on Your own behalf and on Your sole responsibility, not on behalf + of any other Contributor, and only if You agree to indemnify, + defend, and hold each Contributor harmless for any liability + incurred by, or claims asserted against, such Contributor by reason + of your accepting any such warranty or additional liability. + + END OF TERMS AND CONDITIONS + + APPENDIX: How to apply the Apache License to your work. + + To apply the Apache License to your work, attach the following + boilerplate notice, with the fields enclosed by brackets "[]" + replaced with your own identifying information. (Don't include + the brackets!) The text should be enclosed in the appropriate + comment syntax for the file format. We also recommend that a + file or class name and description of purpose be included on the + same "printed page" as the copyright notice for easier + identification within third-party archives. 
+ + Copyright [yyyy] [name of copyright owner] + + Licensed under the Apache License, Version 2.0 (the "License"); + you may not use this file except in compliance with the License. + You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + + Unless required by applicable law or agreed to in writing, software + distributed under the License is distributed on an "AS IS" BASIS, + WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + See the License for the specific language governing permissions and + limitations under the License. diff --git a/Makefile.am b/Makefile.am index a4aa1dd9..258cbdd0 100644 --- a/Makefile.am +++ b/Makefile.am @@ -4,9 +4,12 @@ ACLOCAL_AMFLAGS = -I m4 if ENABLE_TRAINING TRAINING_SUBDIR = training training: + $(MAKE) @cd "$(top_builddir)/training" && $(MAKE) training-install: @cd "$(top_builddir)/training" && $(MAKE) install +training-uninstall: + @cd "$(top_builddir)/training" && $(MAKE) uninstall clean-local: @cd "$(top_builddir)/training" && $(MAKE) clean else @@ -16,10 +19,7 @@ endif .PHONY: install-langs ScrollView.jar install-jars training -SUBDIRS = ccutil viewer cutil opencl ccstruct dict classify wordrec textord -if !NO_CUBE_BUILD - SUBDIRS += neural_networks/runtime cube -endif +SUBDIRS = arch ccutil viewer cutil opencl ccstruct dict classify wordrec textord lstm SUBDIRS += ccmain api . tessdata doc EXTRA_DIST = README.md\ @@ -35,14 +35,14 @@ dist-hook: # Need to remove .svn directories from directories # added using EXTRA_DIST. $(distdir)/tessdata would in # theory suffice. 
- rm -rf `find $(distdir) -name .svn` - rm -rf `find $(distdir) -name .git` - rm -rf `find $(distdir) -name .deps` - rm -rf `find $(distdir) -name .libs` - rm -rf `find $(distdir) -name *.o` - rm -rf `find $(distdir) -name *.lo` - rm -rf `find $(distdir) -name *.la` - rm -rf `find $(distdir)/training -executable -type f` + rm -rf $(find $(distdir) -name .svn) + rm -rf $(find $(distdir) -name .git) + rm -rf $(find $(distdir) -name .deps) + rm -rf $(find $(distdir) -name .libs) + rm -rf $(find $(distdir) -name *.o) + rm -rf $(find $(distdir) -name *.lo) + rm -rf $(find $(distdir) -name *.la) + rm -rf $(find $(distdir)/training -executable -type f) rm -rf $(distdir)/doc/html/* ScrollView.jar: diff --git a/README.md b/README.md index fd64016d..f9e81194 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,30 @@ [![Build Status](https://travis-ci.org/tesseract-ocr/tesseract.svg?branch=master)](https://travis-ci.org/tesseract-ocr/tesseract) -[![Build status](https://ci.appveyor.com/api/projects/status/miah0ikfsf0j3819?svg=true)](https://ci.appveyor.com/project/zdenop/tesseract/) +[![Build status](https://ci.appveyor.com/api/projects/status/miah0ikfsf0j3819/branch/master?svg=true)](https://ci.appveyor.com/project/zdenop/tesseract/) For the latest online version of the README.md see: https://github.com/tesseract-ocr/tesseract/blob/master/README.md -#About +# About This package contains an OCR engine - `libtesseract` and a command line program - `tesseract`. The lead developer is Ray Smith. The maintainer is Zdenko Podobny. -For a list of contributors see [AUTHORS](https://github.com/tesseract-ocr/tesseract/blob/master/AUTHORS) and github's log of [contributors](https://github.com/tesseract-ocr/tesseract/graphs/contributors). +For a list of contributors see [AUTHORS](https://github.com/tesseract-ocr/tesseract/blob/master/AUTHORS) +and GitHub's log of [contributors](https://github.com/tesseract-ocr/tesseract/graphs/contributors). 
Tesseract has unicode (UTF-8) support, and can recognize more than 100 languages "out of the box". It can be trained to recognize other languages. See [Tesseract Training](https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract) for more information. Tesseract supports various output formats: plain-text, hocr(html), pdf. -This project does not include a GUI application. If you need one, please see the [3rdParty](https://github.com/tesseract-ocr/tesseract/wiki/3rdParty) wiki page. +This project does not include a GUI application. If you need one, please see the [3rdParty](https://github.com/tesseract-ocr/tesseract/wiki/User-Projects-%E2%80%93-3rdParty) wiki page. You should note that in many cases, in order to get better OCR results, you'll need to [improve the quality](https://github.com/tesseract-ocr/tesseract/wiki/ImproveQuality) of the image you are giving Tesseract. -The latest stable version is 3.04.01, released in February 2016. +The latest stable version is 3.05.00, released in February 2017. -#Brief history +# Brief history Tesseract was originally developed at Hewlett-Packard Laboratories Bristol and at Hewlett-Packard Co, Greeley Colorado between 1985 and 1994, with some @@ -33,13 +34,13 @@ In 2005 Tesseract was open sourced by HP. Since 2006 it is developed by Google. [Release Notes](https://github.com/tesseract-ocr/tesseract/wiki/ReleaseNotes) -#For developers +# For developers Developers can use `libtesseract` [C](https://github.com/tesseract-ocr/tesseract/blob/master/api/capi.h) or [C++](https://github.com/tesseract-ocr/tesseract/blob/master/api/baseapi.h) API to build their own application. If you need bindings to `libtesseract` for other programming languages, please see the [wrapper](https://github.com/tesseract-ocr/tesseract/wiki/AddOns#tesseract-wrappers) section on AddOns wiki page. Documentation of Tesseract generated from source code by doxygen can be found on [tesseract-ocr.github.io](http://tesseract-ocr.github.io/). 
-#License +# License The code in this repository is licensed under the Apache License, Version 2.0 (the "License"); you may not use this file except in compliance with the License. @@ -55,19 +56,27 @@ Documentation of Tesseract generated from source code by doxygen can be found on **NOTE**: This software depends on other packages that may be licensed under different open source licenses. -#Installing Tesseract +# Installing Tesseract You can either [Install Tesseract via pre-built binary package](https://github.com/tesseract-ocr/tesseract/wiki) or [build it from source](https://github.com/tesseract-ocr/tesseract/wiki/Compiling). -#Running Tesseract +## Supported Compilers + +* GCC 4.8 and above +* Clang 3.4 and above +* MSVC 2015, 2017 + +Other compilers might work, but are not officially supported. + +# Running Tesseract Basic command line usage: - tesseract imagename outputbase [-l lang] [-psm pagesegmode] [configfiles...] + tesseract imagename outputbase [-l lang] [--psm pagesegmode] [configfiles...] For more information about the various command line options use `tesseract --help` or `man tesseract`. -#Support +# Support Mailing-lists: * [tesseract-ocr](https://groups.google.com/d/forum/tesseract-ocr) - For tesseract users. 
diff --git a/android/jni/Android.mk b/android/jni/Android.mk index 46248014..170bb513 100644 --- a/android/jni/Android.mk +++ b/android/jni/Android.mk @@ -4,7 +4,7 @@ include $(CLEAR_VARS) LOCAL_MODULE := tesseract-$(APP_ABI) LOCAL_STATIC_LIBRARIES := \ - mobile_base \ + base \ leptonica-$(APP_ABI) LOCAL_C_INCLUDES := $(APP_C_INCLUDES) @@ -30,13 +30,6 @@ $(info local path=$(LOCAL_PATH)) LOCAL_SRC_FILES := $(wildcard $(LOCAL_PATH)/../../api/*.cpp $(LOCAL_PATH)/../../ccmain/*.cpp $(LOCAL_PATH)/../../ccstruct/*.cpp $(LOCAL_PATH)/../../ccutil/*.cpp $(LOCAL_PATH)/../../classify/*.cpp $(LOCAL_PATH)/../../cutil/*.cpp $(LOCAL_PATH)/../../dict/*.cpp $(LOCAL_PATH)/../../image/*.cpp $(LOCAL_PATH)/../../textord/*.cpp $(LOCAL_PATH)/../../viewer/*.cpp $(LOCAL_PATH)/../../wordrec/*.cpp) EXPLICIT_SRC_EXCLUDES := \ - $(LOCAL_PATH)/../../ccmain/cubeclassifier.cpp \ - $(LOCAL_PATH)/../../ccmain/cubeclassifier.h \ - $(LOCAL_PATH)/../../ccmain/cube_control.cpp \ - $(LOCAL_PATH)/../../ccmain/cube_reco_context.cpp \ - $(LOCAL_PATH)/../../ccmain/cube_reco_context.h \ - $(LOCAL_PATH)/../../ccmain/tesseract_cube_combiner.cpp \ - $(LOCAL_PATH)/../../ccmain/tesseract_cube_combiner.h \ $(LOCAL_PATH)/../../api/pdfrenderer.cpp \ $(LOCAL_PATH)/../../api/tesseractmain.cpp \ @@ -47,11 +40,10 @@ LOCAL_SRC_FILES := $(LOCAL_SRC_FILES:$(LOCAL_PATH)/%=%) $(info local src files = $(LOCAL_SRC_FILES)) LOCAL_LDLIBS := -ldl -llog -ljnigraphics -LOCAL_CFLAGS := -DANDROID_BUILD -DNO_CUBE_BUILD -DGRAPHICS_DISABLED +LOCAL_CFLAGS := -DANDROID_BUILD -DGRAPHICS_DISABLED include $(BUILD_SHARED_LIBRARY) -$(call import-module,mobile/base) -$(call import-module,mobile/base) +$(call import-module,base/port) $(call import-module,mobile/util/hash) $(call import-module,third_party/leptonica/android/jni) diff --git a/api/Makefile.am b/api/Makefile.am index 9d20919b..b8c8a376 100644 --- a/api/Makefile.am +++ b/api/Makefile.am @@ -1,6 +1,7 @@ AM_CPPFLAGS += -DLOCALEDIR=\"$(localedir)\"\ -DUSE_STD_NAMESPACE \ - 
-I$(top_srcdir)/ccutil -I$(top_srcdir)/ccstruct -I$(top_srcdir)/cube \ + -I$(top_srcdir)/arch -I$(top_srcdir)/lstm \ + -I$(top_srcdir)/ccutil -I$(top_srcdir)/ccstruct \ -I$(top_srcdir)/viewer \ -I$(top_srcdir)/textord -I$(top_srcdir)/dict \ -I$(top_srcdir)/classify -I$(top_srcdir)/ccmain \ @@ -27,15 +28,15 @@ libtesseract_api_la_LIBADD = \ ../wordrec/libtesseract_wordrec.la \ ../classify/libtesseract_classify.la \ ../dict/libtesseract_dict.la \ + ../arch/libtesseract_arch.la \ + ../arch/libtesseract_avx.la \ + ../arch/libtesseract_sse.la \ + ../lstm/libtesseract_lstm.la \ ../ccstruct/libtesseract_ccstruct.la \ ../cutil/libtesseract_cutil.la \ ../viewer/libtesseract_viewer.la \ ../ccutil/libtesseract_ccutil.la \ ../opencl/libtesseract_opencl.la - if !NO_CUBE_BUILD - libtesseract_api_la_LIBADD += ../cube/libtesseract_cube.la \ - ../neural_networks/runtime/libtesseract_neural.la \ - endif endif libtesseract_api_la_CPPFLAGS = $(AM_CPPFLAGS) @@ -45,7 +46,7 @@ endif libtesseract_api_la_SOURCES = baseapi.cpp capi.cpp renderer.cpp pdfrenderer.cpp lib_LTLIBRARIES += libtesseract.la -libtesseract_la_LDFLAGS = +libtesseract_la_LDFLAGS = $(LEPTONICA_LIBS) $(OPENCL_LDFLAGS) libtesseract_la_SOURCES = # Dummy C++ source to cause C++ linking. 
# see http://www.gnu.org/s/hello/manual/automake/Libtool-Convenience-Libraries.html#Libtool-Convenience-Libraries @@ -57,15 +58,15 @@ libtesseract_la_LIBADD = \ ../wordrec/libtesseract_wordrec.la \ ../classify/libtesseract_classify.la \ ../dict/libtesseract_dict.la \ + ../arch/libtesseract_arch.la \ + ../arch/libtesseract_avx.la \ + ../arch/libtesseract_sse.la \ + ../lstm/libtesseract_lstm.la \ ../ccstruct/libtesseract_ccstruct.la \ ../cutil/libtesseract_cutil.la \ ../viewer/libtesseract_viewer.la \ ../ccutil/libtesseract_ccutil.la \ ../opencl/libtesseract_opencl.la -if !NO_CUBE_BUILD -libtesseract_la_LIBADD += ../cube/libtesseract_cube.la \ - ../neural_networks/runtime/libtesseract_neural.la -endif libtesseract_la_LDFLAGS += -version-info $(GENERIC_LIBRARY_VERSION) -no-undefined @@ -81,9 +82,10 @@ tesseract_LDADD = libtesseract.la tesseract_LDFLAGS = $(OPENCL_LDFLAGS) -if OPENMP -tesseract_LDADD += $(OPENMP_CFLAGS) -endif +tesseract_LDADD += $(LEPTONICA_LIBS) +tesseract_LDADD += $(OPENMP_CXXFLAGS) + +tesseract_LDADD += -ltiff if T_WIN tesseract_LDADD += -lws2_32 @@ -92,4 +94,3 @@ endif if ADD_RT tesseract_LDADD += -lrt endif - diff --git a/api/apitypes.h b/api/apitypes.h index 2cb38add..2c0e85c9 100644 --- a/api/apitypes.h +++ b/api/apitypes.h @@ -17,8 +17,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_API_APITYPES_H__ -#define TESSERACT_API_APITYPES_H__ +#ifndef TESSERACT_API_APITYPES_H_ +#define TESSERACT_API_APITYPES_H_ #include "publictypes.h" @@ -30,4 +30,4 @@ // than the lower-level one, and lower-level code should be sure to include // only the lower-level file. 
-#endif // TESSERACT_API_APITYPES_H__ +#endif // TESSERACT_API_APITYPES_H_ diff --git a/api/baseapi.cpp b/api/baseapi.cpp index 534f3f00..07164b2f 100644 --- a/api/baseapi.cpp +++ b/api/baseapi.cpp @@ -34,8 +34,6 @@ // workaround for stdlib.h with -std=c++11 for _splitpath and _MAX_FNAME #undef __STRICT_ANSI__ #endif // _MSC_VER -#include -#include #include #include #else @@ -110,27 +108,30 @@ const int kMinCredibleResolution = 70; const int kMaxCredibleResolution = 2400; TessBaseAPI::TessBaseAPI() - : tesseract_(NULL), - osd_tesseract_(NULL), - equ_detect_(NULL), - // Thresholder is initialized to NULL here, but will be set before use by: - // A constructor of a derived API, SetThresholder(), or - // created implicitly when used in InternalSetImage. - thresholder_(NULL), - paragraph_models_(NULL), - block_list_(NULL), - page_res_(NULL), - input_file_(NULL), - input_image_(NULL), - output_file_(NULL), - datapath_(NULL), - language_(NULL), - last_oem_requested_(OEM_DEFAULT), - recognition_done_(false), - truth_cb_(NULL), - rect_left_(0), rect_top_(0), rect_width_(0), rect_height_(0), - image_width_(0), image_height_(0) { -} + : tesseract_(nullptr), + osd_tesseract_(nullptr), + equ_detect_(nullptr), + reader_(nullptr), + // Thresholder is initialized to NULL here, but will be set before use by: + // A constructor of a derived API, SetThresholder(), or + // created implicitly when used in InternalSetImage. 
+ thresholder_(nullptr), + paragraph_models_(nullptr), + block_list_(nullptr), + page_res_(nullptr), + input_file_(nullptr), + output_file_(nullptr), + datapath_(nullptr), + language_(nullptr), + last_oem_requested_(OEM_DEFAULT), + recognition_done_(false), + truth_cb_(NULL), + rect_left_(0), + rect_top_(0), + rect_width_(0), + rect_height_(0), + image_width_(0), + image_height_(0) {} TessBaseAPI::~TessBaseAPI() { End(); @@ -278,20 +279,33 @@ int TessBaseAPI::Init(const char* datapath, const char* language, const GenericVector *vars_vec, const GenericVector *vars_values, bool set_only_non_debug_params) { + return Init(datapath, 0, language, oem, configs, configs_size, vars_vec, + vars_values, set_only_non_debug_params, nullptr); +} + +// In-memory version reads the traineddata file directly from the given +// data[data_size] array. Also implements the version with a datapath in data, +// flagged by data_size = 0. +int TessBaseAPI::Init(const char* data, int data_size, const char* language, + OcrEngineMode oem, char** configs, int configs_size, + const GenericVector* vars_vec, + const GenericVector* vars_values, + bool set_only_non_debug_params, FileReader reader) { PERF_COUNT_START("TessBaseAPI::Init") // Default language is "eng". - if (language == NULL) language = "eng"; + if (language == nullptr) language = "eng"; + STRING datapath = data_size == 0 ? data : language; // If the datapath, OcrEngineMode or the language have changed - start again. // Note that the language_ field stores the last requested language that was // initialized successfully, while tesseract_->lang stores the language // actually used. They differ only if the requested language was NULL, in // which case tesseract_->lang is set to the Tesseract default ("eng"). 
- if (tesseract_ != NULL && - (datapath_ == NULL || language_ == NULL || - *datapath_ != datapath || last_oem_requested_ != oem || + if (tesseract_ != nullptr && + (datapath_ == nullptr || language_ == nullptr || *datapath_ != datapath || + last_oem_requested_ != oem || (*language_ != language && tesseract_->lang != language))) { delete tesseract_; - tesseract_ = NULL; + tesseract_ = nullptr; } // PERF_COUNT_SUB("delete tesseract_") #ifdef USE_OPENCL @@ -300,19 +314,25 @@ int TessBaseAPI::Init(const char* datapath, const char* language, #endif PERF_COUNT_SUB("OD::InitEnv()") bool reset_classifier = true; - if (tesseract_ == NULL) { + if (tesseract_ == nullptr) { reset_classifier = false; tesseract_ = new Tesseract; + if (reader != nullptr) reader_ = reader; + TessdataManager mgr(reader_); + if (data_size != 0) { + mgr.LoadMemBuffer(language, data, data_size); + } if (tesseract_->init_tesseract( - datapath, output_file_ != NULL ? output_file_->string() : NULL, - language, oem, configs, configs_size, vars_vec, vars_values, - set_only_non_debug_params) != 0) { + datapath.string(), + output_file_ != nullptr ? output_file_->string() : nullptr, + language, oem, configs, configs_size, vars_vec, vars_values, + set_only_non_debug_params, &mgr) != 0) { return -1; } } PERF_COUNT_SUB("update tesseract_") // Update datapath and language requested for the last valid initialization. 
- if (datapath_ == NULL) + if (datapath_ == nullptr) datapath_ = new STRING(datapath); else *datapath_ = datapath; @@ -320,7 +340,7 @@ int TessBaseAPI::Init(const char* datapath, const char* language, (strcmp(tesseract_->datadir.string(), "") != 0)) *datapath_ = tesseract_->datadir; - if (language_ == NULL) + if (language_ == nullptr) language_ = new STRING(language); else *language_ = language; @@ -424,7 +444,8 @@ int TessBaseAPI::InitLangMod(const char* datapath, const char* language) { tesseract_ = new Tesseract; else ParamUtils::ResetToDefaults(tesseract_->params()); - return tesseract_->init_tesseract_lm(datapath, NULL, language); + TessdataManager mgr; + return tesseract_->init_tesseract_lm(datapath, NULL, language, &mgr); } /** @@ -434,7 +455,7 @@ int TessBaseAPI::InitLangMod(const char* datapath, const char* language) { void TessBaseAPI::InitForAnalysePage() { if (tesseract_ == NULL) { tesseract_ = new Tesseract; - tesseract_->InitAdaptiveClassifier(false); + tesseract_->InitAdaptiveClassifier(nullptr); } } @@ -515,9 +536,7 @@ void TessBaseAPI::ClearAdaptiveClassifier() { /** * Provide an image for Tesseract to recognize. Format is as - * TesseractRect above. Does not copy the image buffer, or take - * ownership. The source image may be destroyed after Recognize is called, - * either explicitly or implicitly via one of the Get*Text functions. + * TesseractRect above. Copies the image buffer and converts to Pix. * SetImage clears all recognition results, and sets the rectangle to the * full image, so it may be followed immediately by a GetUTF8Text, and it * will automatically perform recognition. 
@@ -525,9 +544,11 @@ void TessBaseAPI::ClearAdaptiveClassifier() { void TessBaseAPI::SetImage(const unsigned char* imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line) { - if (InternalSetImage()) + if (InternalSetImage()) { thresholder_->SetImage(imagedata, width, height, bytes_per_pixel, bytes_per_line); + SetInputImage(thresholder_->GetPixRect()); + } } void TessBaseAPI::SetSourceResolution(int ppi) { @@ -539,18 +560,17 @@ void TessBaseAPI::SetSourceResolution(int ppi) { /** * Provide an image for Tesseract to recognize. As with SetImage above, - * Tesseract doesn't take a copy or ownership or pixDestroy the image, so - * it must persist until after Recognize. + * Tesseract takes its own copy of the image, so it need not persist until + * after Recognize. * Pix vs raw, which to use? - * Use Pix where possible. A future version of Tesseract may choose to use Pix - * as its internal representation and discard IMAGE altogether. - * Because of that, an implementation that sources and targets Pix may end up - * with less copies than an implementation that does not. + * Use Pix where possible. Tesseract uses Pix as its internal representation + * and it is therefore more efficient to provide a Pix directly. */ void TessBaseAPI::SetImage(Pix* pix) { - if (InternalSetImage()) + if (InternalSetImage()) { thresholder_->SetImage(pix); - SetInputImage(pix); + SetInputImage(thresholder_->GetPixRect()); + } } /** @@ -693,8 +713,8 @@ Boxa* TessBaseAPI::GetComponentImages(PageIteratorLevel level, if (pixa != NULL) { Pix* pix = NULL; if (raw_image) { - pix = page_it->GetImage(level, raw_padding, input_image_, - &left, &top); + pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left, + &top); } else { pix = page_it->GetBinaryImage(level); } @@ -747,53 +767,6 @@ void TessBaseAPI::DumpPGM(const char* filename) { fclose(fp); } -#ifndef NO_CUBE_BUILD -/** - * Placeholder for call to Cube and test that the input data is correct. 
- * reskew is the direction of baselines in the skewed image in - * normalized (cos theta, sin theta) form, so (0.866, 0.5) would represent - * a 30 degree anticlockwise skew. - */ -int CubeAPITest(Boxa* boxa_blocks, Pixa* pixa_blocks, - Boxa* boxa_words, Pixa* pixa_words, - const FCOORD& reskew, Pix* page_pix, - PAGE_RES* page_res) { - int block_count = boxaGetCount(boxa_blocks); - ASSERT_HOST(block_count == pixaGetCount(pixa_blocks)); - // Write each block to the current directory as junk_write_display.nnn.png. - for (int i = 0; i < block_count; ++i) { - Pix* pix = pixaGetPix(pixa_blocks, i, L_CLONE); - pixDisplayWrite(pix, 1); - } - int word_count = boxaGetCount(boxa_words); - ASSERT_HOST(word_count == pixaGetCount(pixa_words)); - int pr_word = 0; - PAGE_RES_IT page_res_it(page_res); - for (page_res_it.restart_page(); page_res_it.word () != NULL; - page_res_it.forward(), ++pr_word) { - WERD_RES *word = page_res_it.word(); - WERD_CHOICE* choice = word->best_choice; - // Write the first 100 words to files names wordims/.tif. - if (pr_word < 100) { - STRING filename("wordims/"); - if (choice != NULL) { - filename += choice->unichar_string(); - } else { - char numbuf[32]; - filename += "unclassified"; - snprintf(numbuf, 32, "%03d", pr_word); - filename += numbuf; - } - filename += ".tif"; - Pix* pix = pixaGetPix(pixa_words, pr_word, L_CLONE); - pixWrite(filename.string(), pix, IFF_TIFF_G4); - } - } - ASSERT_HOST(pr_word == word_count); - return 0; -} -#endif // NO_CUBE_BUILD - /** * Runs page layout analysis in the mode set by SetPageSegMode. * May optionally be called prior to Recognize to get access to just @@ -809,9 +782,7 @@ int CubeAPITest(Boxa* boxa_blocks, Pixa* pixa_blocks, * has not been subjected to a call of Init, SetImage, Recognize, Clear, End * DetectOS, or anything else that changes the internal PAGE_RES. 
*/ -PageIterator* TessBaseAPI::AnalyseLayout() { - return AnalyseLayout(false); -} +PageIterator* TessBaseAPI::AnalyseLayout() { return AnalyseLayout(false); } PageIterator* TessBaseAPI::AnalyseLayout(bool merge_similar_words) { if (FindLines() == 0) { @@ -836,8 +807,7 @@ int TessBaseAPI::Recognize(ETEXT_DESC* monitor) { return -1; if (FindLines() != 0) return -1; - if (page_res_ != NULL) - delete page_res_; + delete page_res_; if (block_list_->empty()) { page_res_ = new PAGE_RES(false, block_list_, &tesseract_->prev_word_best_choice_); @@ -851,13 +821,17 @@ int TessBaseAPI::Recognize(ETEXT_DESC* monitor) { } else if (tesseract_->tessedit_resegment_from_boxes) { page_res_ = tesseract_->ApplyBoxes(*input_file_, false, block_list_); } else { - // TODO(rays) LSTM here. - page_res_ = new PAGE_RES(false, + page_res_ = new PAGE_RES(tesseract_->AnyLSTMLang(), block_list_, &tesseract_->prev_word_best_choice_); } if (page_res_ == NULL) { return -1; } + if (tesseract_->tessedit_train_line_recognizer) { + tesseract_->TrainLineRecognizer(*input_file_, *output_file_, block_list_); + tesseract_->CorrectClassifyWords(page_res_); + return 0; + } if (tesseract_->tessedit_make_boxes_from_boxes) { tesseract_->CorrectClassifyWords(page_res_); return 0; @@ -940,17 +914,10 @@ int TessBaseAPI::RecognizeForChopTest(ETEXT_DESC* monitor) { return 0; } -void TessBaseAPI::SetInputImage(Pix *pix) { - if (input_image_) - pixDestroy(&input_image_); - input_image_ = NULL; - if (pix) - input_image_ = pixCopy(NULL, pix); -} +// Takes ownership of the input pix. 
+void TessBaseAPI::SetInputImage(Pix* pix) { tesseract_->set_pix_original(pix); } -Pix* TessBaseAPI::GetInputImage() { - return input_image_; -} +Pix* TessBaseAPI::GetInputImage() { return tesseract_->pix_original(); } const char * TessBaseAPI::GetInputName() { if (input_file_) @@ -994,8 +961,7 @@ bool TessBaseAPI::ProcessPagesFileList(FILE *flist, } // Begin producing output - const char* kUnknownTitle = ""; - if (renderer && !renderer->BeginDocument(kUnknownTitle)) { + if (renderer && !renderer->BeginDocument(unknown_title_)) { return false; } @@ -1038,26 +1004,13 @@ bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, int tessedit_page_number) { #ifndef ANDROID_BUILD Pix *pix = NULL; -#ifdef USE_OPENCL - OpenclDevice od; -#endif // USE_OPENCL int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0; + size_t offset = 0; for (; ; ++page) { if (tessedit_page_number >= 0) page = tessedit_page_number; -#ifdef USE_OPENCL - if ( od.selectedDeviceIsOpenCL() ) { - pix = (data) ? - od.pixReadMemTiffCl(data, size, page) : - od.pixReadTiffCl(filename, page); - } else { -#endif // USE_OPENCL - pix = (data) ? - pixReadMemTiff(data, size, page) : - pixReadTiff(filename, page); -#ifdef USE_OPENCL - } -#endif // USE_OPENCL + pix = (data) ? 
pixReadMemFromMultipageTiff(data, size, &offset) + : pixReadFromMultipageTiff(filename, &offset); if (pix == NULL) break; tprintf("Page %d\n", page + 1); char page_str[kMaxIntSize]; @@ -1068,6 +1021,7 @@ bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data, pixDestroy(&pix); if (!r) return false; if (tessedit_page_number >= 0) break; + if (!offset) break; } return true; #else @@ -1107,7 +1061,6 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename, const char* retry_config, int timeout_millisec, TessResultRenderer* renderer) { -#ifndef ANDROID_BUILD PERF_COUNT_START("ProcessPages") bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-"); if (stdInput) { @@ -1142,7 +1095,15 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename, // Maybe we have a filelist if (r != 0 || format == IFF_UNKNOWN) { - STRING s(buf.c_str()); + STRING s; + if (stdInput) { + s = buf.c_str(); + } else { + std::ifstream t(filename); + std::string u((std::istreambuf_iterator(t)), + std::istreambuf_iterator()); + s = u.c_str(); + } return ProcessPagesFileList(NULL, &s, retry_config, timeout_millisec, renderer, tesseract_->tessedit_page_number); @@ -1164,8 +1125,7 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename, } // Begin the output - const char* kUnknownTitle = ""; - if (renderer && !renderer->BeginDocument(kUnknownTitle)) { + if (renderer && !renderer->BeginDocument(unknown_title_)) { pixDestroy(&pix); return false; } @@ -1187,9 +1147,6 @@ bool TessBaseAPI::ProcessPagesInternal(const char* filename, } PERF_COUNT_END return true; -#else - return false; -#endif } bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename, @@ -1379,8 +1336,9 @@ static void AddBaselineCoordsTohOCR(const PageIterator *it, hocr_str->add_str_double(" ", round(p0 * 1000.0) / 1000.0); } -static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1, int num2) { - const unsigned long BUFSIZE = 64; +static void AddIdTohOCR(STRING* 
hocr_str, const std::string base, int num1, + int num2) { + const size_t BUFSIZE = 64; char id_buffer[BUFSIZE]; if (num2 >= 0) { snprintf(id_buffer, BUFSIZE - 1, "%s_%d_%d", base.c_str(), num1, num2); @@ -1393,8 +1351,7 @@ static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1, int *hocr_str += "'"; } -static void AddBoxTohOCR(const ResultIterator *it, - PageIteratorLevel level, +static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level, STRING* hocr_str) { int left, top, right, bottom; it->BoundingBox(level, &left, &top, &right, &bottom); @@ -1410,7 +1367,7 @@ static void AddBoxTohOCR(const ResultIterator *it, // add custom height measures float row_height, descenders, ascenders; // row attributes it->RowAttributes(&row_height, &descenders, &ascenders); - // TODO: Do we want to limit these to a single decimal place? + // TODO(rays): Do we want to limit these to a single decimal place? hocr_str->add_str_double("; x_size ", row_height); hocr_str->add_str_double("; x_descenders ", descenders * -1); hocr_str->add_str_double("; x_ascenders ", ascenders); @@ -1418,9 +1375,8 @@ static void AddBoxTohOCR(const ResultIterator *it, *hocr_str += "\">"; } -static void AddBoxToTSV(const PageIterator *it, - PageIteratorLevel level, - STRING* hocr_str) { +static void AddBoxToTSV(const PageIterator* it, PageIteratorLevel level, + STRING* hocr_str) { int left, top, right, bottom; it->BoundingBox(level, &left, &top, &right, &bottom); hocr_str->add_str_int("\t", left); @@ -1429,8 +1385,6 @@ static void AddBoxToTSV(const PageIterator *it, hocr_str->add_str_int("\t", bottom - top); } - - /** * Make a HTML-formatted string with hOCR markup from the internal * data structures. @@ -1440,7 +1394,7 @@ static void AddBoxToTSV(const PageIterator *it, * STL removed from original patch submission and refactored by rays. 
*/ char* TessBaseAPI::GetHOCRText(int page_number) { - return GetHOCRText(NULL,page_number); + return GetHOCRText(NULL, page_number); } /** @@ -1452,13 +1406,12 @@ char* TessBaseAPI::GetHOCRText(int page_number) { * STL removed from original patch submission and refactored by rays. */ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { - if (tesseract_ == NULL || - (page_res_ == NULL && Recognize(monitor) < 0)) + if (tesseract_ == NULL || (page_res_ == NULL && Recognize(monitor) < 0)) return NULL; int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1; int page_id = page_number + 1; // hOCR uses 1-based page numbers. - bool para_is_ltr = true; // Default direction is LTR + bool para_is_ltr = true; // Default direction is LTR const char* paragraph_lang = NULL; bool font_info = false; GetBoolVariable("hocr_font_info", &font_info); @@ -1470,13 +1423,13 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { #ifdef _WIN32 // convert input name from ANSI encoding to utf-8 - int str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, - NULL, 0); + int str16_len = + MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, NULL, 0); wchar_t *uni16_str = new WCHAR[str16_len]; str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, uni16_str, str16_len); - int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, NULL, - 0, NULL, NULL); + int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, NULL, 0, + NULL, NULL); char *utf8_str = new char[utf8_len]; WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str, utf8_len, NULL, NULL); @@ -1509,7 +1462,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { // Open any new block/paragraph/textline. if (res_it->IsAtBeginningOf(RIL_BLOCK)) { - para_is_ltr = true; // reset to default direction + para_is_ltr = true; // reset to default direction hocr_str += "
WordRecognitionLanguage(); if (paragraph_lang) { - hocr_str += " lang='"; - hocr_str += paragraph_lang; - hocr_str += "'"; + hocr_str += " lang='"; + hocr_str += paragraph_lang; + hocr_str += "'"; } AddBoxTohOCR(res_it, RIL_PARA, &hocr_str); } @@ -1567,8 +1520,12 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { } switch (res_it->WordDirection()) { // Only emit direction if different from current paragraph direction - case DIR_LEFT_TO_RIGHT: if (!para_is_ltr) hocr_str += " dir='ltr'"; break; - case DIR_RIGHT_TO_LEFT: if (para_is_ltr) hocr_str += " dir='rtl'"; break; + case DIR_LEFT_TO_RIGHT: + if (!para_is_ltr) hocr_str += " dir='ltr'"; + break; + case DIR_RIGHT_TO_LEFT: + if (para_is_ltr) hocr_str += " dir='rtl'"; + break; case DIR_MIX: case DIR_NEUTRAL: default: // Do nothing. @@ -1600,7 +1557,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { if (last_word_in_para) { hocr_str += "\n

\n"; pcnt++; - para_is_ltr = true; // back to default direction + para_is_ltr = true; // back to default direction } if (last_word_in_block) { hocr_str += "
\n"; @@ -1620,8 +1577,7 @@ char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) { * page_number is 0-based but will appear in the output as 1-based. */ char* TessBaseAPI::GetTSVText(int page_number) { - if (tesseract_ == NULL || - (page_res_ == NULL && Recognize(NULL) < 0)) + if (tesseract_ == NULL || (page_res_ == NULL && Recognize(NULL) < 0)) return NULL; int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1; @@ -1629,9 +1585,10 @@ char* TessBaseAPI::GetTSVText(int page_number) { STRING tsv_str(""); - int page_num = page_id, block_num = 0, par_num = 0, line_num = 0, word_num = 0; + int page_num = page_id, block_num = 0, par_num = 0, line_num = 0, + word_num = 0; - tsv_str.add_str_int("1\t", page_num); // level 1 - page + tsv_str.add_str_int("1\t", page_num); // level 1 - page tsv_str.add_str_int("\t", block_num); tsv_str.add_str_int("\t", par_num); tsv_str.add_str_int("\t", line_num); @@ -1642,7 +1599,7 @@ char* TessBaseAPI::GetTSVText(int page_number) { tsv_str.add_str_int("\t", rect_height_); tsv_str += "\t-1\t\n"; - ResultIterator *res_it = GetIterator(); + ResultIterator* res_it = GetIterator(); while (!res_it->Empty(RIL_BLOCK)) { if (res_it->Empty(RIL_WORD)) { res_it->Next(RIL_WORD); @@ -1652,46 +1609,40 @@ char* TessBaseAPI::GetTSVText(int page_number) { // Add rows for any new block/paragraph/textline. 
if (res_it->IsAtBeginningOf(RIL_BLOCK)) { block_num++, par_num = 0, line_num = 0, word_num = 0; - tsv_str.add_str_int("2\t", page_num); // level 2 - block + tsv_str.add_str_int("2\t", page_num); // level 2 - block tsv_str.add_str_int("\t", block_num); tsv_str.add_str_int("\t", par_num); tsv_str.add_str_int("\t", line_num); tsv_str.add_str_int("\t", word_num); AddBoxToTSV(res_it, RIL_BLOCK, &tsv_str); - tsv_str += "\t-1\t\n"; // end of row for block + tsv_str += "\t-1\t\n"; // end of row for block } if (res_it->IsAtBeginningOf(RIL_PARA)) { par_num++, line_num = 0, word_num = 0; - tsv_str.add_str_int("3\t", page_num); // level 3 - paragraph + tsv_str.add_str_int("3\t", page_num); // level 3 - paragraph tsv_str.add_str_int("\t", block_num); tsv_str.add_str_int("\t", par_num); tsv_str.add_str_int("\t", line_num); tsv_str.add_str_int("\t", word_num); AddBoxToTSV(res_it, RIL_PARA, &tsv_str); - tsv_str += "\t-1\t\n"; // end of row for para + tsv_str += "\t-1\t\n"; // end of row for para } if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) { line_num++, word_num = 0; - tsv_str.add_str_int("4\t", page_num); // level 4 - line + tsv_str.add_str_int("4\t", page_num); // level 4 - line tsv_str.add_str_int("\t", block_num); tsv_str.add_str_int("\t", par_num); tsv_str.add_str_int("\t", line_num); tsv_str.add_str_int("\t", word_num); AddBoxToTSV(res_it, RIL_TEXTLINE, &tsv_str); - tsv_str += "\t-1\t\n"; // end of row for line + tsv_str += "\t-1\t\n"; // end of row for line } // Now, process the word... 
int left, top, right, bottom; - bool bold, italic, underlined, monospace, serif, smallcaps; - int pointsize, font_id; - const char *font_name; res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom); - font_name = res_it->WordFontAttributes(&bold, &italic, &underlined, - &monospace, &serif, &smallcaps, - &pointsize, &font_id); word_num++; - tsv_str.add_str_int("5\t", page_num); // level 5 - word + tsv_str.add_str_int("5\t", page_num); // level 5 - word tsv_str.add_str_int("\t", block_num); tsv_str.add_str_int("\t", par_num); tsv_str.add_str_int("\t", line_num); @@ -1712,11 +1663,11 @@ char* TessBaseAPI::GetTSVText(int page_number) { tsv_str += res_it->GetUTF8Text(RIL_SYMBOL); res_it->Next(RIL_SYMBOL); } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD)); - tsv_str += "\n"; // end of row + tsv_str += "\n"; // end of row wcnt++; } - char *ret = new char[tsv_str.length() + 1]; + char* ret = new char[tsv_str.length() + 1]; strcpy(ret, tsv_str.string()); delete res_it; return ret; @@ -1760,7 +1711,7 @@ char* TessBaseAPI::GetBoxText(int page_number) { int total_length = blob_count * kBytesPerBoxFileLine + utf8_length + kMaxBytesPerLine; char* result = new char[total_length]; - strcpy(result, "\0"); + result[0] = '\0'; int output_length = 0; LTRResultIterator* it = GetLTRIterator(); do { @@ -1907,43 +1858,70 @@ char* TessBaseAPI::GetUNLVText() { return result; } - /** - * The recognized text is returned as a char* which is coded - * as UTF8 and must be freed with the delete [] operator. - * page_number is a 0-based page index that will appear in the osd file. - */ -char* TessBaseAPI::GetOsdText(int page_number) { +/** + * Detect the orientation of the input image and apparent script (alphabet). + * orient_deg is the detected clockwise rotation of the input image in degrees + * (0, 90, 180, 270) + * orient_conf is the confidence (15.0 is reasonably confident) + * script_name is an ASCII string, the name of the script, e.g. 
"Latin" + * script_conf is confidence level in the script + * Returns true on success and writes values to each parameter as an output + */ +bool TessBaseAPI::DetectOrientationScript(int* orient_deg, float* orient_conf, + const char** script_name, + float* script_conf) { OSResults osr; bool osd = DetectOS(&osr); if (!osd) { - return NULL; + return false; } int orient_id = osr.best_result.orientation_id; int script_id = osr.get_best_script(orient_id); - float orient_conf = osr.best_result.oconfidence; - float script_conf = osr.best_result.sconfidence; - const char* script_name = - osr.unicharset->get_script_from_script_id(script_id); + if (orient_conf) *orient_conf = osr.best_result.oconfidence; + if (orient_deg) *orient_deg = orient_id * 90; // convert quadrant to degrees - // clockwise orientation of the input image, in degrees - int orient_deg = orient_id * 90; + if (script_name) { + const char* script = osr.unicharset->get_script_from_script_id(script_id); + + *script_name = script; + } + + if (script_conf) *script_conf = osr.best_result.sconfidence; + + return true; +} + +/** + * The recognized text is returned as a char* which is coded + * as UTF8 and must be freed with the delete [] operator. + * page_number is a 0-based page index that will appear in the osd file. 
+ */ +char* TessBaseAPI::GetOsdText(int page_number) { + int orient_deg; + float orient_conf; + const char* script_name; + float script_conf; + + if (!DetectOrientationScript(&orient_deg, &orient_conf, &script_name, + &script_conf)) + return NULL; // clockwise rotation needed to make the page upright - int rotate = OrientationIdToValue(orient_id); + int rotate = OrientationIdToValue(orient_deg / 90); - char* osd_buf = new char[255]; - snprintf(osd_buf, 255, - "Page number: %d\n" - "Orientation in degrees: %d\n" - "Rotate: %d\n" - "Orientation confidence: %.2f\n" - "Script: %s\n" - "Script confidence: %.2f\n", - page_number, - orient_deg, rotate, orient_conf, - script_name, script_conf); + const int kOsdBufsize = 255; + char* osd_buf = new char[kOsdBufsize]; + snprintf(osd_buf, kOsdBufsize, + "Page number: %d\n" + "Orientation in degrees: %d\n" + "Rotate: %d\n" + "Orientation confidence: %.2f\n" + "Script: %s\n" + "Script confidence: %.2f\n", + page_number, orient_deg, rotate, orient_conf, script_name, + script_conf); return osd_buf; } @@ -2020,8 +1998,7 @@ bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char* wordstr) { for (t = 0; text[t] != '\0'; ++t) { if (text[t] == '\n' || text[t] == ' ') continue; - while (wordstr[w] != '\0' && wordstr[w] == ' ') - ++w; + while (wordstr[w] == ' ') ++w; if (text[t] != wordstr[w]) break; ++w; @@ -2063,7 +2040,7 @@ void TessBaseAPI::Clear() { if (thresholder_ != NULL) thresholder_->Clear(); ClearResults(); - SetInputImage(NULL); + if (tesseract_ != NULL) SetInputImage(NULL); } /** @@ -2073,6 +2050,7 @@ void TessBaseAPI::Clear() { * other than Init and anything declared above it in the class definition. 
*/ void TessBaseAPI::End() { + Clear(); if (thresholder_ != NULL) { delete thresholder_; thresholder_ = NULL; @@ -2108,10 +2086,6 @@ void TessBaseAPI::End() { delete input_file_; input_file_ = NULL; } - if (input_image_ != NULL) { - pixDestroy(&input_image_); - input_image_ = NULL; - } if (output_file_ != NULL) { delete output_file_; output_file_ = NULL; @@ -2242,6 +2216,8 @@ void TessBaseAPI::Threshold(Pix** pix) { if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) { // Use the minimum default resolution, as it is safer to under-estimate // than over-estimate resolution. + tprintf("Warning. Invalid resolution %d dpi. Using %d instead.\n", y_res, + kMinCredibleResolution); thresholder_->SetSourceYResolution(kMinCredibleResolution); } PageSegMode pageseg_mode = @@ -2286,7 +2262,7 @@ int TessBaseAPI::FindLines() { } if (tesseract_ == NULL) { tesseract_ = new Tesseract; - tesseract_->InitAdaptiveClassifier(false); + tesseract_->InitAdaptiveClassifier(nullptr); } if (tesseract_->pix_binary() == NULL) Threshold(tesseract_->mutable_pix_binary()); @@ -2308,14 +2284,16 @@ int TessBaseAPI::FindLines() { Tesseract* osd_tess = osd_tesseract_; OSResults osr; - if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && osd_tess == NULL) { + if (PSM_OSD_ENABLED(tesseract_->tessedit_pageseg_mode) && + osd_tess == nullptr) { if (strcmp(language_->string(), "osd") == 0) { osd_tess = tesseract_; } else { osd_tesseract_ = new Tesseract; - if (osd_tesseract_->init_tesseract( - datapath_->string(), NULL, "osd", OEM_TESSERACT_ONLY, - NULL, 0, NULL, NULL, false) == 0) { + TessdataManager mgr(reader_); + if (osd_tesseract_->init_tesseract(datapath_->string(), nullptr, "osd", + OEM_TESSERACT_ONLY, nullptr, 0, + nullptr, nullptr, false, &mgr) == 0) { osd_tess = osd_tesseract_; osd_tesseract_->set_source_resolution( thresholder_->GetSourceYResolution()); @@ -2323,7 +2301,7 @@ int TessBaseAPI::FindLines() { tprintf("Warning: Auto orientation and script detection requested," 
" but osd language failed to load\n"); delete osd_tesseract_; - osd_tesseract_ = NULL; + osd_tesseract_ = nullptr; } } } @@ -2766,7 +2744,7 @@ void TessBaseAPI::GetFeaturesForBlob(TBLOB* blob, INT_FX_RESULT_STRUCT fx_info; tesseract_->ExtractFeatures(*blob, false, &bl_features, &cn_features, &fx_info, &outline_counts); - if (cn_features.size() == 0 || cn_features.size() > MAX_NUM_INT_FEATURES) { + if (cn_features.empty() || cn_features.size() > MAX_NUM_INT_FEATURES) { *num_features = 0; return; // Feature extraction failed. } @@ -2847,13 +2825,6 @@ int TessBaseAPI::NumDawgs() const { return tesseract_ == NULL ? 0 : tesseract_->getDict().NumDawgs(); } -#ifndef NO_CUBE_BUILD -/** Return a pointer to underlying CubeRecoContext object if present. */ -CubeRecoContext *TessBaseAPI::GetCubeRecoContext() const { - return (tesseract_ == NULL) ? NULL : tesseract_->GetCubeRecoContext(); -} -#endif // NO_CUBE_BUILD - /** Escape a char string - remove <>&"' with HTML codes. */ STRING HOcrEscape(const char* text) { STRING ret; diff --git a/api/baseapi.h b/api/baseapi.h index 3b0d3f67..0901e9b1 100644 --- a/api/baseapi.h +++ b/api/baseapi.h @@ -17,11 +17,11 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_API_BASEAPI_H__ -#define TESSERACT_API_BASEAPI_H__ +#ifndef TESSERACT_API_BASEAPI_H_ +#define TESSERACT_API_BASEAPI_H_ -#define TESSERACT_VERSION_STR "3.05.00dev" -#define TESSERACT_VERSION 0x030500 +#define TESSERACT_VERSION_STR "4.00.00alpha" +#define TESSERACT_VERSION 0x040000 #define MAKE_VERSION(major, minor, patch) (((major) << 16) | ((minor) << 8) | \ (patch)) @@ -29,14 +29,15 @@ // To avoid collision with other typenames include the ABSOLUTE MINIMUM // complexity of includes here. Use forward declarations wherever possible // and hide includes of complex types in baseapi.cpp. 
-#include "platform.h" #include "apitypes.h" +#include "pageiterator.h" +#include "platform.h" +#include "publictypes.h" +#include "resultiterator.h" +#include "serialis.h" +#include "tesscallback.h" #include "thresholder.h" #include "unichar.h" -#include "tesscallback.h" -#include "publictypes.h" -#include "pageiterator.h" -#include "resultiterator.h" template class GenericVector; class PAGE_RES; @@ -65,9 +66,6 @@ struct TBLOB; namespace tesseract { -#ifndef NO_CUBE_BUILD -class CubeRecoContext; -#endif // NO_CUBE_BUILD class Dawg; class Dict; class EquationDetect; @@ -142,6 +140,7 @@ class TESS_API TessBaseAPI { * is stored in the PDF so we need that as well. */ const char* GetInputName(); + // Takes ownership of the input pix. void SetInputImage(Pix *pix); Pix* GetInputImage(); int GetSourceYResolution(); @@ -239,6 +238,13 @@ class TESS_API TessBaseAPI { int Init(const char* datapath, const char* language) { return Init(datapath, language, OEM_DEFAULT, NULL, 0, NULL, NULL, false); } + // In-memory version reads the traineddata file directly from the given + // data[data_size] array, and/or reads data via a FileReader. + int Init(const char* data, int data_size, const char* language, + OcrEngineMode mode, char** configs, int configs_size, + const GenericVector* vars_vec, + const GenericVector* vars_values, + bool set_only_non_debug_params, FileReader reader); /** * Returns the languages string used in the last valid initialization. @@ -333,9 +339,7 @@ class TESS_API TessBaseAPI { /** * Provide an image for Tesseract to recognize. Format is as - * TesseractRect above. Does not copy the image buffer, or take - * ownership. The source image may be destroyed after Recognize is called, - * either explicitly or implicitly via one of the Get*Text functions. + * TesseractRect above. Copies the image buffer and converts to Pix. 
* SetImage clears all recognition results, and sets the rectangle to the * full image, so it may be followed immediately by a GetUTF8Text, and it * will automatically perform recognition. @@ -345,13 +349,11 @@ class TESS_API TessBaseAPI { /** * Provide an image for Tesseract to recognize. As with SetImage above, - * Tesseract doesn't take a copy or ownership or pixDestroy the image, so - * it must persist until after Recognize. + * Tesseract takes its own copy of the image, so it need not persist until + * after Recognize. * Pix vs raw, which to use? - * Use Pix where possible. A future version of Tesseract may choose to use Pix - * as its internal representation and discard IMAGE altogether. - * Because of that, an implementation that sources and targets Pix may end up - * with less copies than an implementation that does not. + * Use Pix where possible. Tesseract uses Pix as its internal representation + * and it is therefore more efficient to provide a Pix directly. */ void SetImage(Pix* pix); @@ -376,8 +378,7 @@ class TESS_API TessBaseAPI { * delete it when it it is replaced or the API is destructed. */ void SetThresholder(ImageThresholder* thresholder) { - if (thresholder_ != NULL) - delete thresholder_; + delete thresholder_; thresholder_ = thresholder; ClearResults(); } @@ -588,8 +589,8 @@ class TESS_API TessBaseAPI { * data structures. * page_number is 0-based but will appear in the output as 1-based. * monitor can be used to - * cancel the recognition - * receive progress callbacks + * cancel the recognition + * receive progress callbacks */ char* GetHOCRText(ETEXT_DESC* monitor, int page_number); @@ -622,6 +623,18 @@ class TESS_API TessBaseAPI { */ char* GetUNLVText(); + /** + * Detect the orientation of the input image and apparent script (alphabet). 
+ * orient_deg is the detected clockwise rotation of the input image in degrees + * (0, 90, 180, 270) + * orient_conf is the confidence (15.0 is reasonably confident) + * script_name is an ASCII string, the name of the script, e.g. "Latin" + * script_conf is confidence level in the script + * Returns true on success and writes values to each parameter as an output + */ + bool DetectOrientationScript(int* orient_deg, float* orient_conf, + const char** script_name, float* script_conf); + /** * The recognized text is returned as a char* which is coded * as UTF8 and must be freed with the delete [] operator. @@ -750,21 +763,12 @@ class TESS_API TessBaseAPI { */ static void NormalizeTBLOB(TBLOB *tblob, ROW *row, bool numeric_mode); - Tesseract* tesseract() const { - return tesseract_; - } + Tesseract* tesseract() const { return tesseract_; } - OcrEngineMode oem() const { - return last_oem_requested_; - } + OcrEngineMode oem() const { return last_oem_requested_; } void InitTruthCallback(TruthCallback *cb) { truth_cb_ = cb; } -#ifndef NO_CUBE_BUILD - /** Return a pointer to underlying CubeRecoContext object if present. */ - CubeRecoContext *GetCubeRecoContext() const; -#endif // NO_CUBE_BUILD - void set_min_orientation_margin(double margin); /** @@ -855,9 +859,7 @@ class TESS_API TessBaseAPI { int** y1, PAGE_RES* page_res); - TESS_LOCAL const PAGE_RES* GetPageRes() const { - return page_res_; - }; + TESS_LOCAL const PAGE_RES* GetPageRes() const { return page_res_; } /* @} */ @@ -865,12 +867,12 @@ class TESS_API TessBaseAPI { Tesseract* tesseract_; ///< The underlying data object. Tesseract* osd_tesseract_; ///< For orientation & script detection. EquationDetect* equ_detect_; ///* paragraph_models_; BLOCK_LIST* block_list_; ///< The page layout. PAGE_RES* page_res_; ///< The page-level data. STRING* input_file_; ///< Name used by training code. - Pix* input_image_; ///< Image used for searchable PDF STRING* output_file_; ///< Name used by debug code. 
STRING* datapath_; ///< Current location of tessdata. STRING* language_; ///< Last initialized language. @@ -898,7 +900,7 @@ class TESS_API TessBaseAPI { const char* retry_config, int timeout_millisec, TessResultRenderer* renderer, int tessedit_page_number); - // TIFF supports multipage so gets special consideration + // TIFF supports multipage so gets special consideration. bool ProcessPagesMultipageTiff(const unsigned char *data, size_t size, const char* filename, @@ -906,10 +908,16 @@ class TESS_API TessBaseAPI { int timeout_millisec, TessResultRenderer* renderer, int tessedit_page_number); + // There's currently no way to pass a document title from the + // Tesseract command line, and we have multiple places that choose + // to set the title to an empty string. Using a single named + // variable will hopefully reduce confusion if the situation changes + // in the future. + const char *unknown_title_ = ""; }; // class TessBaseAPI. /** Escape a char string - remove &<>"' with HTML codes. */ STRING HOcrEscape(const char* text); } // namespace tesseract. 
-#endif // TESSERACT_API_BASEAPI_H__ +#endif // TESSERACT_API_BASEAPI_H_ diff --git a/api/capi.cpp b/api/capi.cpp index 849d2961..4f697314 100644 --- a/api/capi.cpp +++ b/api/capi.cpp @@ -64,9 +64,10 @@ TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outpu return new TessHOcrRenderer(outputbase, font_info); } -TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir) +TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir, + BOOL textonly) { - return new TessPDFRenderer(outputbase, datadir); + return new TessPDFRenderer(outputbase, datadir, textonly); } TESS_API TessResultRenderer* TESS_CALL TessUnlvRendererCreate(const char* outputbase) @@ -538,9 +539,18 @@ TESS_API void TESS_CALL TessBaseAPISetProbabilityInContextFunc(TessBaseAPI* hand TESS_API BOOL TESS_CALL TessBaseAPIDetectOS(TessBaseAPI* handle, OSResults* results) { - return handle->DetectOS(results) ? TRUE : FALSE; + return FALSE; // Unsafe ABI, return FALSE always } +TESS_API BOOL TESS_CALL TessBaseAPIDetectOrientationScript(TessBaseAPI* handle, + int* orient_deg, float* orient_conf, const char** script_name, float* script_conf) +{ + bool success; + success = handle->DetectOrientationScript(orient_deg, orient_conf, script_name, script_conf); + return (BOOL)success; +} + + TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, INT_FEATURE_STRUCT* int_features, int* num_features, int* FeatureOutlineIndex) { @@ -598,13 +608,6 @@ TESS_API void TESS_CALL TessBaseAPIInitTruthCallback(TessBaseAPI* handle, TessTr handle->InitTruthCallback(cb); } -#ifndef NO_CUBE_BUILD -TESS_API TessCubeRecoContext* TESS_CALL TessBaseAPIGetCubeRecoContext(const TessBaseAPI* handle) -{ - return handle->GetCubeRecoContext(); -} -#endif // NO_CUBE_BUILD - TESS_API void TESS_CALL TessBaseAPISetMinOrientationMargin(TessBaseAPI* handle, double margin) { 
handle->set_min_orientation_margin(margin); diff --git a/api/capi.h b/api/capi.h index a0c54a20..f3fc3833 100644 --- a/api/capi.h +++ b/api/capi.h @@ -68,9 +68,6 @@ typedef tesseract::ProbabilityInContextFunc TessProbabilityInContextFunc; typedef tesseract::FillLatticeFunc TessFillLatticeFunc; typedef tesseract::Dawg TessDawg; typedef tesseract::TruthCallback TessTruthCallback; -#ifndef NO_CUBE_BUILD -typedef tesseract::CubeRecoContext TessCubeRecoContext; -#endif // NO_CUBE_BUILD typedef tesseract::Orientation TessOrientation; typedef tesseract::ParagraphJustification TessParagraphJustification; typedef tesseract::WritingDirection TessWritingDirection; @@ -88,7 +85,7 @@ typedef struct TessPageIterator TessPageIterator; typedef struct TessResultIterator TessResultIterator; typedef struct TessMutableIterator TessMutableIterator; typedef struct TessChoiceIterator TessChoiceIterator; -typedef enum TessOcrEngineMode { OEM_TESSERACT_ONLY, OEM_CUBE_ONLY, OEM_TESSERACT_CUBE_COMBINED, OEM_DEFAULT } TessOcrEngineMode; +typedef enum TessOcrEngineMode { OEM_TESSERACT_ONLY, OEM_LSTM_ONLY, OEM_TESSERACT_LSTM_COMBINED, OEM_DEFAULT } TessOcrEngineMode; typedef enum TessPageSegMode { PSM_OSD_ONLY, PSM_AUTO_OSD, PSM_AUTO_ONLY, PSM_AUTO, PSM_SINGLE_COLUMN, PSM_SINGLE_BLOCK_VERT_TEXT, PSM_SINGLE_BLOCK, PSM_SINGLE_LINE, PSM_SINGLE_WORD, PSM_CIRCLE_WORD, PSM_SINGLE_CHAR, PSM_SPARSE_TEXT, PSM_SPARSE_TEXT_OSD, PSM_COUNT } TessPageSegMode; @@ -122,7 +119,8 @@ TESS_API void TESS_CALL TessDeleteBlockList(BLOCK_LIST* block_list); TESS_API TessResultRenderer* TESS_CALL TessTextRendererCreate(const char* outputbase); TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate(const char* outputbase); TESS_API TessResultRenderer* TESS_CALL TessHOcrRendererCreate2(const char* outputbase, BOOL font_info); -TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* outputbase, const char* datadir); +TESS_API TessResultRenderer* TESS_CALL TessPDFRendererCreate(const char* 
outputbase, const char* datadir, + BOOL textonly); TESS_API TessResultRenderer* TESS_CALL TessUnlvRendererCreate(const char* outputbase); TESS_API TessResultRenderer* TESS_CALL TessBoxTextRendererCreate(const char* outputbase); @@ -285,7 +283,10 @@ TESS_API void TESS_CALL TessBaseAPIClearPersistentCache(TessBaseAPI* handle); TESS_API void TESS_CALL TessBaseAPISetProbabilityInContextFunc(TessBaseAPI* handle, TessProbabilityInContextFunc f); TESS_API void TESS_CALL TessBaseAPISetFillLatticeFunc(TessBaseAPI* handle, TessFillLatticeFunc f); -TESS_API BOOL TESS_CALL TessBaseAPIDetectOS(TessBaseAPI* handle, OSResults* results); + +// Call TessDeleteText(*best_script_name) to free memory allocated by this function +TESS_API BOOL TESS_CALL TessBaseAPIDetectOrientationScript(TessBaseAPI* handle, + int* orient_deg, float* orient_conf, const char **script_name, float* script_conf); TESS_API void TESS_CALL TessBaseAPIGetFeaturesForBlob(TessBaseAPI* handle, TBLOB* blob, INT_FEATURE_STRUCT* int_features, int* num_features, int* FeatureOutlineIndex); @@ -313,11 +314,6 @@ TESS_API void TESS_CALL TessNormalizeTBLOB(TBLOB* tblob, ROW* row, BOOL numeric TESS_API TessOcrEngineMode TESS_CALL TessBaseAPIOem(const TessBaseAPI* handle); TESS_API void TESS_CALL TessBaseAPIInitTruthCallback(TessBaseAPI* handle, TessTruthCallback* cb); - -#ifndef NO_CUBE_BUILD -TESS_API TessCubeRecoContext* - TESS_CALL TessBaseAPIGetCubeRecoContext(const TessBaseAPI* handle); -#endif // NO_CUBE_BUILD #endif TESS_API void TESS_CALL TessBaseAPISetMinOrientationMargin(TessBaseAPI* handle, double margin); diff --git a/api/pdfrenderer.cpp b/api/pdfrenderer.cpp index 47083004..912d9770 100644 --- a/api/pdfrenderer.cpp +++ b/api/pdfrenderer.cpp @@ -20,12 +20,12 @@ #include "config_auto.h" #endif +#include "allheaders.h" #include "baseapi.h" -#include "renderer.h" #include "math.h" +#include "renderer.h" #include "strngs.h" #include "tprintf.h" -#include "allheaders.h" #ifdef _MSC_VER #include "mathfix.h" @@ -159,7 
+159,7 @@ CIDToGIDMap. OK there is a small problem there, if I use GID 0 then Acrobat gets upset about it and complains it cannot extract the font. If I set the -CIDToGIDMap so that all the entries are 1 instead, its happy. Totally +CIDToGIDMap so that all the entries are 1 instead, it's happy. Totally mad...... */ @@ -169,19 +169,26 @@ namespace tesseract { // Use for PDF object fragments. Must be large enough // to hold a colormap with 256 colors in the verbose // PDF representation. -const int kBasicBufSize = 2048; +static const int kBasicBufSize = 2048; // If the font is 10 pts, nominal character width is 5 pts -const int kCharWidth = 2; +static const int kCharWidth = 2; + +// Used for memory allocation. A codepoint must take no more than this +// many bytes, when written in the PDF way. e.g. "<0063>" for the +// letter 'c' +static const int kMaxBytesPerCodepoint = 20; /********************************************************************** * PDF Renderer interface implementation **********************************************************************/ -TessPDFRenderer::TessPDFRenderer(const char* outputbase, const char *datadir) +TessPDFRenderer::TessPDFRenderer(const char *outputbase, const char *datadir, + bool textonly) : TessResultRenderer(outputbase, "pdf") { obj_ = 0; datadir_ = datadir; + textonly_ = textonly; offsets_.push_back(0); } @@ -282,7 +289,7 @@ void AffineMatrix(int writing_direction, } } -// There are some really stupid PDF viewers in the wild, such as +// There are some really awkward PDF viewers in the wild, such as // 'Preview' which ships with the Mac. They do a better job with text // selection and highlighting when given perfectly flat baseline // instead of very slightly tilted. 
We clip small tilts to appease @@ -302,6 +309,23 @@ void ClipBaseline(int ppi, int x1, int y1, int x2, int y2, *line_y1 = *line_y2 = (y1 + y2) / 2; } +bool CodepointToUtf16be(int code, char utf16[kMaxBytesPerCodepoint]) { + if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) { + tprintf("Dropping invalid codepoint %d\n", code); + return false; + } + if (code < 0x10000) { + snprintf(utf16, kMaxBytesPerCodepoint, "%04X", code); + } else { + int a = code - 0x010000; + int high_surrogate = (0x03FF & (a >> 10)) + 0xD800; + int low_surrogate = (0x03FF & a) + 0xDC00; + snprintf(utf16, kMaxBytesPerCodepoint, + "%04X%04X", high_surrogate, low_surrogate); + } + return true; +} + char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api, double width, double height) { STRING pdf_str(""); @@ -326,7 +350,11 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api, pdf_str.add_str_double("", prec(width)); pdf_str += " 0 0 "; pdf_str.add_str_double("", prec(height)); - pdf_str += " 0 0 cm /Im1 Do Q\n"; + pdf_str += " 0 0 cm"; + if (!textonly_) { + pdf_str += " /Im1 Do"; + } + pdf_str += " Q\n"; int line_x1 = 0; int line_y1 = 0; @@ -436,25 +464,13 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api, if (grapheme && grapheme[0] != '\0') { GenericVector unicodes; UNICHAR::UTF8ToUnicode(grapheme, &unicodes); - char utf16[20]; + char utf16[kMaxBytesPerCodepoint]; for (int i = 0; i < unicodes.length(); i++) { int code = unicodes[i]; - // Convert to UTF-16BE https://en.wikipedia.org/wiki/UTF-16 - if ((code > 0xD7FF && code < 0xE000) || code > 0x10FFFF) { - tprintf("Dropping invalid codepoint %d\n", code); - continue; + if (CodepointToUtf16be(code, utf16)) { + pdf_word += utf16; + pdf_word_len++; } - if (code < 0x10000) { - snprintf(utf16, sizeof(utf16), "<%04X>", code); - } else { - int a = code - 0x010000; - int high_surrogate = (0x03FF & (a >> 10)) + 0xD800; - int low_surrogate = (0x03FF & a) + 0xDC00; - snprintf(utf16, sizeof(utf16), "<%04X%04X>", - high_surrogate, 
low_surrogate); - } - pdf_word += utf16; - pdf_word_len++; } } delete []grapheme; @@ -465,9 +481,9 @@ char* TessPDFRenderer::GetPDFTextObjects(TessBaseAPI* api, kCharWidth * prec(100.0 * word_length / (fontsize * pdf_word_len)); pdf_str.add_str_double("", h_stretch); pdf_str += " Tz"; // horizontal stretch - pdf_str += " [ "; + pdf_str += " [ <"; pdf_str += pdf_word; // UTF-16BE representation - pdf_str += " ] TJ"; // show the text + pdf_str += "> ] TJ"; // show the text } if (last_word_in_line) { pdf_str += " \n"; @@ -567,7 +583,8 @@ bool TessPDFRenderer::BeginDocumentHandler() { "<<\n" " /Length %lu /Filter /FlateDecode\n" ">>\n" - "stream\n", (unsigned long)len); + "stream\n", + (unsigned long)len); if (n >= sizeof(buf)) { lept_free(comp); return false; @@ -619,7 +636,6 @@ bool TessPDFRenderer::BeginDocumentHandler() { AppendPDFObject(buf); // FONT DESCRIPTOR - const int kCharHeight = 2; // Effect: highlights are half height n = snprintf(buf, sizeof(buf), "7 0 obj\n" "<<\n" @@ -635,10 +651,10 @@ bool TessPDFRenderer::BeginDocumentHandler() { " /Type /FontDescriptor\n" ">>\n" "endobj\n", - 1000 / kCharHeight, - 1000 / kCharHeight, + 1000, + 1000, 1000 / kCharWidth, - 1000 / kCharHeight, + 1000, 8L // Font data ); if (n >= sizeof(buf)) return false; @@ -703,11 +719,6 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix, L_COMP_DATA *cid = NULL; const int kJpegQuality = 85; - // TODO(jbreiden) Leptonica 1.71 doesn't correctly handle certain - // types of PNG files, especially if there are 2 samples per pixel. - // We can get rid of this logic after Leptonica 1.72 is released and - // has propagated everywhere. Bug discussion as follows. 
- // https://code.google.com/p/tesseract-ocr/issues/detail?id=1300 int format, sad; findFileFormat(filename, &format); if (pixGetSpp(pix) == 4 && format == IFF_PNG) { @@ -819,10 +830,6 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix, *pdf_object_size = b1_len + colorspace_len + b2_len + cid->nbytescomp + b3_len; *pdf_object = new char[*pdf_object_size]; - if (!pdf_object) { - l_CIDataDestroy(&cid); - return false; - } char *p = *pdf_object; memcpy(p, b1, b1_len); @@ -841,6 +848,7 @@ bool TessPDFRenderer::imageToPDFObj(Pix *pix, bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) { size_t n; char buf[kBasicBufSize]; + char buf2[kBasicBufSize]; Pix *pix = api->GetInputImage(); char *filename = (char *)api->GetInputName(); int ppi = api->GetSourceYResolution(); @@ -849,6 +857,9 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) { double width = pixGetWidth(pix) * 72.0 / ppi; double height = pixGetHeight(pix) * 72.0 / ppi; + snprintf(buf2, sizeof(buf2), "/XObject << /Im1 %ld 0 R >>\n", obj_ + 2); + const char *xobject = (textonly_) ? 
"" : buf2; + // PAGE n = snprintf(buf, sizeof(buf), "%ld 0 obj\n" @@ -859,19 +870,18 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) { " /Contents %ld 0 R\n" " /Resources\n" " <<\n" - " /XObject << /Im1 %ld 0 R >>\n" + " %s" " /ProcSet [ /PDF /Text /ImageB /ImageI /ImageC ]\n" " /Font << /f-0-0 %ld 0 R >>\n" " >>\n" ">>\n" "endobj\n", obj_, - 2L, // Pages object - width, - height, - obj_ + 1, // Contents object - obj_ + 2, // Image object - 3L); // Type0 Font + 2L, // Pages object + width, height, + obj_ + 1, // Contents object + xobject, // Image object + 3L); // Type0 Font if (n >= sizeof(buf)) return false; pages_.push_back(obj_); AppendPDFObject(buf); @@ -908,13 +918,15 @@ bool TessPDFRenderer::AddImageHandler(TessBaseAPI* api) { objsize += strlen(b2); AppendPDFObjectDIY(objsize); - char *pdf_object; - if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) { - return false; + if (!textonly_) { + char *pdf_object = nullptr; + if (!imageToPDFObj(pix, filename, obj_, &pdf_object, &objsize)) { + return false; + } + AppendData(pdf_object, objsize); + AppendPDFObjectDIY(objsize); + delete[] pdf_object; } - AppendData(pdf_object, objsize); - AppendPDFObjectDIY(objsize); - delete[] pdf_object; return true; } @@ -958,15 +970,27 @@ bool TessPDFRenderer::EndDocumentHandler() { offsets_.back() += pages_objsize; // manipulation #2 // INFO + STRING utf16_title = "FEFF"; // byte_order_marker + GenericVector unicodes; + UNICHAR::UTF8ToUnicode(title(), &unicodes); + char utf16[kMaxBytesPerCodepoint]; + for (int i = 0; i < unicodes.length(); i++) { + int code = unicodes[i]; + if (CodepointToUtf16be(code, utf16)) { + utf16_title += utf16; + } + } + char* datestr = l_getFormattedDate(); n = snprintf(buf, sizeof(buf), "%ld 0 obj\n" "<<\n" " /Producer (Tesseract %s)\n" " /CreationDate (D:%s)\n" - " /Title (%s)" + " /Title <%s>\n" ">>\n" - "endobj\n", obj_, TESSERACT_VERSION_STR, datestr, title()); + "endobj\n", + obj_, TESSERACT_VERSION_STR, datestr, 
utf16_title.c_str()); lept_free(datestr); if (n >= sizeof(buf)) return false; AppendPDFObject(buf); diff --git a/api/renderer.cpp b/api/renderer.cpp index 4a88a246..e6831493 100644 --- a/api/renderer.cpp +++ b/api/renderer.cpp @@ -155,11 +155,11 @@ TessHOcrRenderer::TessHOcrRenderer(const char *outputbase, bool font_info) bool TessHOcrRenderer::BeginDocumentHandler() { AppendString( - "\n" - "\n" - "\n \n "); + "<?xml version=\"1.0\" encoding=\"UTF-8\"?>\n" + "<!DOCTYPE html PUBLIC \"-//W3C//DTD XHTML 1.0 Transitional//EN\"\n" + " \"http://www.w3.org/TR/xhtml1/DTD/xhtml1-transitional.dtd\">\n" + "<html xmlns=\"http://www.w3.org/1999/xhtml\" xml:lang=\"en\" " + "lang=\"en\">\n <head>\n <title>"); AppendString(title()); AppendString( "\n" @@ -198,25 +198,25 @@ bool TessHOcrRenderer::AddImageHandler(TessBaseAPI* api) { /********************************************************************** * TSV Text Renderer interface implementation **********************************************************************/ -TessTsvRenderer::TessTsvRenderer(const char *outputbase) +TessTsvRenderer::TessTsvRenderer(const char* outputbase) : TessResultRenderer(outputbase, "tsv") { - font_info_ = false; + font_info_ = false; } -TessTsvRenderer::TessTsvRenderer(const char *outputbase, bool font_info) +TessTsvRenderer::TessTsvRenderer(const char* outputbase, bool font_info) : TessResultRenderer(outputbase, "tsv") { - font_info_ = font_info; + font_info_ = font_info; } bool TessTsvRenderer::BeginDocumentHandler() { // Output TSV column headings - AppendString("level\tpage_num\tblock_num\tpar_num\tline_num\tword_num\tleft\ttop\twidth\theight\tconf\ttext\n"); + AppendString( + "level\tpage_num\tblock_num\tpar_num\tline_num\tword_" + "num\tleft\ttop\twidth\theight\tconf\ttext\n"); return true; } -bool TessTsvRenderer::EndDocumentHandler() { - return true; -} +bool TessTsvRenderer::EndDocumentHandler() { return true; } bool TessTsvRenderer::AddImageHandler(TessBaseAPI* api) { char* tsv = 
api->GetTSVText(imagenum()); @@ -266,8 +266,7 @@ bool TessBoxTextRenderer::AddImageHandler(TessBaseAPI* api) { * Osd Text Renderer interface implementation **********************************************************************/ TessOsdRenderer::TessOsdRenderer(const char* outputbase) - : TessResultRenderer(outputbase, "osd") { -} + : TessResultRenderer(outputbase, "osd") {} bool TessOsdRenderer::AddImageHandler(TessBaseAPI* api) { char* osd = api->GetOsdText(imagenum()); diff --git a/api/renderer.h b/api/renderer.h index 6b47813f..a6f6d1e7 100644 --- a/api/renderer.h +++ b/api/renderer.h @@ -15,8 +15,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_API_RENDERER_H__ -#define TESSERACT_API_RENDERER_H__ +#ifndef TESSERACT_API_RENDERER_H_ +#define TESSERACT_API_RENDERER_H_ // To avoid collision with other typenames include the ABSOLUTE MINIMUM // complexity of includes here. Use forward declarations wherever possible @@ -57,6 +57,7 @@ class TESS_API TessResultRenderer { /** * Starts a new document with the given title. * This clears the contents of the output data. + * Title should use UTF-8 encoding. 
*/ bool BeginDocument(const char* title); @@ -77,7 +78,7 @@ class TESS_API TessResultRenderer { bool EndDocument(); const char* file_extension() const { return file_extension_; } - const char* title() const { return title_; } + const char* title() const { return title_.c_str(); } /** * Returns the index of the last image given to AddImage @@ -126,7 +127,7 @@ class TESS_API TessResultRenderer { private: const char* file_extension_; // standard extension for generated output - const char* title_; // title of document being renderered + STRING title_; // title of document being renderered int imagenum_; // index of last image added FILE* fout_; // output file pointer @@ -153,13 +154,13 @@ class TESS_API TessHOcrRenderer : public TessResultRenderer { explicit TessHOcrRenderer(const char *outputbase, bool font_info); explicit TessHOcrRenderer(const char *outputbase); -protected: + protected: virtual bool BeginDocumentHandler(); virtual bool AddImageHandler(TessBaseAPI* api); virtual bool EndDocumentHandler(); -private: - bool font_info_; // whether to print font information + private: + bool font_info_; // whether to print font information }; /** @@ -167,15 +168,15 @@ private: */ class TESS_API TessTsvRenderer : public TessResultRenderer { public: - explicit TessTsvRenderer(const char *outputbase, bool font_info); - explicit TessTsvRenderer(const char *outputbase); + explicit TessTsvRenderer(const char* outputbase, bool font_info); + explicit TessTsvRenderer(const char* outputbase); -protected: + protected: virtual bool BeginDocumentHandler(); virtual bool AddImageHandler(TessBaseAPI* api); virtual bool EndDocumentHandler(); -private: + private: bool font_info_; // whether to print font information }; @@ -186,30 +187,30 @@ class TESS_API TessPDFRenderer : public TessResultRenderer { public: // datadir is the location of the TESSDATA. We need it because // we load a custom PDF font from this location. 
- TessPDFRenderer(const char *outputbase, const char *datadir); + TessPDFRenderer(const char* outputbase, const char* datadir, bool textonly); -protected: + protected: virtual bool BeginDocumentHandler(); virtual bool AddImageHandler(TessBaseAPI* api); virtual bool EndDocumentHandler(); -private: + private: // We don't want to have every image in memory at once, // so we store some metadata as we go along producing - // PDFs one page at a time. At the end that metadata is + // PDFs one page at a time. At the end, that metadata is // used to make everything that isn't easily handled in a // streaming fashion. long int obj_; // counter for PDF objects GenericVector offsets_; // offset of every PDF object in bytes GenericVector pages_; // object number for every /Page object const char *datadir_; // where to find the custom font + bool textonly_; // skip images if set // Bookkeeping only. DIY = Do It Yourself. void AppendPDFObjectDIY(size_t objectsize); // Bookkeeping + emit data. void AppendPDFObject(const char *data); // Create the /Contents object for an entire page. - static char* GetPDFTextObjects(TessBaseAPI* api, - double width, double height); + char* GetPDFTextObjects(TessBaseAPI* api, double width, double height); // Turn an image into a PDF object. Only transcode if we have to. static bool imageToPDFObj(Pix *pix, char *filename, long int objnum, char **pdf_object, long int *pdf_object_size); @@ -251,4 +252,4 @@ class TESS_API TessOsdRenderer : public TessResultRenderer { } // namespace tesseract. 
-#endif // TESSERACT_API_RENDERER_H__ +#endif // TESSERACT_API_RENDERER_H_ diff --git a/api/tesseractmain.cpp b/api/tesseractmain.cpp index 798888fc..9e6d35d4 100644 --- a/api/tesseractmain.cpp +++ b/api/tesseractmain.cpp @@ -27,70 +27,82 @@ #include "allheaders.h" #include "baseapi.h" #include "basedir.h" -#include "renderer.h" -#include "strngs.h" -#include "tprintf.h" #include "openclwrapper.h" #include "osdetect.h" +#include "renderer.h" +#include "simddetect.h" +#include "strngs.h" +#include "tprintf.h" #if defined(HAVE_TIFFIO_H) && defined(_WIN32) #include -#include static void Win32WarningHandler(const char* module, const char* fmt, va_list ap) { - if (module != NULL) { - fprintf(stderr, "%s: ", module); - } - fprintf(stderr, "Warning, "); - vfprintf(stderr, fmt, ap); - fprintf(stderr, ".\n"); + if (module != NULL) { + fprintf(stderr, "%s: ", module); + } + fprintf(stderr, "Warning, "); + vfprintf(stderr, fmt, ap); + fprintf(stderr, ".\n"); } #endif /* HAVE_TIFFIO_H && _WIN32 */ void PrintVersionInfo() { - char *versionStrP; + char* versionStrP; - printf("tesseract %s\n", tesseract::TessBaseAPI::Version()); + printf("tesseract %s\n", tesseract::TessBaseAPI::Version()); - versionStrP = getLeptonicaVersion(); - printf(" %s\n", versionStrP); - lept_free(versionStrP); + versionStrP = getLeptonicaVersion(); + printf(" %s\n", versionStrP); + lept_free(versionStrP); - versionStrP = getImagelibVersions(); - printf(" %s\n", versionStrP); - lept_free(versionStrP); + versionStrP = getImagelibVersions(); + printf(" %s\n", versionStrP); + lept_free(versionStrP); #ifdef USE_OPENCL - cl_platform_id platform; - cl_uint num_platforms; - cl_device_id devices[2]; - cl_uint num_devices; - char info[256]; - int i; + cl_platform_id platform[4]; + cl_uint num_platforms; - printf(" OpenCL info:\n"); - clGetPlatformIDs(1, &platform, &num_platforms); - printf(" Found %d platforms.\n", num_platforms); - clGetPlatformInfo(platform, CL_PLATFORM_NAME, 256, info, 0); - printf(" Platform 
name: %s.\n", info); - clGetPlatformInfo(platform, CL_PLATFORM_VERSION, 256, info, 0); - printf(" Version: %s.\n", info); - clGetDeviceIDs(platform, CL_DEVICE_TYPE_ALL, 2, devices, &num_devices); - printf(" Found %d devices.\n", num_devices); - for (i = 0; i < num_devices; ++i) { - clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 256, info, 0); - printf(" Device %d name: %s.\n", i+1, info); + printf(" OpenCL info:\n"); + if (clGetPlatformIDs(4, platform, &num_platforms) == CL_SUCCESS) { + printf(" Found %u platform(s).\n", num_platforms); + for (unsigned n = 0; n < num_platforms; n++) { + char info[256]; + if (clGetPlatformInfo(platform[n], CL_PLATFORM_NAME, 256, info, 0) == + CL_SUCCESS) { + printf(" Platform %u name: %s.\n", n + 1, info); + } + if (clGetPlatformInfo(platform[n], CL_PLATFORM_VERSION, 256, info, 0) == + CL_SUCCESS) { + printf(" Version: %s.\n", info); + } + cl_device_id devices[2]; + cl_uint num_devices; + if (clGetDeviceIDs(platform[n], CL_DEVICE_TYPE_ALL, 2, devices, + &num_devices) == CL_SUCCESS) { + printf(" Found %u device(s).\n", num_devices); + for (unsigned i = 0; i < num_devices; ++i) { + if (clGetDeviceInfo(devices[i], CL_DEVICE_NAME, 256, info, 0) == + CL_SUCCESS) { + printf(" Device %u name: %s.\n", i + 1, info); + } + } + } + } } #endif + if (SIMDDetect::IsAVXAvailable()) printf(" Found AVX\n"); + if (SIMDDetect::IsSSEAvailable()) printf(" Found SSE\n"); } void PrintUsage(const char* program) { printf( "Usage:\n" - " %s --help | --help-psm | --version\n" + " %s --help | --help-psm | --help-oem | --version\n" " %s --list-langs [--tessdata-dir PATH]\n" " %s --print-parameters [options...] [configfile...]\n" " %s imagename|stdin outputbase|stdout [options...] 
[configfile...]\n", @@ -100,27 +112,33 @@ void PrintUsage(const char* program) { void PrintHelpForPSM() { const char* msg = "Page segmentation modes:\n" - " 0 Orientation and script detection (OSD) only.\n" - " 1 Automatic page segmentation with OSD.\n" - " 2 Automatic page segmentation, but no OSD, or OCR.\n" - " 3 Fully automatic page segmentation, but no OSD. (Default)\n" - " 4 Assume a single column of text of variable sizes.\n" - " 5 Assume a single uniform block of vertically aligned text.\n" - " 6 Assume a single uniform block of text.\n" - " 7 Treat the image as a single text line.\n" - " 8 Treat the image as a single word.\n" - " 9 Treat the image as a single word in a circle.\n" - " 10 Treat the image as a single character.\n" + " 0 Orientation and script detection (OSD) only.\n" + " 1 Automatic page segmentation with OSD.\n" + " 2 Automatic page segmentation, but no OSD, or OCR.\n" + " 3 Fully automatic page segmentation, but no OSD. (Default)\n" + " 4 Assume a single column of text of variable sizes.\n" + " 5 Assume a single uniform block of vertically aligned text.\n" + " 6 Assume a single uniform block of text.\n" + " 7 Treat the image as a single text line.\n" + " 8 Treat the image as a single word.\n" + " 9 Treat the image as a single word in a circle.\n" + " 10 Treat the image as a single character.\n" + " 11 Sparse text. Find as much text as possible in no" + " particular order.\n" + " 12 Sparse text with OSD.\n" + " 13 Raw line. Treat the image as a single text line,\n" + "\t\t\tbypassing hacks that are Tesseract-specific.\n"; - //TODO: Consider publishing these modes. - #if 0 - " 11 Sparse text. Find as much text as possible in no" - " particular order.\n" - " 12 Sparse text with OSD.\n" - " 13 Raw line. 
Treat the image as a single text line,\n" - "\t\t\tbypassing hacks that are Tesseract-specific.\n" - #endif - ; + printf("%s", msg); +} + +void PrintHelpForOEM() { + const char* msg = + "OCR Engine modes:\n" + " 0 Original Tesseract only.\n" + " 1 Neural nets LSTM only.\n" + " 2 Tesseract + LSTM.\n" + " 3 Default, based on what is available.\n"; printf("%s", msg); } @@ -136,32 +154,34 @@ void PrintHelpMessage(const char* program) { " -l LANG[+LANG] Specify language(s) used for OCR.\n" " -c VAR=VALUE Set value for config variables.\n" " Multiple -c arguments are allowed.\n" - " -psm NUM Specify page segmentation mode.\n" - "NOTE: These options must occur before any configfile.\n" - ; + " --psm NUM Specify page segmentation mode.\n" + " --oem NUM Specify OCR Engine mode.\n" + "NOTE: These options must occur before any configfile.\n"; printf("\n%s\n", ocr_options); PrintHelpForPSM(); + PrintHelpForOEM(); - const char *single_options = + const char* single_options = "Single options:\n" " -h, --help Show this help message.\n" " --help-psm Show page segmentation modes.\n" + " --help-oem Show OCR Engine modes.\n" " -v, --version Show version information.\n" " --list-langs List available languages for tesseract engine.\n" - " --print-parameters Print tesseract parameters to stdout.\n" - ; + " --print-parameters Print tesseract parameters.\n"; printf("\n%s", single_options); } -void SetVariablesFromCLArgs(tesseract::TessBaseAPI* api, int argc, char** argv) { +void SetVariablesFromCLArgs(tesseract::TessBaseAPI* api, int argc, + char** argv) { char opt1[256], opt2[255]; for (int i = 0; i < argc; i++) { if (strcmp(argv[i], "-c") == 0 && i + 1 < argc) { strncpy(opt1, argv[i + 1], 255); opt1[255] = '\0'; - char *p = strchr(opt1, '='); + char* p = strchr(opt1, '='); if (!p) { fprintf(stderr, "Missing = in configvar assignment\n"); exit(1); @@ -190,8 +210,8 @@ void PrintLangsList(tesseract::TessBaseAPI* api) { } void PrintBanner() { - tprintf("Tesseract Open Source OCR Engine v%s 
with Leptonica\n", - tesseract::TessBaseAPI::Version()); + tprintf("Tesseract Open Source OCR Engine v%s with Leptonica\n", + tesseract::TessBaseAPI::Version()); } /** @@ -209,31 +229,26 @@ void PrintBanner() { * but that doesn't work. */ void FixPageSegMode(tesseract::TessBaseAPI* api, - tesseract::PageSegMode pagesegmode) { + tesseract::PageSegMode pagesegmode) { if (api->GetPageSegMode() == tesseract::PSM_SINGLE_BLOCK) - api->SetPageSegMode(pagesegmode); + api->SetPageSegMode(pagesegmode); } // NOTE: arg_i is used here to avoid ugly *i so many times in this function -void ParseArgs(const int argc, char** argv, - const char** lang, - const char** image, - const char** outputbase, - const char** datapath, - bool* list_langs, - bool* print_parameters, - GenericVector* vars_vec, - GenericVector* vars_values, - int* arg_i, - tesseract::PageSegMode* pagesegmode) { +void ParseArgs(const int argc, char** argv, const char** lang, + const char** image, const char** outputbase, + const char** datapath, bool* list_langs, bool* print_parameters, + GenericVector* vars_vec, + GenericVector* vars_values, int* arg_i, + tesseract::PageSegMode* pagesegmode, + tesseract::OcrEngineMode* enginemode) { if (argc == 1) { PrintHelpMessage(argv[0]); exit(0); } if (argc == 2) { - if ((strcmp(argv[1], "-h") == 0) || - (strcmp(argv[1], "--help") == 0)) { + if ((strcmp(argv[1], "-h") == 0) || (strcmp(argv[1], "--help") == 0)) { PrintHelpMessage(argv[0]); exit(0); } @@ -241,8 +256,11 @@ void ParseArgs(const int argc, char** argv, PrintHelpForPSM(); exit(0); } - if ((strcmp(argv[1], "-v") == 0) || - (strcmp(argv[1], "--version") == 0)) { + if ((strcmp(argv[1], "--help-oem") == 0)) { + PrintHelpForOEM(); + exit(0); + } + if ((strcmp(argv[1], "-v") == 0) || (strcmp(argv[1], "--version") == 0)) { PrintVersionInfo(); exit(0); } @@ -269,8 +287,16 @@ void ParseArgs(const int argc, char** argv, noocr = true; *list_langs = true; } else if (strcmp(argv[i], "-psm") == 0 && i + 1 < argc) { + // The 
parameter -psm is deprecated and was replaced by --psm. + // It is still supported for compatibility reasons. *pagesegmode = static_cast(atoi(argv[i + 1])); ++i; + } else if (strcmp(argv[i], "--psm") == 0 && i + 1 < argc) { + *pagesegmode = static_cast(atoi(argv[i + 1])); + ++i; + } else if (strcmp(argv[i], "--oem") == 0 && i + 1 < argc) { + *enginemode = static_cast(atoi(argv[i + 1])); + ++i; } else if (strcmp(argv[i], "--print-parameters") == 0) { noocr = true; *print_parameters = true; @@ -298,10 +324,10 @@ void ParseArgs(const int argc, char** argv, } } -void PreloadRenderers(tesseract::TessBaseAPI* api, - tesseract::PointerVector* renderers, - tesseract::PageSegMode pagesegmode, - const char* outputbase) { +void PreloadRenderers( + tesseract::TessBaseAPI* api, + tesseract::PointerVector* renderers, + tesseract::PageSegMode pagesegmode, const char* outputbase) { if (pagesegmode == tesseract::PSM_OSD_ONLY) { renderers->push_back(new tesseract::TessOsdRenderer(outputbase)); } else { @@ -311,7 +337,7 @@ void PreloadRenderers(tesseract::TessBaseAPI* api, bool font_info; api->GetBoolVariable("hocr_font_info", &font_info); renderers->push_back( - new tesseract::TessHOcrRenderer(outputbase, font_info)); + new tesseract::TessHOcrRenderer(outputbase, font_info)); } api->GetBoolVariable("tessedit_create_tsv", &b); @@ -324,8 +350,10 @@ void PreloadRenderers(tesseract::TessBaseAPI* api, api->GetBoolVariable("tessedit_create_pdf", &b); if (b) { - renderers->push_back(new tesseract::TessPDFRenderer(outputbase, - api->GetDatapath())); + bool textonly; + api->GetBoolVariable("textonly_pdf", &textonly); + renderers->push_back(new tesseract::TessPDFRenderer( + outputbase, api->GetDatapath(), textonly)); } api->GetBoolVariable("tessedit_write_unlv", &b); @@ -358,26 +386,36 @@ void PreloadRenderers(tesseract::TessBaseAPI* api, * main() * **********************************************************************/ -int main(int argc, char **argv) { + +int main(int argc, char** argv) { 
const char* lang = "eng"; const char* image = NULL; const char* outputbase = NULL; const char* datapath = NULL; bool list_langs = false; bool print_parameters = false; - GenericVector vars_vec, vars_values; int arg_i = 1; tesseract::PageSegMode pagesegmode = tesseract::PSM_AUTO; + tesseract::OcrEngineMode enginemode = tesseract::OEM_DEFAULT; + /* main() calls functions like ParseArgs which call exit(). + * This results in memory leaks if vars_vec and vars_values are + * declared as auto variables (destructor is not called then). */ + static GenericVector vars_vec; + static GenericVector vars_values; + +#if !defined(DEBUG) + // Disable debugging and informational messages from Leptonica. + setMsgSeverity(L_SEVERITY_ERROR); +#endif #if defined(HAVE_TIFFIO_H) && defined(_WIN32) /* Show libtiff warnings on console (not in GUI). */ TIFFSetWarningHandler(Win32WarningHandler); #endif /* HAVE_TIFFIO_H && _WIN32 */ - ParseArgs(argc, argv, - &lang, &image, &outputbase, &datapath, - &list_langs, &print_parameters, - &vars_vec, &vars_values, &arg_i, &pagesegmode); + ParseArgs(argc, argv, &lang, &image, &outputbase, &datapath, &list_langs, + &print_parameters, &vars_vec, &vars_values, &arg_i, &pagesegmode, + &enginemode); bool banner = false; if (outputbase != NULL && strcmp(outputbase, "-") && @@ -390,8 +428,8 @@ int main(int argc, char **argv) { api.SetOutputName(outputbase); - int init_failed = api.Init(datapath, lang, tesseract::OEM_DEFAULT, - &(argv[arg_i]), argc - arg_i, &vars_vec, &vars_values, false); + int init_failed = api.Init(datapath, lang, enginemode, &(argv[arg_i]), + argc - arg_i, &vars_vec, &vars_values, false); if (init_failed) { fprintf(stderr, "Could not initialize tesseract.\n"); exit(1); @@ -400,8 +438,8 @@ int main(int argc, char **argv) { SetVariablesFromCLArgs(&api, argc, argv); if (list_langs) { - PrintLangsList(&api); - exit(0); + PrintLangsList(&api); + exit(0); } if (print_parameters) { @@ -430,12 +468,13 @@ int main(int argc, char **argv) { 
tesseract::TextlineOrder order; float deskew_angle; - tesseract::PageIterator* it = api.AnalyseLayout(); + tesseract::PageIterator* it = api.AnalyseLayout(); if (it) { it->Orientation(&orientation, &direction, &order, &deskew_angle); - tprintf("Orientation: %d\nWritingDirection: %d\nTextlineOrder: %d\n" \ - "Deskew angle: %.4f\n", - orientation, direction, order, deskew_angle); + tprintf( + "Orientation: %d\nWritingDirection: %d\nTextlineOrder: %d\n" + "Deskew angle: %.4f\n", + orientation, direction, order, deskew_angle); } else { ret_val = 1; } @@ -450,14 +489,12 @@ int main(int argc, char **argv) { // ambigs.train, box.train, box.train.stderr, linebox, rebox bool b = false; bool in_training_mode = - (api.GetBoolVariable("tessedit_ambigs_training", &b) && b) || - (api.GetBoolVariable("tessedit_resegment_from_boxes", &b) && b) || - (api.GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b); + (api.GetBoolVariable("tessedit_ambigs_training", &b) && b) || + (api.GetBoolVariable("tessedit_resegment_from_boxes", &b) && b) || + (api.GetBoolVariable("tessedit_make_boxes_from_boxes", &b) && b); tesseract::PointerVector renderers; - - if (in_training_mode) { renderers.push_back(NULL); } else { diff --git a/appveyor.yml b/appveyor.yml index 020331c4..0f4fc60a 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -1,4 +1,4 @@ -os: Visual Studio 2015 +os: Visual Studio 2017 platform: - Win32 @@ -6,20 +6,40 @@ platform: configuration: - Release + +# for curl +install: + - set PATH=C:\Program Files\Git\mingw64\bin;%PATH% before_build: - - if %platform%==Win32 set generator=Visual Studio 14 - - if %platform%==Win64 set generator=Visual Studio 14 Win64 + - if %platform%==Win32 set generator=Visual Studio 15 2017 + - if %platform%==Win64 set generator=Visual Studio 15 2017 Win64 - if %platform%==Win32 set vcplatform=Win32 - if %platform%==Win64 set vcplatform=x64 - - - curl -fsS -o cppan.zip https://cppan.org/client/cppan-master-Windows-client.zip + + - curl -fsS -L -o cppan.zip 
https://cppan.org/client/cppan-master-Windows-client.zip - 7z x cppan.zip - set PATH=%PATH%;%cd% + + - cppan # dummy run to create %USERPROFILE%\.cppan\cppan.yml + - ps: 'Add-Content $env:USERPROFILE\.cppan\cppan.yml "`n`nbuild_warning_level: 0`n"' + - ps: 'Add-Content $env:USERPROFILE\.cppan\cppan.yml "`n`nbuild_system_verbose: false`n"' + - ps: 'Add-Content $env:USERPROFILE\.cppan\cppan.yml "`n`nvar_check_jobs: 1`n"' build_script: - - cppan - mkdir build + - mkdir build\bin + - mkdir build\bin\Release - cd build - - cmake .. -G "%generator%" -DSTATIC=1 - - msbuild tesseract.sln /p:Platform=%vcplatform% /logger:"C:\Program Files\AppVeyor\BuildAgent\Appveyor.MSBuildLogger.dll" + #- cmd: 'echo local_settings: > cppan.yml' + #- cmd: 'echo generator: %generator% >> cppan.yml' + #- cmd: 'echo use_shared_libs: true >> cppan.yml' + #- cppan --build .. + - cmake .. -G "%generator%" -DBUILD_TRAINING_TOOLS=Off -DAPPVEYOR=1 + - cmake --build . --config Release > bin\Release\log.txt 2>&1 + +artifacts: + - path: build\bin\Release + #- path: build + name: tesseract-$(APPVEYOR_BUILD_VERSION) + diff --git a/arch/Makefile.am b/arch/Makefile.am new file mode 100644 index 00000000..20c12aff --- /dev/null +++ b/arch/Makefile.am @@ -0,0 +1,38 @@ +AM_CPPFLAGS += -I$(top_srcdir)/ccutil -I$(top_srcdir)/viewer +AUTOMAKE_OPTIONS = subdir-objects +SUBDIRS = +AM_CXXFLAGS = + +if VISIBILITY +AM_CXXFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden +AM_CPPFLAGS += -DTESS_EXPORTS +endif + +include_HEADERS = dotproductavx.h dotproductsse.h simddetect.h + +noinst_HEADERS = + +if !USING_MULTIPLELIBS +noinst_LTLIBRARIES = libtesseract_avx.la libtesseract_sse.la +noinst_LTLIBRARIES += libtesseract_arch.la +else +lib_LTLIBRARIES = libtesseract_avx.la libtesseract_sse.la +lib_LTLIBRARIES += libtesseract_arch.la +libtesseract_arch_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION) +libtesseract_avx_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION) +libtesseract_sse_la_LDFLAGS = 
-version-info $(GENERIC_LIBRARY_VERSION) +endif + +if AVX_OPT +libtesseract_avx_la_CXXFLAGS = -mavx +endif +if SSE41_OPT +libtesseract_sse_la_CXXFLAGS = -msse4.1 +endif + +libtesseract_arch_la_SOURCES = simddetect.cpp + +libtesseract_avx_la_SOURCES = dotproductavx.cpp + +libtesseract_sse_la_SOURCES = dotproductsse.cpp + diff --git a/arch/dotproductavx.cpp b/arch/dotproductavx.cpp new file mode 100644 index 00000000..d78feff7 --- /dev/null +++ b/arch/dotproductavx.cpp @@ -0,0 +1,112 @@ +/////////////////////////////////////////////////////////////////////// +// File: dotproductavx.cpp +// Description: Architecture-specific dot-product function. +// Author: Ray Smith +// Created: Wed Jul 22 10:48:05 PDT 2015 +// +// (C) Copyright 2015, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#if !defined(__AVX__) +// Implementation for non-avx archs. + +#include "dotproductavx.h" +#include +#include + +namespace tesseract { +double DotProductAVX(const double* u, const double* v, int n) { + fprintf(stderr, "DotProductAVX can't be used on Android\n"); + abort(); +} +} // namespace tesseract + +#else // !defined(__AVX__) +// Implementation for avx capable archs. +#include +#include +#include "dotproductavx.h" +#include "host.h" + +namespace tesseract { + +// Computes and returns the dot product of the n-vectors u and v. +// Uses Intel AVX intrinsics to access the SIMD instruction set. 
+double DotProductAVX(const double* u, const double* v, int n) { + int max_offset = n - 4; + int offset = 0; + // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and + // v, and multiplying them together in parallel. + __m256d sum = _mm256_setzero_pd(); + if (offset <= max_offset) { + offset = 4; + // Aligned load is reputedly faster but requires 32 byte aligned input. + if ((reinterpret_cast(u) & 31) == 0 && + (reinterpret_cast(v) & 31) == 0) { + // Use aligned load. + __m256d floats1 = _mm256_load_pd(u); + __m256d floats2 = _mm256_load_pd(v); + // Multiply. + sum = _mm256_mul_pd(floats1, floats2); + while (offset <= max_offset) { + floats1 = _mm256_load_pd(u + offset); + floats2 = _mm256_load_pd(v + offset); + offset += 4; + __m256d product = _mm256_mul_pd(floats1, floats2); + sum = _mm256_add_pd(sum, product); + } + } else { + // Use unaligned load. + __m256d floats1 = _mm256_loadu_pd(u); + __m256d floats2 = _mm256_loadu_pd(v); + // Multiply. + sum = _mm256_mul_pd(floats1, floats2); + while (offset <= max_offset) { + floats1 = _mm256_loadu_pd(u + offset); + floats2 = _mm256_loadu_pd(v + offset); + offset += 4; + __m256d product = _mm256_mul_pd(floats1, floats2); + sum = _mm256_add_pd(sum, product); + } + } + } + // Add the 4 product sums together horizontally. Not so easy as with sse, as + // there is no add across the upper/lower 128 bit boundary, so permute to + // move the upper 128 bits to lower in another register. + __m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1); + sum = _mm256_hadd_pd(sum, sum2); + sum = _mm256_hadd_pd(sum, sum); + double result; + // _mm256_extract_f64 doesn't exist, but resist the temptation to use an sse + // instruction, as that introduces a 70 cycle delay. All this casting is to + // fool the intrinsics into thinking we are extracting the bottom int64. 
+ auto cast_sum = _mm256_castpd_si256(sum); + *(reinterpret_cast(&result)) = +#if defined(_WIN32) || defined(__i386__) + // This is a very simple workaround that is activated + // for all platforms that do not have _mm256_extract_epi64. + // _mm256_extract_epi64(X, Y) == ((uint64_t*)&X)[Y] + ((uint64_t*)&cast_sum)[0] +#else + _mm256_extract_epi64(cast_sum, 0) +#endif + ; + while (offset < n) { + result += u[offset] * v[offset]; + ++offset; + } + return result; +} + +} // namespace tesseract. + +#endif // ANDROID_BUILD diff --git a/arch/dotproductavx.h b/arch/dotproductavx.h new file mode 100644 index 00000000..ef00cdfb --- /dev/null +++ b/arch/dotproductavx.h @@ -0,0 +1,30 @@ +/////////////////////////////////////////////////////////////////////// +// File: dotproductavx.h +// Description: Architecture-specific dot-product function. +// Author: Ray Smith +// Created: Wed Jul 22 10:51:05 PDT 2015 +// +// (C) Copyright 2015, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_ARCH_DOTPRODUCTAVX_H_ +#define TESSERACT_ARCH_DOTPRODUCTAVX_H_ + +namespace tesseract { + +// Computes and returns the dot product of the n-vectors u and v. +// Uses Intel AVX intrinsics to access the SIMD instruction set. +double DotProductAVX(const double* u, const double* v, int n); + +} // namespace tesseract. 
+ +#endif // TESSERACT_ARCH_DOTPRODUCTAVX_H_ diff --git a/arch/dotproductsse.cpp b/arch/dotproductsse.cpp new file mode 100644 index 00000000..cc5c2455 --- /dev/null +++ b/arch/dotproductsse.cpp @@ -0,0 +1,141 @@ +/////////////////////////////////////////////////////////////////////// +// File: dotproductsse.cpp +// Description: Architecture-specific dot-product function. +// Author: Ray Smith +// Created: Wed Jul 22 10:57:45 PDT 2015 +// +// (C) Copyright 2015, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#if !defined(__SSE4_1__) +// This code can't compile with "-msse4.1", so use dummy stubs. + +#include "dotproductsse.h" +#include +#include + +namespace tesseract { +double DotProductSSE(const double* u, const double* v, int n) { + fprintf(stderr, "DotProductSSE can't be used on Android\n"); + abort(); +} +inT32 IntDotProductSSE(const inT8* u, const inT8* v, int n) { + fprintf(stderr, "IntDotProductSSE can't be used on Android\n"); + abort(); +} +} // namespace tesseract + +#else // !defined(__SSE4_1__) +// Non-Android code here + +#include +#include +#include +#include "dotproductsse.h" +#include "host.h" + +namespace tesseract { + +// Computes and returns the dot product of the n-vectors u and v. +// Uses Intel SSE intrinsics to access the SIMD instruction set. 
+double DotProductSSE(const double* u, const double* v, int n) { + int max_offset = n - 2; + int offset = 0; + // Accumulate a set of 2 sums in sum, by loading pairs of 2 values from u and + // v, and multiplying them together in parallel. + __m128d sum = _mm_setzero_pd(); + if (offset <= max_offset) { + offset = 2; + // Aligned load is reputedly faster but requires 16 byte aligned input. + if ((reinterpret_cast(u) & 15) == 0 && + (reinterpret_cast(v) & 15) == 0) { + // Use aligned load. + sum = _mm_load_pd(u); + __m128d floats2 = _mm_load_pd(v); + // Multiply. + sum = _mm_mul_pd(sum, floats2); + while (offset <= max_offset) { + __m128d floats1 = _mm_load_pd(u + offset); + floats2 = _mm_load_pd(v + offset); + offset += 2; + floats1 = _mm_mul_pd(floats1, floats2); + sum = _mm_add_pd(sum, floats1); + } + } else { + // Use unaligned load. + sum = _mm_loadu_pd(u); + __m128d floats2 = _mm_loadu_pd(v); + // Multiply. + sum = _mm_mul_pd(sum, floats2); + while (offset <= max_offset) { + __m128d floats1 = _mm_loadu_pd(u + offset); + floats2 = _mm_loadu_pd(v + offset); + offset += 2; + floats1 = _mm_mul_pd(floats1, floats2); + sum = _mm_add_pd(sum, floats1); + } + } + } + // Add the 2 sums in sum horizontally. + sum = _mm_hadd_pd(sum, sum); + // Extract the low result. + double result = _mm_cvtsd_f64(sum); + // Add on any left-over products. + while (offset < n) { + result += u[offset] * v[offset]; + ++offset; + } + return result; +} + +// Computes and returns the dot product of the n-vectors u and v. +// Uses Intel SSE intrinsics to access the SIMD instruction set. +inT32 IntDotProductSSE(const inT8* u, const inT8* v, int n) { + int max_offset = n - 8; + int offset = 0; + // Accumulate a set of 4 32-bit sums in sum, by loading 8 pairs of 8-bit + // values, extending to 16 bit, multiplying to make 32 bit results. 
+ __m128i sum = _mm_setzero_si128(); + if (offset <= max_offset) { + offset = 8; + __m128i packed1 = _mm_loadl_epi64(reinterpret_cast(u)); + __m128i packed2 = _mm_loadl_epi64(reinterpret_cast(v)); + sum = _mm_cvtepi8_epi16(packed1); + packed2 = _mm_cvtepi8_epi16(packed2); + // The magic _mm_add_epi16 is perfect here. It multiplies 8 pairs of 16 bit + // ints to make 32 bit results, which are then horizontally added in pairs + // to make 4 32 bit results that still fit in a 128 bit register. + sum = _mm_madd_epi16(sum, packed2); + while (offset <= max_offset) { + packed1 = _mm_loadl_epi64(reinterpret_cast(u + offset)); + packed2 = _mm_loadl_epi64(reinterpret_cast(v + offset)); + offset += 8; + packed1 = _mm_cvtepi8_epi16(packed1); + packed2 = _mm_cvtepi8_epi16(packed2); + packed1 = _mm_madd_epi16(packed1, packed2); + sum = _mm_add_epi32(sum, packed1); + } + } + // Sum the 4 packed 32 bit sums and extract the low result. + sum = _mm_hadd_epi32(sum, sum); + sum = _mm_hadd_epi32(sum, sum); + inT32 result = _mm_cvtsi128_si32(sum); + while (offset < n) { + result += u[offset] * v[offset]; + ++offset; + } + return result; +} + +} // namespace tesseract. + +#endif // ANDROID_BUILD diff --git a/arch/dotproductsse.h b/arch/dotproductsse.h new file mode 100644 index 00000000..fa0a744f --- /dev/null +++ b/arch/dotproductsse.h @@ -0,0 +1,35 @@ +/////////////////////////////////////////////////////////////////////// +// File: dotproductsse.h +// Description: Architecture-specific dot-product function. +// Author: Ray Smith +// Created: Wed Jul 22 10:57:05 PDT 2015 +// +// (C) Copyright 2015, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_ARCH_DOTPRODUCTSSE_H_ +#define TESSERACT_ARCH_DOTPRODUCTSSE_H_ + +#include "host.h" + +namespace tesseract { + +// Computes and returns the dot product of the n-vectors u and v. +// Uses Intel SSE intrinsics to access the SIMD instruction set. +double DotProductSSE(const double* u, const double* v, int n); +// Computes and returns the dot product of the n-vectors u and v. +// Uses Intel SSE intrinsics to access the SIMD instruction set. +inT32 IntDotProductSSE(const inT8* u, const inT8* v, int n); + +} // namespace tesseract. + +#endif // TESSERACT_ARCH_DOTPRODUCTSSE_H_ diff --git a/arch/simddetect.cpp b/arch/simddetect.cpp new file mode 100644 index 00000000..e4c9ee49 --- /dev/null +++ b/arch/simddetect.cpp @@ -0,0 +1,68 @@ +/////////////////////////////////////////////////////////////////////// +// File: simddetect.h +// Description: Architecture detector. +// Author: Stefan Weil (based on code from Ray Smith) +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "simddetect.h" +#include "tprintf.h" + +#undef X86_BUILD +#if defined(__x86_64__) || defined(__i386__) || defined(_WIN32) +#if !defined(ANDROID_BUILD) +#define X86_BUILD 1 +#endif // !ANDROID_BUILD +#endif // x86 target + +#if defined(X86_BUILD) +#if defined(__GNUC__) +#include +#elif defined(_WIN32) +#include +#endif +#endif + +SIMDDetect SIMDDetect::detector; + +// If true, then AVX has been detected. +bool SIMDDetect::avx_available_; +// If true, then SSe4.1 has been detected. +bool SIMDDetect::sse_available_; + +// Constructor. +// Tests the architecture in a system-dependent way to detect AVX, SSE and +// any other available SIMD equipment. +// __GNUC__ is also defined by compilers that include GNU extensions such as +// clang. +SIMDDetect::SIMDDetect() { +#if defined(X86_BUILD) +#if defined(__GNUC__) + unsigned int eax, ebx, ecx, edx; + if (__get_cpuid(1, &eax, &ebx, &ecx, &edx) != 0) { + sse_available_ = (ecx & 0x00080000) != 0; + avx_available_ = (ecx & 0x10000000) != 0; + } +#elif defined(_WIN32) + int cpuInfo[4]; + __cpuid(cpuInfo, 0); + if (cpuInfo[0] >= 1) { + __cpuid(cpuInfo, 1); + sse_available_ = (cpuInfo[2] & 0x00080000) != 0; + avx_available_ = (cpuInfo[2] & 0x10000000) != 0; + } +#else +#error "I don't know how to test for SIMD with this compiler" +#endif +#endif // X86_BUILD +} diff --git a/arch/simddetect.h b/arch/simddetect.h new file mode 100644 index 00000000..17f23d53 --- /dev/null +++ b/arch/simddetect.h @@ -0,0 +1,41 @@ +/////////////////////////////////////////////////////////////////////// +// File: simddetect.h +// Description: Architecture detector. +// Author: Stefan Weil (based on code from Ray Smith) +// +// (C) Copyright 2014, Google Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "platform.h" + +// Architecture detector. Add code here to detect any other architectures for +// SIMD-based faster dot product functions. Intended to be a single static +// object, but it does no real harm to have more than one. +class SIMDDetect { + public: + // Returns true if AVX is available on this system. + static inline bool IsAVXAvailable() { return detector.avx_available_; } + // Returns true if SSE4.1 is available on this system. + static inline bool IsSSEAvailable() { return detector.sse_available_; } + + private: + // Constructor, must set all static member variables. + SIMDDetect(); + + private: + // Singleton. + static SIMDDetect detector; + // If true, then AVX has been detected. + static TESS_API bool avx_available_; + // If true, then SSe4.1 has been detected. + static TESS_API bool sse_available_; +}; diff --git a/autogen.sh b/autogen.sh index ac44d357..7d35f48d 100755 --- a/autogen.sh +++ b/autogen.sh @@ -1,4 +1,13 @@ #!/bin/sh +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. # This is a simple script which is meant to help developers # better deal with the GNU autotools, specifically: @@ -37,7 +46,20 @@ if [ "$1" = "clean" ]; then find . -iname "Makefile.in" -type f -exec rm '{}' + fi -# create m4 directory if it not exists +# Prevent any errors that might result from failing to properly invoke +# `libtoolize` or `glibtoolize,` whichever is present on your system, +# from occurring by testing for its existence and capturing the absolute path to +# its location for caching purposes prior to using it later on in 'Step 2:' +if command -v libtoolize >/dev/null 2>&1; then + LIBTOOLIZE="$(command -v libtoolize)" +elif command -v glibtoolize >/dev/null 2>&1; then + LIBTOOLIZE="$(command -v glibtoolize)" +else + echo "Unable to find a valid copy of libtoolize or glibtoolize in your PATH!" + bail_out +fi + +# create m4 directory if it does not exist if [ ! -d m4 ]; then mkdir m4 fi @@ -61,9 +83,9 @@ aclocal -I config || bail_out # --- Step 2: -echo "Running libtoolize" -libtoolize -f -c || glibtoolize -f -c || bail_out -libtoolize --automake || glibtoolize --automake || bail_out +echo "Running $LIBTOOLIZE" +$LIBTOOLIZE -f -c || bail_out +$LIBTOOLIZE --automake || bail_out # --- Step 3: Generate config.h.in from: # . 
configure.ac (look for AM_CONFIG_HEADER tag or AC_CONFIG_HEADER tag) diff --git a/ccmain/Makefile.am b/ccmain/Makefile.am index e82c0031..ddc2cb1b 100644 --- a/ccmain/Makefile.am +++ b/ccmain/Makefile.am @@ -1,12 +1,14 @@ AM_CPPFLAGS += \ -DUSE_STD_NAMESPACE \ -I$(top_srcdir)/ccutil -I$(top_srcdir)/ccstruct \ + -I$(top_srcdir)/arch -I$(top_srcdir)/lstm \ -I$(top_srcdir)/viewer \ -I$(top_srcdir)/classify -I$(top_srcdir)/dict \ -I$(top_srcdir)/wordrec -I$(top_srcdir)/cutil \ -I$(top_srcdir)/textord -I$(top_srcdir)/opencl AM_CPPFLAGS += $(OPENCL_CPPFLAGS) +AM_CPPFLAGS += $(OPENMP_CXXFLAGS) if VISIBILITY AM_CPPFLAGS += -DTESS_EXPORTS \ @@ -33,18 +35,18 @@ libtesseract_main_la_LIBADD = \ ../ccstruct/libtesseract_ccstruct.la \ ../viewer/libtesseract_viewer.la \ ../dict/libtesseract_dict.la \ + ../arch/libtesseract_avx.la \ + ../arch/libtesseract_sse.la \ + ../lstm/libtesseract_lstm.la \ ../classify/libtesseract_classify.la \ ../cutil/libtesseract_cutil.la \ ../opencl/libtesseract_opencl.la - if !NO_CUBE_BUILD - libtesseract_main_la_LIBADD += ../cube/libtesseract_cube.la - endif endif libtesseract_main_la_SOURCES = \ adaptions.cpp applybox.cpp control.cpp \ docqual.cpp equationdetect.cpp fixspace.cpp fixxht.cpp \ - ltrresultiterator.cpp \ + linerec.cpp ltrresultiterator.cpp \ osdetect.cpp output.cpp pageiterator.cpp pagesegmain.cpp \ pagewalk.cpp par_control.cpp paragraphs.cpp paramsd.cpp pgedit.cpp recogtraining.cpp \ reject.cpp resultiterator.cpp superscript.cpp \ @@ -52,12 +54,3 @@ libtesseract_main_la_SOURCES = \ tfacepp.cpp thresholder.cpp \ werdit.cpp -if !NO_CUBE_BUILD -AM_CPPFLAGS += \ - -I$(top_srcdir)/neural_networks/runtime -I$(top_srcdir)/cube -noinst_HEADERS += \ - cube_reco_context.h cubeclassifier.h tesseract_cube_combiner.h -libtesseract_main_la_SOURCES += \ - cube_control.cpp cube_reco_context.cpp cubeclassifier.cpp \ - tesseract_cube_combiner.cpp -endif diff --git a/ccmain/control.cpp b/ccmain/control.cpp index 3970c542..3206add9 100644 --- 
a/ccmain/control.cpp +++ b/ccmain/control.cpp @@ -1,8 +1,8 @@ /****************************************************************** * File: control.cpp (Formerly control.c) * Description: Module-independent matcher controller. - * Author: Ray Smith - * Created: Thu Apr 23 11:09:58 BST 1992 + * Author: Ray Smith + * Created: Thu Apr 23 11:09:58 BST 1992 * ReHacked: Tue Sep 22 08:42:49 BST 1992 Phil Cheatle * * (C) Copyright 1992, Hewlett-Packard Ltd. @@ -31,21 +31,22 @@ #include #endif #include -#include "ocrclass.h" -#include "werdit.h" +#include "callcpp.h" +#include "control.h" +#include "docqual.h" #include "drawfx.h" -#include "tessbox.h" -#include "tessvars.h" +#include "fixspace.h" +#include "globals.h" +#include "lstmrecognizer.h" +#include "ocrclass.h" +#include "output.h" #include "pgedit.h" #include "reject.h" -#include "fixspace.h" -#include "docqual.h" -#include "control.h" -#include "output.h" -#include "callcpp.h" -#include "globals.h" #include "sorthelper.h" +#include "tessbox.h" #include "tesseractclass.h" +#include "tessvars.h" +#include "werdit.h" #define MIN_FONT_ROW_COUNT 8 #define MAX_XHEIGHT_DIFF 3 @@ -73,7 +74,6 @@ void Tesseract::recog_pseudo_word(PAGE_RES* page_res, } } - /** * Recognize a single word in interactive mode. * @@ -85,7 +85,12 @@ BOOL8 Tesseract::recog_interactive(PAGE_RES_IT* pr_it) { WordData word_data(*pr_it); SetupWordPassN(2, &word_data); - classify_word_and_language(2, pr_it, &word_data); + // LSTM doesn't run on pass2, but we want to run pass2 for tesseract. 
+ if (lstm_recognizer_ == NULL) { + classify_word_and_language(2, pr_it, &word_data); + } else { + classify_word_and_language(1, pr_it, &word_data); + } if (tessedit_debug_quality_metrics) { WERD_RES* word_res = pr_it->word(); word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual); @@ -188,8 +193,8 @@ void Tesseract::SetupWordPassN(int pass_n, WordData* word) { WERD_RES* word_res = new WERD_RES; word_res->InitForRetryRecognition(*word->word); word->lang_words.push_back(word_res); - // Cube doesn't get setup for pass2. - if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_CUBE_ONLY) { + // LSTM doesn't get setup for pass2. + if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) { word_res->SetupForRecognition( lang_t->unicharset, lang_t, BestPix(), lang_t->tessedit_ocr_engine_mode, NULL, @@ -219,16 +224,14 @@ bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor, if (pass_n == 1) { monitor->progress = 70 * w / words->size(); if (monitor->progress_callback != NULL) { - TBOX box = pr_it->word()->word->bounding_box(); - (*monitor->progress_callback)(monitor->progress, - box.left(), box.right(), - box.top(), box.bottom()); + TBOX box = pr_it->word()->word->bounding_box(); + (*monitor->progress_callback)(monitor->progress, box.left(), + box.right(), box.top(), box.bottom()); } } else { monitor->progress = 70 + 30 * w / words->size(); - if (monitor->progress_callback!=NULL) { - (*monitor->progress_callback)(monitor->progress, - 0, 0, 0, 0); + if (monitor->progress_callback != NULL) { + (*monitor->progress_callback)(monitor->progress, 0, 0, 0, 0); } } if (monitor->deadline_exceeded() || @@ -253,7 +256,8 @@ bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor, pr_it->forward(); ASSERT_HOST(pr_it->word() != NULL); bool make_next_word_fuzzy = false; - if (ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) { + if (!AnyLSTMLang() && + ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) { // 
Needs to be setup again to see the new outlines in the chopped_word. SetupWordPassN(pass_n, word); } @@ -384,9 +388,8 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res, if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false; } - // The next passes can only be run if tesseract has been used, as cube - // doesn't set all the necessary outputs in WERD_RES. - if (AnyTessLang()) { + // The next passes are only required for Tess-only. + if (AnyTessLang() && !AnyLSTMLang()) { // ****************** Pass 3 ******************* // Fix fuzzy spaces. set_global_loc_code(LOC_FUZZY_SPACE); @@ -402,15 +405,6 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res, // ****************** Pass 5,6 ******************* rejection_passes(page_res, monitor, target_word_box, word_config); -#ifndef NO_CUBE_BUILD - // ****************** Pass 7 ******************* - // Cube combiner. - // If cube is loaded and its combiner is present, run it. - if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { - run_cube_combiner(page_res); - } -#endif - // ****************** Pass 8 ******************* font_recognition_pass(page_res); @@ -438,8 +432,13 @@ bool Tesseract::recog_all_words(PAGE_RES* page_res, for (page_res_it.restart_page(); page_res_it.word() != NULL; page_res_it.forward()) { WERD_RES* word = page_res_it.word(); - if (word->best_choice == NULL || word->best_choice->length() == 0) + POLY_BLOCK* pb = page_res_it.block()->block != NULL + ? page_res_it.block()->block->poly_block() + : NULL; + if (word->best_choice == NULL || word->best_choice->length() == 0 || + (word->best_choice->IsAllSpaces() && (pb == NULL || pb->IsText()))) { page_res_it.DeleteCurrentWord(); + } } if (monitor != NULL) { @@ -539,7 +538,7 @@ void Tesseract::bigram_correction_pass(PAGE_RES *page_res) { } } } - if (overrides_word1.size() >= 1) { + if (!overrides_word1.empty()) { // Excellent, we have some bigram matches. 
if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice, *overrides_word1[best_idx]) && @@ -755,16 +754,32 @@ void Tesseract::script_pos_pass(PAGE_RES* page_res) { } } -// Factored helper considers the indexed word and updates all the pointed -// values. -static void EvaluateWord(const PointerVector& words, int index, - float* rating, float* certainty, bool* bad, - bool* valid_permuter, int* right, int* next_left) { +// Helper finds the gap between the index word and the next. +static void WordGap(const PointerVector& words, int index, int* right, + int* next_left) { *right = -MAX_INT32; *next_left = MAX_INT32; if (index < words.size()) { + *right = words[index]->word->bounding_box().right(); + if (index + 1 < words.size()) + *next_left = words[index + 1]->word->bounding_box().left(); + } +} + +// Factored helper computes the rating, certainty, badness and validity of +// the permuter of the words in [first_index, end_index). +static void EvaluateWordSpan(const PointerVector& words, + int first_index, int end_index, float* rating, + float* certainty, bool* bad, + bool* valid_permuter) { + if (end_index <= first_index) { + *bad = true; + *valid_permuter = false; + } + for (int index = first_index; index < end_index && index < words.size(); + ++index) { WERD_CHOICE* choice = words[index]->best_choice; - if (choice == NULL) { + if (choice == nullptr) { *bad = true; } else { *rating += choice->rating(); @@ -772,12 +787,6 @@ static void EvaluateWord(const PointerVector& words, int index, if (!Dict::valid_word_permuter(choice->permuter(), false)) *valid_permuter = false; } - *right = words[index]->word->bounding_box().right(); - if (index + 1 < words.size()) - *next_left = words[index + 1]->word->bounding_box().left(); - } else { - *valid_permuter = false; - *bad = true; } } @@ -802,24 +811,13 @@ static int SelectBestWords(double rating_ratio, while (b < best_words->size() || n < new_words->size()) { // Start of the current run in each. 
int start_b = b, start_n = n; - // Rating of the current run in each. - float b_rating = 0.0f, n_rating = 0.0f; - // Certainty of the current run in each. - float b_certainty = 0.0f, n_certainty = 0.0f; - // True if any word is missing its best choice. - bool b_bad = false, n_bad = false; - // True if all words have a valid permuter. - bool b_valid_permuter = true, n_valid_permuter = true; - while (b < best_words->size() || n < new_words->size()) { int b_right = -MAX_INT32; int next_b_left = MAX_INT32; - EvaluateWord(*best_words, b, &b_rating, &b_certainty, &b_bad, - &b_valid_permuter, &b_right, &next_b_left); + WordGap(*best_words, b, &b_right, &next_b_left); int n_right = -MAX_INT32; int next_n_left = MAX_INT32; - EvaluateWord(*new_words, n, &n_rating, &n_certainty, &n_bad, - &n_valid_permuter, &n_right, &next_n_left); + WordGap(*new_words, n, &n_right, &next_n_left); if (MAX(b_right, n_right) < MIN(next_b_left, next_n_left)) { // The word breaks overlap. [start_b,b] and [start_n, n] match. break; @@ -831,6 +829,20 @@ static int SelectBestWords(double rating_ratio, else ++n; } + // Rating of the current run in each. + float b_rating = 0.0f, n_rating = 0.0f; + // Certainty of the current run in each. + float b_certainty = 0.0f, n_certainty = 0.0f; + // True if any word is missing its best choice. + bool b_bad = false, n_bad = false; + // True if all words have a valid permuter. + bool b_valid_permuter = true, n_valid_permuter = true; + int end_b = b < best_words->size() ? b + 1 : b; + int end_n = n < new_words->size() ? 
n + 1 : n; + EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty, + &b_bad, &b_valid_permuter); + EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty, + &n_bad, &n_valid_permuter); bool new_better = false; if (!n_bad && (b_bad || (n_certainty > b_certainty && n_rating < b_rating) || @@ -838,7 +850,7 @@ static int SelectBestWords(double rating_ratio, n_rating < b_rating * rating_ratio && n_certainty > b_certainty - certainty_margin))) { // New is better. - for (int i = start_n; i <= n; ++i) { + for (int i = start_n; i < end_n; ++i) { out_words.push_back((*new_words)[i]); (*new_words)[i] = NULL; ++num_new; @@ -846,14 +858,12 @@ static int SelectBestWords(double rating_ratio, new_better = true; } else if (!b_bad) { // Current best is better. - for (int i = start_b; i <= b; ++i) { + for (int i = start_b; i < end_b; ++i) { out_words.push_back((*best_words)[i]); (*best_words)[i] = NULL; ++num_best; } } - int end_b = b < best_words->size() ? b + 1 : b; - int end_n = n < new_words->size() ? n + 1 : n; if (debug) { tprintf("%d new words %s than %d old words: r: %g v %g c: %g v %g" " valid dict: %d v %d\n", @@ -876,10 +886,9 @@ static int SelectBestWords(double rating_ratio, // Returns positive if this recognizer found more new best words than the // number kept from best_words. int Tesseract::RetryWithLanguage(const WordData& word_data, - WordRecognizer recognizer, + WordRecognizer recognizer, bool debug, WERD_RES** in_word, PointerVector* best_words) { - bool debug = classify_debug_level || cube_debug_level; if (debug) { tprintf("Trying word using lang %s, oem %d\n", lang.string(), static_cast(tessedit_ocr_engine_mode)); @@ -898,8 +907,7 @@ int Tesseract::RetryWithLanguage(const WordData& word_data, new_words[i]->DebugTopChoice("Lang result"); } // Initial version is a bit of a hack based on better certainty and rating - // (to reduce false positives from cube) or a dictionary vs non-dictionary - // word. 
+ // or a dictionary vs non-dictionary word. return SelectBestWords(classify_max_rating_ratio, classify_max_certainty_margin, debug, &new_words, best_words); @@ -1283,7 +1291,8 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it, // Points to the best result. May be word or in lang_words. WERD_RES* word = word_data->word; clock_t start_t = clock(); - if (classify_debug_level || cube_debug_level) { + bool debug = classify_debug_level > 0 || multilang_debug_level > 0; + if (debug) { tprintf("%s word with lang %s at:", word->done ? "Already done" : "Processing", most_recently_used_->lang.string()); @@ -1302,12 +1311,12 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it, most_recently_used_ != sub_langs_[sub]; ++sub) {} } most_recently_used_->RetryWithLanguage( - *word_data, recognizer, &word_data->lang_words[sub], &best_words); + *word_data, recognizer, debug, &word_data->lang_words[sub], &best_words); Tesseract* best_lang_tess = most_recently_used_; if (!WordsAcceptable(best_words)) { // Try all the other languages to see if they are any better. if (most_recently_used_ != this && - this->RetryWithLanguage(*word_data, recognizer, + this->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[sub_langs_.size()], &best_words) > 0) { best_lang_tess = this; @@ -1315,7 +1324,7 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT* pr_it, for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size(); ++i) { if (most_recently_used_ != sub_langs_[i] && - sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, + sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug, &word_data->lang_words[i], &best_words) > 0) { best_lang_tess = sub_langs_[i]; @@ -1357,11 +1366,25 @@ void Tesseract::classify_word_pass1(const WordData& word_data, BLOCK* block = word_data.block; prev_word_best_choice_ = word_data.prev_word != NULL ? 
word_data.prev_word->word->best_choice : NULL; -#ifndef NO_CUBE_BUILD - // If we only intend to run cube - run it and return. - if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { - cube_word_pass1(block, row, *in_word); - return; +#ifndef ANDROID_BUILD + if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY || + tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) { + if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { + LSTMRecognizeWord(*block, row, *in_word, out_words); + if (!out_words->empty()) + return; // Successful lstm recognition. + } + if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { + // No fallback allowed, so use a fake. + (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset()); + return; + } + // Fall back to tesseract for failed words or odd words. + (*in_word)->SetupForRecognition(unicharset, this, BestPix(), + OEM_TESSERACT_ONLY, NULL, + classify_bln_numeric_mode, + textord_use_cjk_fp_model, + poly_allow_detailed_fx, row, block); } #endif WERD_RES* word = *in_word; @@ -1497,11 +1520,7 @@ void Tesseract::classify_word_pass2(const WordData& word_data, WERD_RES** in_word, PointerVector* out_words) { // Return if we do not want to run Tesseract. - if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY && - tessedit_ocr_engine_mode != OEM_TESSERACT_CUBE_COMBINED && - word_data.word->best_choice != NULL) - return; - if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { + if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { return; } ROW* row = word_data.row; @@ -1886,7 +1905,7 @@ static void find_modal_font( //good chars in word * Get the fonts for the word. */ void Tesseract::set_word_fonts(WERD_RES *word) { - // Don't try to set the word fonts for a cube word, as the configs + // Don't try to set the word fonts for an lstm word, as the configs // will be meaningless. 
if (word->chopped_word == NULL) return; ASSERT_HOST(word->best_choice != NULL); diff --git a/ccmain/cube_control.cpp b/ccmain/cube_control.cpp deleted file mode 100644 index 1430debc..00000000 --- a/ccmain/cube_control.cpp +++ /dev/null @@ -1,432 +0,0 @@ -/****************************************************************** - * File: cube_control.cpp - * Description: Tesseract class methods for invoking cube convolutional - * neural network word recognizer. - * Author: Raquel Romano - * Created: September 2009 - * - **********************************************************************/ - -// Include automatically generated configuration file if running autoconf. -#ifdef HAVE_CONFIG_H -#include "config_auto.h" -#endif - -#include "allheaders.h" - -#include "cube_object.h" -#include "cube_reco_context.h" -#include "tesseractclass.h" -#include "tesseract_cube_combiner.h" - -namespace tesseract { - -/** - * @name convert_prob_to_tess_certainty - * - * Normalize a probability in the range [0.0, 1.0] to a tesseract - * certainty in the range [-20.0, 0.0] - */ -static float convert_prob_to_tess_certainty(float prob) { - return (prob - 1.0) * 20.0; -} - -/** - * @name char_box_to_tbox - * - * Create a TBOX from a character bounding box. If nonzero, the - * x_offset accounts for any additional padding of the word box that - * should be taken into account. - * - */ -TBOX char_box_to_tbox(Box* char_box, TBOX word_box, int x_offset) { - l_int32 left; - l_int32 top; - l_int32 width; - l_int32 height; - l_int32 right; - l_int32 bottom; - - boxGetGeometry(char_box, &left, &top, &width, &height); - left += word_box.left() - x_offset; - right = left + width; - top = word_box.bottom() + word_box.height() - top; - bottom = top - height; - return TBOX(left, bottom, right, top); -} - -/** - * @name extract_cube_state - * - * Extract CharSamp objects and character bounding boxes from the - * CubeObject's state. The caller should free both structres. 
- * - */ -bool Tesseract::extract_cube_state(CubeObject* cube_obj, - int* num_chars, - Boxa** char_boxes, - CharSamp*** char_samples) { - if (!cube_obj) { - if (cube_debug_level > 0) { - tprintf("Cube WARNING (extract_cube_state): Invalid cube object " - "passed to extract_cube_state\n"); - } - return false; - } - - // Note that the CubeObject accessors return either the deslanted or - // regular objects search object or beam search object, whichever - // was used in the last call to Recognize() - CubeSearchObject* cube_search_obj = cube_obj->SrchObj(); - if (!cube_search_obj) { - if (cube_debug_level > 0) { - tprintf("Cube WARNING (Extract_cube_state): Could not retrieve " - "cube's search object in extract_cube_state.\n"); - } - return false; - } - BeamSearch *beam_search_obj = cube_obj->BeamObj(); - if (!beam_search_obj) { - if (cube_debug_level > 0) { - tprintf("Cube WARNING (Extract_cube_state): Could not retrieve " - "cube's beam search object in extract_cube_state.\n"); - } - return false; - } - - // Get the character samples and bounding boxes by backtracking - // through the beam search path - int best_node_index = beam_search_obj->BestPresortedNodeIndex(); - *char_samples = beam_search_obj->BackTrack( - cube_search_obj, best_node_index, num_chars, NULL, char_boxes); - if (!*char_samples) - return false; - return true; -} - -/** - * @name create_cube_box_word - * - * Fill the given BoxWord with boxes from character bounding - * boxes. The char_boxes have local coordinates w.r.t. the - * word bounding box, i.e., the left-most character bbox of each word - * has (0,0) left-top coord, but the BoxWord must be defined in page - * coordinates. 
- */ -bool Tesseract::create_cube_box_word(Boxa *char_boxes, - int num_chars, - TBOX word_box, - BoxWord* box_word) { - if (!box_word) { - if (cube_debug_level > 0) { - tprintf("Cube WARNING (create_cube_box_word): Invalid box_word.\n"); - } - return false; - } - - // Find the x-coordinate of left-most char_box, which could be - // nonzero if the word image was padded before recognition took place. - int x_offset = -1; - for (int i = 0; i < num_chars; ++i) { - Box* char_box = boxaGetBox(char_boxes, i, L_CLONE); - if (x_offset < 0 || char_box->x < x_offset) { - x_offset = char_box->x; - } - boxDestroy(&char_box); - } - - for (int i = 0; i < num_chars; ++i) { - Box* char_box = boxaGetBox(char_boxes, i, L_CLONE); - TBOX tbox = char_box_to_tbox(char_box, word_box, x_offset); - boxDestroy(&char_box); - box_word->InsertBox(i, tbox); - } - return true; -} - -/** - * @name init_cube_objects - * - * Instantiates Tesseract object's CubeRecoContext and TesseractCubeCombiner. - * Returns false if cube context could not be created or if load_combiner is - * true, but the combiner could not be loaded. - */ -bool Tesseract::init_cube_objects(bool load_combiner, - TessdataManager *tessdata_manager) { - ASSERT_HOST(cube_cntxt_ == NULL); - ASSERT_HOST(tess_cube_combiner_ == NULL); - - // Create the cube context object - cube_cntxt_ = CubeRecoContext::Create(this, tessdata_manager, &unicharset); - if (cube_cntxt_ == NULL) { - if (cube_debug_level > 0) { - tprintf("Cube WARNING (Tesseract::init_cube_objects()): Failed to " - "instantiate CubeRecoContext\n"); - } - return false; - } - - // Create the combiner object and load the combiner net for target languages. 
- if (load_combiner) { - tess_cube_combiner_ = new tesseract::TesseractCubeCombiner(cube_cntxt_); - if (!tess_cube_combiner_ || !tess_cube_combiner_->LoadCombinerNet()) { - delete cube_cntxt_; - cube_cntxt_ = NULL; - if (tess_cube_combiner_ != NULL) { - delete tess_cube_combiner_; - tess_cube_combiner_ = NULL; - } - if (cube_debug_level > 0) - tprintf("Cube ERROR (Failed to instantiate TesseractCubeCombiner\n"); - return false; - } - } - return true; -} - -/** - * @name run_cube_combiner - * - * Iterates through tesseract's results and calls cube on each word, - * combining the results with the existing tesseract result. - */ -void Tesseract::run_cube_combiner(PAGE_RES *page_res) { - if (page_res == NULL || tess_cube_combiner_ == NULL) - return; - PAGE_RES_IT page_res_it(page_res); - // Iterate through the word results and call cube on each word. - for (page_res_it.restart_page(); page_res_it.word () != NULL; - page_res_it.forward()) { - BLOCK* block = page_res_it.block()->block; - if (block->poly_block() != NULL && !block->poly_block()->IsText()) - continue; // Don't deal with non-text blocks. - WERD_RES* word = page_res_it.word(); - // Skip cube entirely if tesseract's certainty is greater than threshold. - int combiner_run_thresh = convert_prob_to_tess_certainty( - cube_cntxt_->Params()->CombinerRunThresh()); - if (word->best_choice->certainty() >= combiner_run_thresh) { - continue; - } - // Use the same language as Tesseract used for the word. - Tesseract* lang_tess = word->tesseract; - - // Setup a trial WERD_RES in which to classify with cube. 
- WERD_RES cube_word; - cube_word.InitForRetryRecognition(*word); - cube_word.SetupForRecognition(lang_tess->unicharset, this, BestPix(), - OEM_CUBE_ONLY, - NULL, false, false, false, - page_res_it.row()->row, - page_res_it.block()->block); - CubeObject *cube_obj = lang_tess->cube_recognize_word( - page_res_it.block()->block, &cube_word); - if (cube_obj != NULL) - lang_tess->cube_combine_word(cube_obj, &cube_word, word); - delete cube_obj; - } -} - -/** - * @name cube_word_pass1 - * - * Recognizes a single word using (only) cube. Compatible with - * Tesseract's classify_word_pass1/classify_word_pass2. - */ -void Tesseract::cube_word_pass1(BLOCK* block, ROW *row, WERD_RES *word) { - CubeObject *cube_obj = cube_recognize_word(block, word); - delete cube_obj; -} - -/** - * @name cube_recognize_word - * - * Cube recognizer to recognize a single word as with classify_word_pass1 - * but also returns the cube object in case the combiner is needed. - */ -CubeObject* Tesseract::cube_recognize_word(BLOCK* block, WERD_RES* word) { - if (!cube_binary_ || !cube_cntxt_) { - if (cube_debug_level > 0 && !cube_binary_) - tprintf("Tesseract::run_cube(): NULL binary image.\n"); - word->SetupFake(unicharset); - return NULL; - } - TBOX word_box = word->word->bounding_box(); - if (block != NULL && (block->re_rotation().x() != 1.0f || - block->re_rotation().y() != 0.0f)) { - // TODO(rays) We have to rotate the bounding box to get the true coords. - // This will be achieved in the future via DENORM. - // In the mean time, cube can't process this word. 
- if (cube_debug_level > 0) { - tprintf("Cube can't process rotated word at:"); - word_box.print(); - } - word->SetupFake(unicharset); - return NULL; - } - CubeObject* cube_obj = new tesseract::CubeObject( - cube_cntxt_, cube_binary_, word_box.left(), - pixGetHeight(cube_binary_) - word_box.top(), - word_box.width(), word_box.height()); - if (!cube_recognize(cube_obj, block, word)) { - delete cube_obj; - return NULL; - } - return cube_obj; -} - -/** - * @name cube_combine_word - * - * Combines the cube and tesseract results for a single word, leaving the - * result in tess_word. - */ -void Tesseract::cube_combine_word(CubeObject* cube_obj, WERD_RES* cube_word, - WERD_RES* tess_word) { - float combiner_prob = tess_cube_combiner_->CombineResults(tess_word, - cube_obj); - // If combiner probability is greater than tess/cube combiner - // classifier threshold, i.e. tesseract wins, then just return the - // tesseract result unchanged, as the combiner knows nothing about how - // correct the answer is. If cube and tesseract agree, then improve the - // scores before returning. - WERD_CHOICE* tess_best = tess_word->best_choice; - WERD_CHOICE* cube_best = cube_word->best_choice; - if (cube_debug_level || classify_debug_level) { - tprintf("Combiner prob = %g vs threshold %g\n", - combiner_prob, cube_cntxt_->Params()->CombinerClassifierThresh()); - } - if (combiner_prob >= - cube_cntxt_->Params()->CombinerClassifierThresh()) { - if (tess_best->unichar_string() == cube_best->unichar_string()) { - // Cube and tess agree, so improve the scores. - tess_best->set_rating(tess_best->rating() / 2); - tess_best->set_certainty(tess_best->certainty() / 2); - } - return; - } - // Cube wins. - // It is better for the language combiner to have all tesseract scores, - // so put them in the cube result. 
- cube_best->set_rating(tess_best->rating()); - cube_best->set_certainty(tess_best->certainty()); - if (cube_debug_level || classify_debug_level) { - tprintf("Cube INFO: tesseract result replaced by cube: %s -> %s\n", - tess_best->unichar_string().string(), - cube_best->unichar_string().string()); - } - tess_word->ConsumeWordResults(cube_word); -} - -/** - * @name cube_recognize - * - * Call cube on the current word, and write the result to word. - * Sets up a fake result and returns false if something goes wrong. - */ -bool Tesseract::cube_recognize(CubeObject *cube_obj, BLOCK* block, - WERD_RES *word) { - // Run cube - WordAltList *cube_alt_list = cube_obj->RecognizeWord(); - if (!cube_alt_list || cube_alt_list->AltCount() <= 0) { - if (cube_debug_level > 0) { - tprintf("Cube returned nothing for word at:"); - word->word->bounding_box().print(); - } - word->SetupFake(unicharset); - return false; - } - - // Get cube's best result and its probability, mapped to tesseract's - // certainty range - char_32 *cube_best_32 = cube_alt_list->Alt(0); - double cube_prob = CubeUtils::Cost2Prob(cube_alt_list->AltCost(0)); - float cube_certainty = convert_prob_to_tess_certainty(cube_prob); - string cube_best_str; - CubeUtils::UTF32ToUTF8(cube_best_32, &cube_best_str); - - // Retrieve Cube's character bounding boxes and CharSamples, - // corresponding to the most recent call to RecognizeWord(). - Boxa *char_boxes = NULL; - CharSamp **char_samples = NULL;; - int num_chars; - if (!extract_cube_state(cube_obj, &num_chars, &char_boxes, &char_samples) - && cube_debug_level > 0) { - tprintf("Cube WARNING (Tesseract::cube_recognize): Cannot extract " - "cube state.\n"); - word->SetupFake(unicharset); - return false; - } - - // Convert cube's character bounding boxes to a BoxWord. 
- BoxWord cube_box_word; - TBOX tess_word_box = word->word->bounding_box(); - if (word->denorm.block() != NULL) - tess_word_box.rotate(word->denorm.block()->re_rotation()); - bool box_word_success = create_cube_box_word(char_boxes, num_chars, - tess_word_box, - &cube_box_word); - boxaDestroy(&char_boxes); - if (!box_word_success) { - if (cube_debug_level > 0) { - tprintf("Cube WARNING (Tesseract::cube_recognize): Could not " - "create cube BoxWord\n"); - } - word->SetupFake(unicharset); - return false; - } - - // Fill tesseract result's fields with cube results - fill_werd_res(cube_box_word, cube_best_str.c_str(), word); - - // Create cube's best choice. - BLOB_CHOICE** choices = new BLOB_CHOICE*[num_chars]; - for (int i = 0; i < num_chars; ++i) { - UNICHAR_ID uch_id = - cube_cntxt_->CharacterSet()->UnicharID(char_samples[i]->StrLabel()); - choices[i] = new BLOB_CHOICE(uch_id, -cube_certainty, cube_certainty, - -1, 0.0f, 0.0f, 0.0f, BCC_STATIC_CLASSIFIER); - } - word->FakeClassifyWord(num_chars, choices); - // within a word, cube recognizes the word in reading order. - word->best_choice->set_unichars_in_script_order(true); - delete [] choices; - delete [] char_samples; - - // Some sanity checks - ASSERT_HOST(word->best_choice->length() == word->reject_map.length()); - - if (cube_debug_level || classify_debug_level) { - tprintf("Cube result: %s r=%g, c=%g\n", - word->best_choice->unichar_string().string(), - word->best_choice->rating(), - word->best_choice->certainty()); - } - return true; -} - -/** - * @name fill_werd_res - * - * Fill Tesseract's word result fields with cube's. 
- * - */ -void Tesseract::fill_werd_res(const BoxWord& cube_box_word, - const char* cube_best_str, - WERD_RES* tess_werd_res) { - delete tess_werd_res->box_word; - tess_werd_res->box_word = new BoxWord(cube_box_word); - tess_werd_res->box_word->ClipToOriginalWord(tess_werd_res->denorm.block(), - tess_werd_res->word); - // Fill text and remaining fields - tess_werd_res->word->set_text(cube_best_str); - tess_werd_res->tess_failed = FALSE; - tess_werd_res->tess_accepted = tess_acceptable_word(tess_werd_res); - // There is no output word, so we can' call AdaptableWord, but then I don't - // think we need to. Fudge the result with accepted. - tess_werd_res->tess_would_adapt = tess_werd_res->tess_accepted; - - // Set word to done, i.e., ignore all of tesseract's tests for rejection - tess_werd_res->done = tess_werd_res->tess_accepted; -} - -} // namespace tesseract diff --git a/ccmain/cube_reco_context.cpp b/ccmain/cube_reco_context.cpp deleted file mode 100644 index fed53f09..00000000 --- a/ccmain/cube_reco_context.cpp +++ /dev/null @@ -1,208 +0,0 @@ -/********************************************************************** - * File: cube_reco_context.cpp - * Description: Implementation of the Cube Recognition Context Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -#include -#include - -#include "cube_reco_context.h" - -#include "classifier_factory.h" -#include "cube_tuning_params.h" -#include "dict.h" -#include "feature_bmp.h" -#include "tessdatamanager.h" -#include "tesseractclass.h" -#include "tess_lang_model.h" - -namespace tesseract { - -/** - * Instantiate a CubeRecoContext object using a Tesseract object. - * CubeRecoContext will not take ownership of tess_obj, but will - * record the pointer to it and will make use of various Tesseract - * components (language model, flags, etc). Thus the caller should - * keep tess_obj alive so long as the instantiated CubeRecoContext is used. - */ -CubeRecoContext::CubeRecoContext(Tesseract *tess_obj) { - tess_obj_ = tess_obj; - lang_ = ""; - loaded_ = false; - lang_mod_ = NULL; - params_ = NULL; - char_classifier_ = NULL; - char_set_ = NULL; - word_size_model_ = NULL; - char_bigrams_ = NULL; - word_unigrams_ = NULL; - noisy_input_ = false; - size_normalization_ = false; -} - -CubeRecoContext::~CubeRecoContext() { - if (char_classifier_ != NULL) { - delete char_classifier_; - char_classifier_ = NULL; - } - - if (word_size_model_ != NULL) { - delete word_size_model_; - word_size_model_ = NULL; - } - - if (char_set_ != NULL) { - delete char_set_; - char_set_ = NULL; - } - - if (char_bigrams_ != NULL) { - delete char_bigrams_; - char_bigrams_ = NULL; - } - - if (word_unigrams_ != NULL) { - delete word_unigrams_; - word_unigrams_ = NULL; - } - - if (lang_mod_ != NULL) { - delete lang_mod_; - lang_mod_ = NULL; - } - - if (params_ != NULL) { - delete params_; - params_ = NULL; - } -} - -/** - * Returns the path of the data files by looking up the TESSDATA_PREFIX - * environment variable and appending a "tessdata" directory to it - */ -bool CubeRecoContext::GetDataFilePath(string *path) const { - *path = tess_obj_->datadir.string(); - return true; -} - -/** - * The object initialization function that loads all 
the necessary - * components of a RecoContext. TessdataManager is used to load the - * data from [lang].traineddata file. If TESSDATA_CUBE_UNICHARSET - * component is present, Cube will be instantiated with the unicharset - * specified in this component and the corresponding dictionary - * (TESSDATA_CUBE_SYSTEM_DAWG), and will map Cube's unicharset to - * Tesseract's. Otherwise, TessdataManager will assume that Cube will - * be using Tesseract's unicharset and dawgs, and will load the - * unicharset from the TESSDATA_UNICHARSET component and will load the - * dawgs from TESSDATA_*_DAWG components. - */ -bool CubeRecoContext::Load(TessdataManager *tessdata_manager, - UNICHARSET *tess_unicharset) { - ASSERT_HOST(tess_obj_ != NULL); - tess_unicharset_ = tess_unicharset; - string data_file_path; - - // Get the data file path. - if (GetDataFilePath(&data_file_path) == false) { - fprintf(stderr, "Unable to get data file path\n"); - return false; - } - - // Get the language from the Tesseract object. - lang_ = tess_obj_->lang.string(); - - // Create the char set. - if ((char_set_ = - CharSet::Create(tessdata_manager, tess_unicharset)) == NULL) { - fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to load " - "CharSet\n"); - return false; - } - // Create the language model. - string lm_file_name = data_file_path + lang_ + ".cube.lm"; - string lm_params; - if (!CubeUtils::ReadFileToString(lm_file_name, &lm_params)) { - fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to read cube " - "language model params from %s\n", lm_file_name.c_str()); - return false; - } - lang_mod_ = new TessLangModel(lm_params, data_file_path, - tess_obj_->getDict().load_system_dawg, - tessdata_manager, this); - if (lang_mod_ == NULL) { - fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to create " - "TessLangModel\n"); - return false; - } - - // Create the optional char bigrams object. 
- char_bigrams_ = CharBigrams::Create(data_file_path, lang_); - - // Create the optional word unigrams object. - word_unigrams_ = WordUnigrams::Create(data_file_path, lang_); - - // Create the optional size model. - word_size_model_ = WordSizeModel::Create(data_file_path, lang_, - char_set_, Contextual()); - - // Load tuning params. - params_ = CubeTuningParams::Create(data_file_path, lang_); - if (params_ == NULL) { - fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to read " - "CubeTuningParams from %s\n", data_file_path.c_str()); - return false; - } - - // Create the char classifier. - char_classifier_ = CharClassifierFactory::Create(data_file_path, lang_, - lang_mod_, char_set_, - params_); - if (char_classifier_ == NULL) { - fprintf(stderr, "Cube ERROR (CubeRecoContext::Load): unable to load " - "CharClassifierFactory object from %s\n", data_file_path.c_str()); - return false; - } - - loaded_ = true; - - return true; -} - -/** Creates a CubeRecoContext object using a tesseract object */ -CubeRecoContext * CubeRecoContext::Create(Tesseract *tess_obj, - TessdataManager *tessdata_manager, - UNICHARSET *tess_unicharset) { - // create the object - CubeRecoContext *cntxt = new CubeRecoContext(tess_obj); - if (cntxt == NULL) { - fprintf(stderr, "Cube ERROR (CubeRecoContext::Create): unable to create " - "CubeRecoContext object\n"); - return NULL; - } - // load the necessary components - if (cntxt->Load(tessdata_manager, tess_unicharset) == false) { - fprintf(stderr, "Cube ERROR (CubeRecoContext::Create): unable to init " - "CubeRecoContext object\n"); - delete cntxt; - return NULL; - } - // success - return cntxt; -} -} // tesseract} diff --git a/ccmain/cube_reco_context.h b/ccmain/cube_reco_context.h deleted file mode 100644 index 9b69b8d1..00000000 --- a/ccmain/cube_reco_context.h +++ /dev/null @@ -1,157 +0,0 @@ -/********************************************************************** - * File: cube_reco_context.h - * Description: Declaration of the Cube 
Recognition Context Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The CubeRecoContext class abstracts the Cube OCR Engine. Typically a process -// (or a thread) would create one CubeRecoContext object per language. -// The CubeRecoContext object also provides methods to get and set the -// different attribues of the Cube OCR Engine. 
- -#ifndef CUBE_RECO_CONTEXT_H -#define CUBE_RECO_CONTEXT_H - -#include -#include "neural_net.h" -#include "lang_model.h" -#include "classifier_base.h" -#include "feature_base.h" -#include "char_set.h" -#include "word_size_model.h" -#include "char_bigrams.h" -#include "word_unigrams.h" - -namespace tesseract { - -class Tesseract; -class TessdataManager; - -class CubeRecoContext { - public: - // Reading order enum type - enum ReadOrder { - L2R, - R2L - }; - - // Instantiate using a Tesseract object - CubeRecoContext(Tesseract *tess_obj); - - ~CubeRecoContext(); - - // accessor functions - inline const string & Lang() const { return lang_; } - inline CharSet *CharacterSet() const { return char_set_; } - const UNICHARSET *TessUnicharset() const { return tess_unicharset_; } - inline CharClassifier *Classifier() const { return char_classifier_; } - inline WordSizeModel *SizeModel() const { return word_size_model_; } - inline CharBigrams *Bigrams() const { return char_bigrams_; } - inline WordUnigrams *WordUnigramsObj() const { return word_unigrams_; } - inline TuningParams *Params() const { return params_; } - inline LangModel *LangMod() const { return lang_mod_; } - - // the reading order of the language - inline ReadOrder ReadingOrder() const { - return ((lang_ == "ara") ? 
R2L : L2R); - } - - // does the language support case - inline bool HasCase() const { - return (lang_ != "ara" && lang_ != "hin"); - } - - inline bool Cursive() const { - return (lang_ == "ara"); - } - - inline bool HasItalics() const { - return (lang_ != "ara" && lang_ != "hin"); - } - - inline bool Contextual() const { - return (lang_ == "ara"); - } - - // RecoContext runtime flags accessor functions - inline bool SizeNormalization() const { return size_normalization_; } - inline bool NoisyInput() const { return noisy_input_; } - inline bool OOD() const { return lang_mod_->OOD(); } - inline bool Numeric() const { return lang_mod_->Numeric(); } - inline bool WordList() const { return lang_mod_->WordList(); } - inline bool Punc() const { return lang_mod_->Punc(); } - inline bool CaseSensitive() const { - return char_classifier_->CaseSensitive(); - } - - inline void SetSizeNormalization(bool size_normalization) { - size_normalization_ = size_normalization; - } - inline void SetNoisyInput(bool noisy_input) { - noisy_input_ = noisy_input; - } - inline void SetOOD(bool ood_enabled) { - lang_mod_->SetOOD(ood_enabled); - } - inline void SetNumeric(bool numeric_enabled) { - lang_mod_->SetNumeric(numeric_enabled); - } - inline void SetWordList(bool word_list_enabled) { - lang_mod_->SetWordList(word_list_enabled); - } - inline void SetPunc(bool punc_enabled) { - lang_mod_->SetPunc(punc_enabled); - } - inline void SetCaseSensitive(bool case_sensitive) { - char_classifier_->SetCaseSensitive(case_sensitive); - } - inline tesseract::Tesseract *TesseractObject() const { - return tess_obj_; - } - - // Returns the path of the data files - bool GetDataFilePath(string *path) const; - // Creates a CubeRecoContext object using a tesseract object. Data - // files are loaded via the tessdata_manager, and the tesseract - // unicharset is provided in order to map Cube's unicharset to - // Tesseract's in the case where the two unicharsets differ. 
- static CubeRecoContext *Create(Tesseract *tess_obj, - TessdataManager *tessdata_manager, - UNICHARSET *tess_unicharset); - - private: - bool loaded_; - string lang_; - CharSet *char_set_; - UNICHARSET *tess_unicharset_; - WordSizeModel *word_size_model_; - CharClassifier *char_classifier_; - CharBigrams *char_bigrams_; - WordUnigrams *word_unigrams_; - TuningParams *params_; - LangModel *lang_mod_; - Tesseract *tess_obj_; // CubeRecoContext does not own this pointer - bool size_normalization_; - bool noisy_input_; - - // Loads and initialized all the necessary components of a - // CubeRecoContext. See .cpp for more details. - bool Load(TessdataManager *tessdata_manager, - UNICHARSET *tess_unicharset); -}; -} - -#endif // CUBE_RECO_CONTEXT_H diff --git a/ccmain/cubeclassifier.cpp b/ccmain/cubeclassifier.cpp deleted file mode 100644 index 5cc892db..00000000 --- a/ccmain/cubeclassifier.cpp +++ /dev/null @@ -1,134 +0,0 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// Author: rays@google.com (Ray Smith) -/////////////////////////////////////////////////////////////////////// -// File: cubeclassifier.cpp -// Description: Cube implementation of a ShapeClassifier. -// Author: Ray Smith -// Created: Wed Nov 23 10:39:45 PST 2011 -// -// (C) Copyright 2011, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. 
-// -/////////////////////////////////////////////////////////////////////// - -#include "cubeclassifier.h" - -#include "char_altlist.h" -#include "char_set.h" -#include "cube_object.h" -#include "cube_reco_context.h" -#include "tessclassifier.h" -#include "tesseractclass.h" -#include "trainingsample.h" -#include "unicharset.h" - -namespace tesseract { - -CubeClassifier::CubeClassifier(tesseract::Tesseract* tesseract) - : cube_cntxt_(tesseract->GetCubeRecoContext()), - shape_table_(*tesseract->shape_table()) { -} -CubeClassifier::~CubeClassifier() { -} - -/// Classifies the given [training] sample, writing to results. -/// See ShapeClassifier for a full description. -int CubeClassifier::UnicharClassifySample( - const TrainingSample& sample, Pix* page_pix, int debug, - UNICHAR_ID keep_this, GenericVector* results) { - results->clear(); - if (page_pix == NULL) return 0; - - ASSERT_HOST(cube_cntxt_ != NULL); - const TBOX& char_box = sample.bounding_box(); - CubeObject* cube_obj = new tesseract::CubeObject( - cube_cntxt_, page_pix, char_box.left(), - pixGetHeight(page_pix) - char_box.top(), - char_box.width(), char_box.height()); - CharAltList* alt_list = cube_obj->RecognizeChar(); - if (alt_list != NULL) { - alt_list->Sort(); - CharSet* char_set = cube_cntxt_->CharacterSet(); - for (int i = 0; i < alt_list->AltCount(); ++i) { - // Convert cube representation to a shape_id. - int alt_id = alt_list->Alt(i); - int unichar_id = char_set->UnicharID(char_set->ClassString(alt_id)); - if (unichar_id >= 0) - results->push_back(UnicharRating(unichar_id, alt_list->AltProb(i))); - } - delete alt_list; - } - delete cube_obj; - return results->size(); -} - -/** Provides access to the ShapeTable that this classifier works with. 
*/ -const ShapeTable* CubeClassifier::GetShapeTable() const { - return &shape_table_; -} - -CubeTessClassifier::CubeTessClassifier(tesseract::Tesseract* tesseract) - : cube_cntxt_(tesseract->GetCubeRecoContext()), - shape_table_(*tesseract->shape_table()), - pruner_(new TessClassifier(true, tesseract)) { -} -CubeTessClassifier::~CubeTessClassifier() { - delete pruner_; -} - -/// Classifies the given [training] sample, writing to results. -/// See ShapeClassifier for a full description. -int CubeTessClassifier::UnicharClassifySample( - const TrainingSample& sample, Pix* page_pix, int debug, - UNICHAR_ID keep_this, GenericVector* results) { - int num_results = pruner_->UnicharClassifySample(sample, page_pix, debug, - keep_this, results); - if (page_pix == NULL) return num_results; - - ASSERT_HOST(cube_cntxt_ != NULL); - const TBOX& char_box = sample.bounding_box(); - CubeObject* cube_obj = new tesseract::CubeObject( - cube_cntxt_, page_pix, char_box.left(), - pixGetHeight(page_pix) - char_box.top(), - char_box.width(), char_box.height()); - CharAltList* alt_list = cube_obj->RecognizeChar(); - CharSet* char_set = cube_cntxt_->CharacterSet(); - if (alt_list != NULL) { - for (int r = 0; r < num_results; ++r) { - // Get the best cube probability of the unichar in the result. - double best_prob = 0.0; - for (int i = 0; i < alt_list->AltCount(); ++i) { - int alt_id = alt_list->Alt(i); - int unichar_id = char_set->UnicharID(char_set->ClassString(alt_id)); - if (unichar_id == (*results)[r].unichar_id && - alt_list->AltProb(i) > best_prob) { - best_prob = alt_list->AltProb(i); - } - } - (*results)[r].rating = best_prob; - } - delete alt_list; - // Re-sort by rating. - results->sort(&UnicharRating::SortDescendingRating); - } - delete cube_obj; - return results->size(); -} - -/** Provides access to the ShapeTable that this classifier works with. 
*/ -const ShapeTable* CubeTessClassifier::GetShapeTable() const { - return &shape_table_; -} - -} // namespace tesseract - - - diff --git a/ccmain/cubeclassifier.h b/ccmain/cubeclassifier.h deleted file mode 100644 index 98bdb5cf..00000000 --- a/ccmain/cubeclassifier.h +++ /dev/null @@ -1,80 +0,0 @@ -// Copyright 2011 Google Inc. All Rights Reserved. -// Author: rays@google.com (Ray Smith) -/////////////////////////////////////////////////////////////////////// -// File: cubeclassifier.h -// Description: Cube implementation of a ShapeClassifier. -// Author: Ray Smith -// Created: Wed Nov 23 10:36:32 PST 2011 -// -// (C) Copyright 2011, Google Inc. -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. -// You may obtain a copy of the License at -// http://www.apache.org/licenses/LICENSE-2.0 -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. -// -/////////////////////////////////////////////////////////////////////// - -#ifndef THIRD_PARTY_TESSERACT_CCMAIN_CUBECLASSIFIER_H_ -#define THIRD_PARTY_TESSERACT_CCMAIN_CUBECLASSIFIER_H_ - -#include "shapeclassifier.h" - -namespace tesseract { - -class Classify; -class CubeRecoContext; -class ShapeTable; -class TessClassifier; -class Tesseract; -class TrainingSample; -struct UnicharRating; - -// Cube implementation of a ShapeClassifier. -class CubeClassifier : public ShapeClassifier { - public: - explicit CubeClassifier(Tesseract* tesseract); - virtual ~CubeClassifier(); - - // Classifies the given [training] sample, writing to results. - // See ShapeClassifier for a full description. 
- virtual int UnicharClassifySample(const TrainingSample& sample, Pix* page_pix, - int debug, UNICHAR_ID keep_this, - GenericVector* results); - // Provides access to the ShapeTable that this classifier works with. - virtual const ShapeTable* GetShapeTable() const; - - private: - // Cube objects. - CubeRecoContext* cube_cntxt_; - const ShapeTable& shape_table_; -}; - -// Combination of Tesseract class pruner with scoring by cube. -class CubeTessClassifier : public ShapeClassifier { - public: - explicit CubeTessClassifier(Tesseract* tesseract); - virtual ~CubeTessClassifier(); - - // Classifies the given [training] sample, writing to results. - // See ShapeClassifier for a full description. - virtual int UnicharClassifySample(const TrainingSample& sample, Pix* page_pix, - int debug, UNICHAR_ID keep_this, - GenericVector* results); - // Provides access to the ShapeTable that this classifier works with. - virtual const ShapeTable* GetShapeTable() const; - - private: - // Cube objects. - CubeRecoContext* cube_cntxt_; - const ShapeTable& shape_table_; - TessClassifier* pruner_; -}; - -} // namespace tesseract - -#endif /* THIRD_PARTY_TESSERACT_CCMAIN_CUBECLASSIFIER_H_ */ diff --git a/ccmain/docqual.cpp b/ccmain/docqual.cpp index c6e7f17e..4706fb3b 100644 --- a/ccmain/docqual.cpp +++ b/ccmain/docqual.cpp @@ -1,8 +1,8 @@ /****************************************************************** * File: docqual.cpp (Formerly docqual.c) * Description: Document Quality Metrics - * Author: Phil Cheatle - * Created: Mon May 9 11:27:28 BST 1994 + * Author: Phil Cheatle + * Created: Mon May 9 11:27:28 BST 1994 * * (C) Copyright 1994, Hewlett-Packard Ltd. 
** Licensed under the Apache License, Version 2.0 (the "License"); @@ -98,8 +98,8 @@ void Tesseract::word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count) { - if (word->bln_boxes == NULL || - word->rebuild_word == NULL || word->rebuild_word->blobs.empty()) { + if (word->bln_boxes == NULL || word->rebuild_word == NULL || + word->rebuild_word->blobs.empty()) { *match_count = 0; *accepted_match_count = 0; return; @@ -132,7 +132,7 @@ inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) { int expected_outline_count; if (STRING (outlines_odd).contains (c)) - return 0; //Don't use this char + return 0; // Don't use this char else if (STRING (outlines_2).contains (c)) expected_outline_count = 2; else @@ -151,17 +151,16 @@ void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it, } } - /************************************************************************* * unrej_good_quality_words() * Accept potential rejects in words which pass the following checks: * - Contains a potential reject * - Word looks like a sensible alpha word. 
* - Word segmentation is the same as the original image - * - All characters have the expected number of outlines + * - All characters have the expected number of outlines * NOTE - the rejection counts are recalculated after unrejection * - CAN'T do it in a single pass without a bit of fiddling - * - keep it simple but inefficient + * - keep it simple but inefficient *************************************************************************/ void Tesseract::unrej_good_quality_words( //unreject potential PAGE_RES_IT &page_res_it) { @@ -403,7 +402,6 @@ void Tesseract::doc_and_block_rejection( //reject big chunks } // namespace tesseract - /************************************************************************* * reject_whole_page() * Don't believe any of it - set the reject map to 00..00 in all words diff --git a/ccmain/equationdetect.cpp b/ccmain/equationdetect.cpp index 06aab249..3ff60c9e 100644 --- a/ccmain/equationdetect.cpp +++ b/ccmain/equationdetect.cpp @@ -624,10 +624,6 @@ void EquationDetect::IdentifySeedParts() { } float EquationDetect::ComputeForegroundDensity(const TBOX& tbox) { -#if LIBLEPT_MINOR_VERSION < 69 && LIBLEPT_MAJOR_VERSION <= 1 - // This will disable the detector because no seed will be identified. 
- return 1.0f; -#else Pix *pix_bi = lang_tesseract_->pix_binary(); int pix_height = pixGetHeight(pix_bi); Box* box = boxCreate(tbox.left(), pix_height - tbox.top(), @@ -639,7 +635,6 @@ float EquationDetect::ComputeForegroundDensity(const TBOX& tbox) { boxDestroy(&box); return fract; -#endif } bool EquationDetect::CheckSeedFgDensity(const float density_th, diff --git a/ccmain/equationdetect.h b/ccmain/equationdetect.h index 09b55396..ddfdf5f5 100644 --- a/ccmain/equationdetect.h +++ b/ccmain/equationdetect.h @@ -17,8 +17,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CCMAIN_EQUATIONDETECT_H__ -#define TESSERACT_CCMAIN_EQUATIONDETECT_H__ +#ifndef TESSERACT_CCMAIN_EQUATIONDETECT_H_ +#define TESSERACT_CCMAIN_EQUATIONDETECT_H_ #include "blobbox.h" #include "equationdetectbase.h" diff --git a/ccmain/fixspace.cpp b/ccmain/fixspace.cpp index f58c9610..5fbe8c9a 100644 --- a/ccmain/fixspace.cpp +++ b/ccmain/fixspace.cpp @@ -3,8 +3,8 @@ * Description: Implements a pass over the page res, exploring the alternative * spacing possibilities, trying to use context to improve the * word spacing -* Author: Phil Cheatle -* Created: Thu Oct 21 11:38:43 BST 1993 +* Author: Phil Cheatle +* Created: Thu Oct 21 11:38:43 BST 1993 * * (C) Copyright 1993, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); @@ -211,7 +211,6 @@ void Tesseract::match_current_words(WERD_RES_LIST &words, ROW *row, } } - /** * @name eval_word_spacing() * The basic measure is the number of characters in contextually confirmed diff --git a/ccmain/linerec.cpp b/ccmain/linerec.cpp new file mode 100644 index 00000000..6c242100 --- /dev/null +++ b/ccmain/linerec.cpp @@ -0,0 +1,333 @@ +/////////////////////////////////////////////////////////////////////// +// File: linerec.cpp +// Description: Top-level line-based recognition module for Tesseract. 
+// Author: Ray Smith
+// Created: Thu May 02 09:47:06 PST 2013
+//
+// (C) Copyright 2013, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#include "tesseractclass.h"
+
+#include "allheaders.h"
+#include "boxread.h"
+#include "imagedata.h"
+#ifndef ANDROID_BUILD
+#include "lstmrecognizer.h"
+#include "recodebeam.h"
+#endif
+#include "ndminx.h"
+#include "pageres.h"
+#include "tprintf.h"
+
+namespace tesseract {
+
+// Arbitrary penalty for non-dictionary words.
+// TODO(rays) How to learn this?
+const float kNonDictionaryPenalty = 5.0f;
+// Scale factor to make certainty more comparable to Tesseract.
+const float kCertaintyScale = 7.0f;
+// Worst acceptable certainty for a dictionary word.
+const float kWorstDictCertainty = -25.0f;
+
+// Generates training data for training a line recognizer, eg LSTM.
+// Breaks the page into lines, according to the boxes, and writes them to a
+// serialized DocumentData based on output_basename.
+void Tesseract::TrainLineRecognizer(const STRING& input_imagename,
+                                    const STRING& output_basename,
+                                    BLOCK_LIST *block_list) {
+  STRING lstmf_name = output_basename + ".lstmf";
+  DocumentData images(lstmf_name);
+  if (applybox_page > 0) {
+    // Load existing document for the previous pages.
+ if (!images.LoadDocument(lstmf_name.string(), "eng", 0, 0, NULL)) { + tprintf("Failed to read training data from %s!\n", lstmf_name.string()); + return; + } + } + GenericVector boxes; + GenericVector texts; + // Get the boxes for this page, if there are any. + if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, NULL, + NULL) || + boxes.empty()) { + tprintf("Failed to read boxes from %s\n", input_imagename.string()); + return; + } + TrainFromBoxes(boxes, texts, block_list, &images); + images.Shuffle(); + if (!images.SaveDocument(lstmf_name.string(), NULL)) { + tprintf("Failed to write training data to %s!\n", lstmf_name.string()); + } +} + +// Generates training data for training a line recognizer, eg LSTM. +// Breaks the boxes into lines, normalizes them, converts to ImageData and +// appends them to the given training_data. +void Tesseract::TrainFromBoxes(const GenericVector& boxes, + const GenericVector& texts, + BLOCK_LIST *block_list, + DocumentData* training_data) { + int box_count = boxes.size(); + // Process all the text lines in this page, as defined by the boxes. + int end_box = 0; + // Don't let \t, which marks newlines in the box file, get into the line + // content, as that makes the line unusable in training. + while (end_box < texts.size() && texts[end_box] == "\t") ++end_box; + for (int start_box = end_box; start_box < box_count; start_box = end_box) { + // Find the textline of boxes starting at start and their bounding box. + TBOX line_box = boxes[start_box]; + STRING line_str = texts[start_box]; + for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t"; + ++end_box) { + line_box += boxes[end_box]; + line_str += texts[end_box]; + } + // Find the most overlapping block. 
+ BLOCK* best_block = NULL; + int best_overlap = 0; + BLOCK_IT b_it(block_list); + for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) { + BLOCK* block = b_it.data(); + if (block->poly_block() != NULL && !block->poly_block()->IsText()) + continue; // Not a text block. + TBOX block_box = block->bounding_box(); + block_box.rotate(block->re_rotation()); + if (block_box.major_overlap(line_box)) { + TBOX overlap_box = line_box.intersection(block_box); + if (overlap_box.area() > best_overlap) { + best_overlap = overlap_box.area(); + best_block = block; + } + } + } + ImageData* imagedata = NULL; + if (best_block == NULL) { + tprintf("No block overlapping textline: %s\n", line_str.string()); + } else { + imagedata = GetLineData(line_box, boxes, texts, start_box, end_box, + *best_block); + } + if (imagedata != NULL) + training_data->AddPageToDocument(imagedata); + // Don't let \t, which marks newlines in the box file, get into the line + // content, as that makes the line unusable in training. + while (end_box < texts.size() && texts[end_box] == "\t") ++end_box; + } +} + +// Returns an Imagedata containing the image of the given box, +// and ground truth boxes/truth text if available in the input. +// The image is not normalized in any way. +ImageData* Tesseract::GetLineData(const TBOX& line_box, + const GenericVector& boxes, + const GenericVector& texts, + int start_box, int end_box, + const BLOCK& block) { + TBOX revised_box; + ImageData* image_data = GetRectImage(line_box, block, kImagePadding, + &revised_box); + if (image_data == NULL) return NULL; + image_data->set_page_number(applybox_page); + // Copy the boxes and shift them so they are relative to the image. 
+ FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y()); + ICOORD shift = -revised_box.botleft(); + GenericVector line_boxes; + GenericVector line_texts; + for (int b = start_box; b < end_box; ++b) { + TBOX box = boxes[b]; + box.rotate(block_rotation); + box.move(shift); + line_boxes.push_back(box); + line_texts.push_back(texts[b]); + } + GenericVector page_numbers; + page_numbers.init_to_size(line_boxes.size(), applybox_page); + image_data->AddBoxes(line_boxes, line_texts, page_numbers); + return image_data; +} + +// Helper gets the image of a rectangle, using the block.re_rotation() if +// needed to get to the image, and rotating the result back to horizontal +// layout. (CJK characters will be on their left sides) The vertical text flag +// is set in the returned ImageData if the text was originally vertical, which +// can be used to invoke a different CJK recognition engine. The revised_box +// is also returned to enable calculation of output bounding boxes. +ImageData* Tesseract::GetRectImage(const TBOX& box, const BLOCK& block, + int padding, TBOX* revised_box) const { + TBOX wbox = box; + wbox.pad(padding, padding); + *revised_box = wbox; + // Number of clockwise 90 degree rotations needed to get back to tesseract + // coords from the clipped image. + int num_rotations = 0; + if (block.re_rotation().y() > 0.0f) + num_rotations = 1; + else if (block.re_rotation().x() < 0.0f) + num_rotations = 2; + else if (block.re_rotation().y() < 0.0f) + num_rotations = 3; + // Handle two cases automatically: 1 the box came from the block, 2 the box + // came from a box file, and refers to the image, which the block may not. + if (block.bounding_box().major_overlap(*revised_box)) + revised_box->rotate(block.re_rotation()); + // Now revised_box always refers to the image. + // BestPix is never colormapped, but may be of any depth. 
+ Pix* pix = BestPix(); + int width = pixGetWidth(pix); + int height = pixGetHeight(pix); + TBOX image_box(0, 0, width, height); + // Clip to image bounds; + *revised_box &= image_box; + if (revised_box->null_box()) return NULL; + Box* clip_box = boxCreate(revised_box->left(), height - revised_box->top(), + revised_box->width(), revised_box->height()); + Pix* box_pix = pixClipRectangle(pix, clip_box, NULL); + if (box_pix == NULL) return NULL; + boxDestroy(&clip_box); + if (num_rotations > 0) { + Pix* rot_pix = pixRotateOrth(box_pix, num_rotations); + pixDestroy(&box_pix); + box_pix = rot_pix; + } + // Convert sub-8-bit images to 8 bit. + int depth = pixGetDepth(box_pix); + if (depth < 8) { + Pix* grey; + grey = pixConvertTo8(box_pix, false); + pixDestroy(&box_pix); + box_pix = grey; + } + bool vertical_text = false; + if (num_rotations > 0) { + // Rotated the clipped revised box back to internal coordinates. + FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y()); + revised_box->rotate(rotation); + if (num_rotations != 2) + vertical_text = true; + } + return new ImageData(vertical_text, box_pix); +} + +#ifndef ANDROID_BUILD +// Recognizes a word or group of words, converting to WERD_RES in *words. +// Analogous to classify_word_pass1, but can handle a group of words as well. +void Tesseract::LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word, + PointerVector* words) { + TBOX word_box = word->word->bounding_box(); + // Get the word image - no frills. + if (tessedit_pageseg_mode == PSM_SINGLE_WORD || + tessedit_pageseg_mode == PSM_RAW_LINE) { + // In single word mode, use the whole image without any other row/word + // interpretation. 
+ word_box = TBOX(0, 0, ImageWidth(), ImageHeight()); + } else { + float baseline = row->base_line((word_box.left() + word_box.right()) / 2); + if (baseline + row->descenders() < word_box.bottom()) + word_box.set_bottom(baseline + row->descenders()); + if (baseline + row->x_height() + row->ascenders() > word_box.top()) + word_box.set_top(baseline + row->x_height() + row->ascenders()); + } + ImageData* im_data = GetRectImage(word_box, block, kImagePadding, &word_box); + if (im_data == NULL) return; + lstm_recognizer_->RecognizeLine(*im_data, true, classify_debug_level > 0, + kWorstDictCertainty / kCertaintyScale, + lstm_use_matrix, &unicharset, word_box, 2.0, + false, words); + delete im_data; + SearchWords(words); +} + +// Apply segmentation search to the given set of words, within the constraints +// of the existing ratings matrix. If there is already a best_choice on a word +// leaves it untouched and just sets the done/accepted etc flags. +void Tesseract::SearchWords(PointerVector* words) { + // Run the segmentation search on the network outputs and make a BoxWord + // for each of the output words. + // If we drop a word as junk, then there is always a space in front of the + // next. + const Dict* stopper_dict = lstm_recognizer_->GetDict(); + if (stopper_dict == nullptr) stopper_dict = &getDict(); + bool any_nonspace_delimited = false; + for (int w = 0; w < words->size(); ++w) { + WERD_RES* word = (*words)[w]; + if (word->best_choice != nullptr && + word->best_choice->ContainsAnyNonSpaceDelimited()) { + any_nonspace_delimited = true; + break; + } + } + for (int w = 0; w < words->size(); ++w) { + WERD_RES* word = (*words)[w]; + if (word->best_choice == NULL) { + // If we are using the beam search, the unicharset had better match! 
+ word->SetupWordScript(unicharset); + WordSearch(word); + } else if (word->best_choice->unicharset() == &unicharset && + !lstm_recognizer_->IsRecoding()) { + // We set up the word without using the dictionary, so set the permuter + // now, but we can only do it because the unicharsets match. + word->best_choice->set_permuter( + getDict().valid_word(*word->best_choice, true)); + } + if (word->best_choice == NULL) { + // It is a dud. + word->SetupFake(lstm_recognizer_->GetUnicharset()); + } else { + // Set the best state. + for (int i = 0; i < word->best_choice->length(); ++i) { + int length = word->best_choice->state(i); + word->best_state.push_back(length); + } + word->reject_map.initialise(word->best_choice->length()); + word->tess_failed = false; + word->tess_accepted = true; + word->tess_would_adapt = false; + word->done = true; + word->tesseract = this; + float word_certainty = MIN(word->space_certainty, + word->best_choice->certainty()); + word_certainty *= kCertaintyScale; + // Arbitrary ding factor for non-dictionary words. + if (!lstm_recognizer_->IsRecoding() && + !Dict::valid_word_permuter(word->best_choice->permuter(), true)) + word_certainty -= kNonDictionaryPenalty; + if (getDict().stopper_debug_level >= 1) { + tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n", + word->best_choice->certainty(), word->space_certainty, + MIN(word->space_certainty, word->best_choice->certainty()) * + kCertaintyScale, + word_certainty); + word->best_choice->print(); + } + word->best_choice->set_certainty(word_certainty); + // Discard words that are impossibly bad, but allow a bit more for + // dictionary words, and keep bad words in non-space-delimited langs. 
+ if (word_certainty >= RecodeBeamSearch::kMinCertainty || + any_nonspace_delimited || + (word_certainty >= kWorstDictCertainty && + Dict::valid_word_permuter(word->best_choice->permuter(), true))) { + word->tess_accepted = stopper_dict->AcceptableResult(word); + } else { + if (getDict().stopper_debug_level >= 1) { + tprintf("Deleting word with certainty %g\n", word_certainty); + word->best_choice->print(); + } + // It is a dud. + word->SetupFake(lstm_recognizer_->GetUnicharset()); + } + } + } +} +#endif // ANDROID_BUILD + +} // namespace tesseract. diff --git a/ccmain/ltrresultiterator.cpp b/ccmain/ltrresultiterator.cpp index d5b85946..ae582b30 100644 --- a/ccmain/ltrresultiterator.cpp +++ b/ccmain/ltrresultiterator.cpp @@ -145,13 +145,12 @@ float LTRResultIterator::Confidence(PageIteratorLevel level) const { return 0.0f; } -void LTRResultIterator::RowAttributes(float* row_height, - float* descenders, +void LTRResultIterator::RowAttributes(float* row_height, float* descenders, float* ascenders) const { - *row_height = it_->row()->row->x_height() + it_->row()-> row->ascenders() - - it_->row()->row->descenders(); - *descenders = it_->row()->row->descenders(); - *ascenders = it_->row()->row->ascenders(); + *row_height = it_->row()->row->x_height() + it_->row()->row->ascenders() - + it_->row()->row->descenders(); + *descenders = it_->row()->row->descenders(); + *ascenders = it_->row()->row->ascenders(); } // Returns the font attributes of the current word. If iterating at a higher @@ -221,6 +220,12 @@ bool LTRResultIterator::WordIsFromDictionary() const { permuter == USER_DAWG_PERM; } +// Returns the number of blanks before the current word. +int LTRResultIterator::BlanksBeforeWord() const { + if (it_->word() == NULL) return 1; + return it_->word()->word->space(); +} + // Returns true if the current word is numeric. bool LTRResultIterator::WordIsNumeric() const { if (it_->word() == NULL) return false; // Already at the end! 
diff --git a/ccmain/ltrresultiterator.h b/ccmain/ltrresultiterator.h index 8819c2a0..95722cae 100644 --- a/ccmain/ltrresultiterator.h +++ b/ccmain/ltrresultiterator.h @@ -18,8 +18,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H__ -#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H__ +#ifndef TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ +#define TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ #include "platform.h" #include "pageiterator.h" @@ -92,8 +92,7 @@ class TESS_API LTRResultIterator : public PageIterator { float Confidence(PageIteratorLevel level) const; // Returns the attributes of the current row. - void RowAttributes(float* row_height, - float* descenders, + void RowAttributes(float* row_height, float* descenders, float* ascenders) const; // ============= Functions that refer to words only ============. @@ -125,6 +124,9 @@ class TESS_API LTRResultIterator : public PageIterator { // Returns true if the current word was found in a dictionary. bool WordIsFromDictionary() const; + // Returns the number of blanks before the current word. + int BlanksBeforeWord() const; + // Returns true if the current word is numeric. bool WordIsNumeric() const; @@ -216,4 +218,4 @@ class ChoiceIterator { } // namespace tesseract. -#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H__ +#endif // TESSERACT_CCMAIN_LTR_RESULT_ITERATOR_H_ diff --git a/ccmain/mutableiterator.h b/ccmain/mutableiterator.h index f097f47e..54759ba4 100644 --- a/ccmain/mutableiterator.h +++ b/ccmain/mutableiterator.h @@ -18,8 +18,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CCMAIN_MUTABLEITERATOR_H__ -#define TESSERACT_CCMAIN_MUTABLEITERATOR_H__ +#ifndef TESSERACT_CCMAIN_MUTABLEITERATOR_H_ +#define TESSERACT_CCMAIN_MUTABLEITERATOR_H_ #include "resultiterator.h" @@ -61,4 +61,4 @@ class MutableIterator : public ResultIterator { } // namespace tesseract. 
-#endif // TESSERACT_CCMAIN_MUTABLEITERATOR_H__ +#endif // TESSERACT_CCMAIN_MUTABLEITERATOR_H_ diff --git a/ccmain/osdetect.cpp b/ccmain/osdetect.cpp index f2fe94a1..998b3bed 100644 --- a/ccmain/osdetect.cpp +++ b/ccmain/osdetect.cpp @@ -164,13 +164,19 @@ void remove_nontext_regions(tesseract::Tesseract *tess, BLOCK_LIST *blocks, int vertical_y = 1; tesseract::TabVector_LIST v_lines; tesseract::TabVector_LIST h_lines; - int resolution = (kMinCredibleResolution > pixGetXRes(pix)) ? - kMinCredibleResolution : pixGetXRes(pix); + int resolution; + if (kMinCredibleResolution > pixGetXRes(pix)) { + resolution = kMinCredibleResolution; + tprintf("Warning. Invalid resolution %d dpi. Using %d instead.\n", + pixGetXRes(pix), resolution); + } else { + resolution = pixGetXRes(pix); + } tesseract::LineFinder::FindAndRemoveLines(resolution, false, pix, &vertical_x, &vertical_y, NULL, &v_lines, &h_lines); - Pix* im_pix = tesseract::ImageFind::FindImages(pix); + Pix* im_pix = tesseract::ImageFind::FindImages(pix, nullptr); if (im_pix != NULL) { pixSubtract(pix, pix, im_pix); pixDestroy(&im_pix); diff --git a/ccmain/osdetect.h b/ccmain/osdetect.h index c5dc8d50..34a8f5e7 100644 --- a/ccmain/osdetect.h +++ b/ccmain/osdetect.h @@ -17,8 +17,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CCMAIN_OSDETECT_H__ -#define TESSERACT_CCMAIN_OSDETECT_H__ +#ifndef TESSERACT_CCMAIN_OSDETECT_H_ +#define TESSERACT_CCMAIN_OSDETECT_H_ #include "strngs.h" #include "unicharset.h" @@ -135,4 +135,4 @@ bool os_detect_blob(BLOBNBOX* bbox, OrientationDetector* o, // applied for the text to be upright (readable). 
TESS_API int OrientationIdToValue(const int& id); -#endif // TESSERACT_CCMAIN_OSDETECT_H__ +#endif // TESSERACT_CCMAIN_OSDETECT_H_ diff --git a/ccmain/output.cpp b/ccmain/output.cpp index ddfcfc54..6fca63e4 100644 --- a/ccmain/output.cpp +++ b/ccmain/output.cpp @@ -1,8 +1,8 @@ /****************************************************************** * File: output.cpp (Formerly output.c) * Description: Output pass - * Author: Phil Cheatle - * Created: Thu Aug 4 10:56:08 BST 1994 + * Author: Phil Cheatle + * Created: Thu Aug 4 10:56:08 BST 1994 * * (C) Copyright 1994, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); @@ -78,18 +78,16 @@ void Tesseract::output_pass( //Tess output pass //send to api while (page_res_it.word () != NULL) { check_debug_pt (page_res_it.word (), 120); - if (target_word_box) - { - - TBOX current_word_box=page_res_it.word ()->word->bounding_box(); - FCOORD center_pt((current_word_box.right()+current_word_box.left())/2,(current_word_box.bottom()+current_word_box.top())/2); - if (!target_word_box->contains(center_pt)) - { - page_res_it.forward (); - continue; - } - - } + if (target_word_box) { + TBOX current_word_box = page_res_it.word()->word->bounding_box(); + FCOORD center_pt( + (current_word_box.right() + current_word_box.left()) / 2, + (current_word_box.bottom() + current_word_box.top()) / 2); + if (!target_word_box->contains(center_pt)) { + page_res_it.forward(); + continue; + } + } if (tessedit_write_block_separators && block_of_last_word != page_res_it.block ()) { block_of_last_word = page_res_it.block (); @@ -337,7 +335,7 @@ void Tesseract::set_unlv_suspects(WERD_RES *word_res) { rating_per_ch = word.rating() / word_res->reject_map.length(); if (rating_per_ch >= suspect_rating_per_ch) - return; //Don't touch bad ratings + return; // Don't touch bad ratings if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) { /* Unreject any Tess Acceptable word - but NOT tess reject chs*/ diff 
--git a/ccmain/pageiterator.cpp b/ccmain/pageiterator.cpp index fc15840c..7d7865ae 100644 --- a/ccmain/pageiterator.cpp +++ b/ccmain/pageiterator.cpp @@ -87,7 +87,7 @@ const PageIterator& PageIterator::operator=(const PageIterator& src) { rect_top_ = src.rect_top_; rect_width_ = src.rect_width_; rect_height_ = src.rect_height_; - if (it_ != NULL) delete it_; + delete it_; it_ = new PAGE_RES_IT(*src.it_); BeginWord(src.blob_index_); return *this; @@ -597,10 +597,8 @@ void PageIterator::BeginWord(int offset) { } word_ = NULL; // We will be iterating the box_word. - if (cblob_it_ != NULL) { - delete cblob_it_; - cblob_it_ = NULL; - } + delete cblob_it_; + cblob_it_ = NULL; } else { // No recognition yet, so a "symbol" is a cblob. word_ = word_res->word; diff --git a/ccmain/pageiterator.h b/ccmain/pageiterator.h index 56c78150..719a1b60 100644 --- a/ccmain/pageiterator.h +++ b/ccmain/pageiterator.h @@ -18,8 +18,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H__ -#define TESSERACT_CCMAIN_PAGEITERATOR_H__ +#ifndef TESSERACT_CCMAIN_PAGEITERATOR_H_ +#define TESSERACT_CCMAIN_PAGEITERATOR_H_ #include "publictypes.h" #include "platform.h" @@ -361,4 +361,4 @@ class TESS_API PageIterator { } // namespace tesseract. 
-#endif // TESSERACT_CCMAIN_PAGEITERATOR_H__ +#endif // TESSERACT_CCMAIN_PAGEITERATOR_H_ diff --git a/ccmain/pagesegmain.cpp b/ccmain/pagesegmain.cpp index 4e3c3420..d481dc66 100644 --- a/ccmain/pagesegmain.cpp +++ b/ccmain/pagesegmain.cpp @@ -18,9 +18,6 @@ **********************************************************************/ #ifdef _WIN32 -#ifndef __GNUC__ -#include -#endif // __GNUC__ #ifndef unlink #include #endif @@ -40,6 +37,7 @@ #include "blobbox.h" #include "blread.h" #include "colfind.h" +#include "debugpixa.h" #include "equationdetect.h" #include "imagefind.h" #include "linefind.h" @@ -179,28 +177,6 @@ int Tesseract::SegmentPage(const STRING* input_file, BLOCK_LIST* blocks, return auto_page_seg_ret_val; } -// Helper writes a grey image to a file for use by scrollviewer. -// Normally for speed we don't display the image in the layout debug windows. -// If textord_debug_images is true, we draw the image as a background to some -// of the debug windows. printable determines whether these -// images are optimized for printing instead of screen display. -static void WriteDebugBackgroundImage(bool printable, Pix* pix_binary) { - Pix* grey_pix = pixCreate(pixGetWidth(pix_binary), - pixGetHeight(pix_binary), 8); - // Printable images are light grey on white, but for screen display - // they are black on dark grey so the other colors show up well. - if (printable) { - pixSetAll(grey_pix); - pixSetMasked(grey_pix, pix_binary, 192); - } else { - pixSetAllArbitrary(grey_pix, 64); - pixSetMasked(grey_pix, pix_binary, 0); - } - AlignedBlob::IncrementDebugPix(); - pixWrite(AlignedBlob::textord_debug_pix().string(), grey_pix, IFF_PNG); - pixDestroy(&grey_pix); -} - /** * Auto page segmentation. Divide the page image into blocks of uniform * text linespacing and images. 
@@ -229,9 +205,6 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks, TO_BLOCK_LIST* to_blocks, BLOBNBOX_LIST* diacritic_blobs, Tesseract* osd_tess, OSResults* osr) { - if (textord_debug_images) { - WriteDebugBackgroundImage(textord_debug_printable, pix_binary_); - } Pix* photomask_pix = NULL; Pix* musicmask_pix = NULL; // The blocks made by the ColumnFinder. Moved to blocks before return. @@ -253,9 +226,10 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks, if (equ_detect_) { finder->SetEquationDetect(equ_detect_); } - result = finder->FindBlocks( - pageseg_mode, scaled_color_, scaled_factor_, to_block, photomask_pix, - pix_thresholds_, pix_grey_, &found_blocks, diacritic_blobs, to_blocks); + result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_, + to_block, photomask_pix, pix_thresholds_, + pix_grey_, &pixa_debug_, &found_blocks, + diacritic_blobs, to_blocks); if (result >= 0) finder->GetDeskewVectors(&deskew_, &reskew_); delete finder; @@ -268,11 +242,6 @@ int Tesseract::AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST* blocks, BLOCK_IT block_it(blocks); // Move the found blocks to the input/output blocks. block_it.add_list_after(&found_blocks); - - if (textord_debug_images) { - // The debug image is no longer needed so delete it. - unlink(AlignedBlob::textord_debug_pix().string()); - } return result; } @@ -314,19 +283,21 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation( ASSERT_HOST(pix_binary_ != NULL); if (tessedit_dump_pageseg_images) { - pixWrite("tessinput.png", pix_binary_, IFF_PNG); + pixa_debug_.AddPix(pix_binary_, "PageSegInput"); } // Leptonica is used to find the rule/separator lines in the input. 
LineFinder::FindAndRemoveLines(source_resolution_, textord_tabfind_show_vlines, pix_binary_, &vertical_x, &vertical_y, music_mask_pix, &v_lines, &h_lines); - if (tessedit_dump_pageseg_images) - pixWrite("tessnolines.png", pix_binary_, IFF_PNG); + if (tessedit_dump_pageseg_images) { + pixa_debug_.AddPix(pix_binary_, "NoLines"); + } // Leptonica is used to find a mask of the photo regions in the input. - *photo_mask_pix = ImageFind::FindImages(pix_binary_); - if (tessedit_dump_pageseg_images) - pixWrite("tessnoimages.png", pix_binary_, IFF_PNG); + *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_); + if (tessedit_dump_pageseg_images) { + pixa_debug_.AddPix(pix_binary_, "NoImages"); + } if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear(); // The rest of the algorithm uses the usual connected components. @@ -412,9 +383,10 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation( "Don't rotate.\n", osd_margin); osd_orientation = 0; } else { - tprintf("OSD: Weak margin (%.2f) for %d blob text block, " - "but using orientation anyway: %d\n", - osd_margin, osd_blobs.length(), osd_orientation); + tprintf( + "OSD: Weak margin (%.2f) for %d blob text block, " + "but using orientation anyway: %d\n", + osd_margin, osd_blobs.length(), osd_orientation); } } } diff --git a/ccmain/par_control.cpp b/ccmain/par_control.cpp index 7a7d0415..be8d3563 100644 --- a/ccmain/par_control.cpp +++ b/ccmain/par_control.cpp @@ -18,9 +18,9 @@ /////////////////////////////////////////////////////////////////////// #include "tesseractclass.h" -#ifdef OPENMP +#ifdef _OPENMP #include -#endif // OPENMP +#endif // _OPENMP namespace tesseract { @@ -53,7 +53,9 @@ void Tesseract::PrerecAllWordsPar(const GenericVector& words) { } // Pre-classify all the blobs. 
if (tessedit_parallelize > 1) { - #pragma omp parallel for num_threads(10) +#ifdef _OPENMP +#pragma omp parallel for num_threads(10) +#endif // _OPENMP for (int b = 0; b < blobs.size(); ++b) { *blobs[b].choices = blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, NULL); diff --git a/ccmain/paragraphs.cpp b/ccmain/paragraphs.cpp index b46f9f3e..c7d21a91 100644 --- a/ccmain/paragraphs.cpp +++ b/ccmain/paragraphs.cpp @@ -2052,7 +2052,7 @@ void ConvertHypothesizedModelRunsToParagraphs( bool single_line_paragraph = false; SetOfModels models; rows[start].NonNullHypotheses(&models); - if (models.size() > 0) { + if (!models.empty()) { model = models[0]; if (rows[start].GetLineType(model) != LT_BODY) single_line_paragraph = true; @@ -2113,6 +2113,7 @@ void ConvertHypothesizedModelRunsToParagraphs( if ((*row_owners)[row] != NULL) { tprintf("Memory leak! ConvertHypothesizeModelRunsToParagraphs() called " "more than once!\n"); + delete (*row_owners)[row]; } (*row_owners)[row] = p; } @@ -2189,17 +2190,17 @@ void LeftoverSegments(const GenericVector &rows, SetOfModels models_w_crowns; rows[i].StrongHypotheses(&models); rows[i].NonNullHypotheses(&models_w_crowns); - if (models.empty() && models_w_crowns.size() > 0) { + if (models.empty() && !models_w_crowns.empty()) { // Crown paragraph. Is it followed by a modeled line? 
for (int end = i + 1; end < rows.size(); end++) { SetOfModels end_models; SetOfModels strong_end_models; rows[end].NonNullHypotheses(&end_models); rows[end].StrongHypotheses(&strong_end_models); - if (end_models.size() == 0) { + if (end_models.empty()) { needs_fixing = true; break; - } else if (strong_end_models.size() > 0) { + } else if (!strong_end_models.empty()) { needs_fixing = false; break; } @@ -2484,7 +2485,7 @@ void InitializeRowInfo(bool after_recognition, info->ltr = ltr >= rtl; info->has_leaders = num_leaders > 3; info->num_words = werds.size(); - if (werds.size() > 0) { + if (!werds.empty()) { WERD_RES *lword = werds[0], *rword = werds[werds.size() - 1]; info->lword_text = lword->best_choice->unichar_string().string(); info->rword_text = rword->best_choice->unichar_string().string(); @@ -2537,7 +2538,7 @@ void DetectParagraphs(int debug_level, // If we're called before text recognition, we might not have // tight block bounding boxes, so trim by the minimum on each side. - if (row_infos.size() > 0) { + if (!row_infos.empty()) { int min_lmargin = row_infos[0].pix_ldistance; int min_rmargin = row_infos[0].pix_rdistance; for (int i = 1; i < row_infos.size(); i++) { diff --git a/ccmain/paramsd.cpp b/ccmain/paramsd.cpp index 7784f853..e0e60539 100644 --- a/ccmain/paramsd.cpp +++ b/ccmain/paramsd.cpp @@ -329,13 +329,19 @@ void ParamsEditor::WriteParams(char *filename, fclose(fp); sprintf (msg_str, "Overwrite file " "%s" "? (Y/N)", filename); int a = sv_window_->ShowYesNoDialog(msg_str); - if (a == 'n') { return; } // don't write + if (a == 'n') { + return; + } // don't write } fp = fopen (filename, "wb"); // can we write to it? 
if (fp == NULL) { - sv_window_->AddMessage("Can't write to file " "%s" "", filename); + sv_window_->AddMessage( + "Can't write to file " + "%s" + "", + filename); return; } diff --git a/ccmain/paramsd.h b/ccmain/paramsd.h index c45cebd4..81a03d18 100644 --- a/ccmain/paramsd.h +++ b/ccmain/paramsd.h @@ -19,14 +19,12 @@ // // Tesseract parameter editor is used to edit all the parameters used // within tesseract from the ui. +#ifndef TESSERACT_CCMAIN_PARAMSD_H_ +#define TESSERACT_CCMAIN_PARAMSD_H_ + #ifndef GRAPHICS_DISABLED -#ifndef VARABLED_H -#define VARABLED_H #include "elst.h" -#ifndef NO_CUBE_BUILD -#include "scrollview.h" -#endif #include "params.h" #include "tesseractclass.h" @@ -122,5 +120,5 @@ class ParamsEditor : public SVEventHandler { ScrollView* sv_window_; }; -#endif -#endif +#endif // GRAPHICS_DISABLED +#endif // TESSERACT_CCMAIN_PARAMSD_H_ diff --git a/ccmain/pgedit.cpp b/ccmain/pgedit.cpp index d78c0dac..5e235954 100644 --- a/ccmain/pgedit.cpp +++ b/ccmain/pgedit.cpp @@ -191,7 +191,7 @@ ScrollView* bln_word_window_handle() { // return handle */ void build_image_window(int width, int height) { - if (image_win != NULL) { delete image_win; } + delete image_win; image_win = new ScrollView(editor_image_win_name.string(), editor_image_xpos, editor_image_ypos, width + 1, diff --git a/ccmain/reject.cpp b/ccmain/reject.cpp index aacc80dd..72f9d873 100644 --- a/ccmain/reject.cpp +++ b/ccmain/reject.cpp @@ -1,8 +1,8 @@ /********************************************************************** * File: reject.cpp (Formerly reject.c) * Description: Rejection functions used in tessedit - * Author: Phil Cheatle - * Created: Wed Sep 23 16:50:21 BST 1992 + * Author: Phil Cheatle + * Created: Wed Sep 23 16:50:21 BST 1992 * * (C) Copyright 1992, Hewlett-Packard Ltd. 
** Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/ccmain/resultiterator.h b/ccmain/resultiterator.h index d6e8a918..e5516836 100644 --- a/ccmain/resultiterator.h +++ b/ccmain/resultiterator.h @@ -19,8 +19,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H__ -#define TESSERACT_CCMAIN_RESULT_ITERATOR_H__ +#ifndef TESSERACT_CCMAIN_RESULT_ITERATOR_H_ +#define TESSERACT_CCMAIN_RESULT_ITERATOR_H_ #include "platform.h" #include "ltrresultiterator.h" @@ -241,4 +241,4 @@ class TESS_API ResultIterator : public LTRResultIterator { } // namespace tesseract. -#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H__ +#endif // TESSERACT_CCMAIN_RESULT_ITERATOR_H_ diff --git a/ccmain/tessedit.cpp b/ccmain/tessedit.cpp index dd96ba0e..defea65e 100644 --- a/ccmain/tessedit.cpp +++ b/ccmain/tessedit.cpp @@ -40,11 +40,14 @@ #include "efio.h" #include "danerror.h" #include "globals.h" +#ifndef ANDROID_BUILD +#include "lstmrecognizer.h" +#endif #include "tesseractclass.h" #include "params.h" #define VARDIR "configs/" /*variables files */ - //config under api + // config under api #define API_CONFIG "configs/api_config" ETEXT_DESC *global_monitor = NULL; // progress monitor @@ -89,8 +92,8 @@ bool Tesseract::init_tesseract_lang_data( const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector *vars_vec, - const GenericVector *vars_values, - bool set_only_non_debug_params) { + const GenericVector *vars_values, bool set_only_non_debug_params, + TessdataManager *mgr) { // Set the basename, compute the data directory. main_setup(arg0, textbase); @@ -102,20 +105,39 @@ bool Tesseract::init_tesseract_lang_data( // Initialize TessdataManager. 
STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix; - if (!tessdata_manager.Init(tessdata_path.string(), - tessdata_manager_debug_level)) { - return false; + if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string())) { + // Try without tessdata. + m_data_sub_dir.set_value(""); + main_setup(arg0, textbase); + language_data_path_prefix = datadir; + language_data_path_prefix += lang; + language_data_path_prefix += "."; + tessdata_path = language_data_path_prefix + kTrainedDataSuffix; + if (!mgr->Init(tessdata_path.string())) { + tprintf("Error opening data file %s\n", tessdata_path.string()); + tprintf( + "Please make sure the TESSDATA_PREFIX environment variable is set" + " to your \"tessdata\" directory.\n"); + return false; + } + } + if (oem == OEM_DEFAULT) { + // Set the engine mode from availability, which can then be overidden by + // the config file when we read it below. + if (!mgr->IsLSTMAvailable()) { + tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY); + } else if (!mgr->IsBaseAvailable()) { + tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY); + } else { + tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED); + } } // If a language specific config file (lang.config) exists, load it in. - if (tessdata_manager.SeekToStart(TESSDATA_LANG_CONFIG)) { - ParamUtils::ReadParamsFromFp( - tessdata_manager.GetDataFilePtr(), - tessdata_manager.GetEndOffset(TESSDATA_LANG_CONFIG), - SET_PARAM_CONSTRAINT_NONE, this->params()); - if (tessdata_manager_debug_level) { - tprintf("Loaded language config file\n"); - } + TFile fp; + if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) { + ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp, + this->params()); } SetParamConstraint set_params_constraint = set_only_non_debug_params ? 
@@ -145,10 +167,6 @@ bool Tesseract::init_tesseract_lang_data( if (params_file != NULL) { ParamUtils::PrintParams(params_file, this->params()); fclose(params_file); - if (tessdata_manager_debug_level > 0) { - tprintf("Wrote parameters to %s\n", - tessedit_write_params_to_file.string()); - } } else { tprintf("Failed to open %s for writing params.\n", tessedit_write_params_to_file.string()); @@ -157,30 +175,48 @@ bool Tesseract::init_tesseract_lang_data( // Determine which ocr engine(s) should be loaded and used for recognition. if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem); - if (tessdata_manager_debug_level) { - tprintf("Loading Tesseract/Cube with tessedit_ocr_engine_mode %d\n", - static_cast(tessedit_ocr_engine_mode)); - } // If we are only loading the config file (and so not planning on doing any // recognition) then there's nothing else do here. if (tessedit_init_config_only) { - if (tessdata_manager_debug_level) { - tprintf("Returning after loading config file\n"); - } return true; } +// The various OcrEngineMode settings (see publictypes.h) determine which +// engine-specific data files need to be loaded. +// If LSTM_ONLY is requested, the base Tesseract files are *Not* required. +#ifndef ANDROID_BUILD + if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY || + tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) { + if (mgr->swap()) { + tprintf("Error: LSTM requested on big-endian hardware!!\n"); + tprintf("Big-endian not yet supported! Loading tesseract.\n"); + tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY); + } else if (mgr->GetComponent(TESSDATA_LSTM, &fp)) { + lstm_recognizer_ = new LSTMRecognizer; + ASSERT_HOST(lstm_recognizer_->DeSerialize(mgr->swap(), &fp)); + if (lstm_use_matrix) lstm_recognizer_->LoadDictionary(language, mgr); + } else { + tprintf("Error: LSTM requested, but not present!! 
Loading tesseract.\n"); + tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY); + } + } +#endif + // Load the unicharset - if (!tessdata_manager.SeekToStart(TESSDATA_UNICHARSET) || - !unicharset.load_from_file(tessdata_manager.GetDataFilePtr())) { + if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) { + // Avoid requiring a unicharset when we aren't running base tesseract. +#ifndef ANDROID_BUILD + unicharset.CopyFrom(lstm_recognizer_->GetUnicharset()); +#endif + } else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) || + !unicharset.load_from_file(&fp, false)) { return false; } if (unicharset.size() > MAX_NUM_CLASSES) { tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n"); return false; } - if (tessdata_manager_debug_level) tprintf("Loaded unicharset\n"); right_to_left_ = unicharset.major_right_to_left(); // Setup initial unichar ambigs table and read universal ambigs. @@ -189,33 +225,11 @@ bool Tesseract::init_tesseract_lang_data( unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption); unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset); - if (!tessedit_ambigs_training && - tessdata_manager.SeekToStart(TESSDATA_AMBIGS)) { - TFile ambigs_file; - ambigs_file.Open(tessdata_manager.GetDataFilePtr(), - tessdata_manager.GetEndOffset(TESSDATA_AMBIGS) + 1); - unichar_ambigs.LoadUnicharAmbigs( - encoder_unicharset, - &ambigs_file, - ambigs_debug_level, use_ambigs_for_adaption, &unicharset); - if (tessdata_manager_debug_level) tprintf("Loaded ambigs\n"); + if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) { + unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp, + ambigs_debug_level, + use_ambigs_for_adaption, &unicharset); } - - // The various OcrEngineMode settings (see publictypes.h) determine which - // engine-specific data files need to be loaded. 
Currently everything needs - // the base tesseract data, which supplies other useful information, but - // alternative engines, such as cube and LSTM are optional. -#ifndef NO_CUBE_BUILD - if (tessedit_ocr_engine_mode == OEM_CUBE_ONLY) { - ASSERT_HOST(init_cube_objects(false, &tessdata_manager)); - if (tessdata_manager_debug_level) - tprintf("Loaded Cube w/out combiner\n"); - } else if (tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED) { - ASSERT_HOST(init_cube_objects(true, &tessdata_manager)); - if (tessdata_manager_debug_level) - tprintf("Loaded Cube with combiner\n"); - } -#endif // Init ParamsModel. // Load pass1 and pass2 weights (for now these two sets are the same, but in // the future separate sets of weights can be generated). @@ -223,15 +237,12 @@ bool Tesseract::init_tesseract_lang_data( p < ParamsModel::PTRAIN_NUM_PASSES; ++p) { language_model_->getParamsModel().SetPass( static_cast(p)); - if (tessdata_manager.SeekToStart(TESSDATA_PARAMS_MODEL)) { - if (!language_model_->getParamsModel().LoadFromFp( - lang.string(), tessdata_manager.GetDataFilePtr(), - tessdata_manager.GetEndOffset(TESSDATA_PARAMS_MODEL))) { + if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) { + if (!language_model_->getParamsModel().LoadFromFp(lang.string(), &fp)) { return false; } } } - if (tessdata_manager_debug_level) language_model_->getParamsModel().Print(); return true; } @@ -276,8 +287,6 @@ void Tesseract::ParseLanguageString(const char* lang_str, remains = next; // Check whether lang_code is already in the target vector and add. if (!IsStrInList(lang_code, *target)) { - if (tessdata_manager_debug_level) - tprintf("Adding language '%s' to list\n", lang_code.string()); target->push_back(lang_code); } } @@ -287,12 +296,13 @@ void Tesseract::ParseLanguageString(const char* lang_str, // string and recursively any additional languages required by any language // traineddata file (via tessedit_load_sublangs in its config) that is loaded. 
// See init_tesseract_internal for args. -int Tesseract::init_tesseract( - const char *arg0, const char *textbase, const char *language, - OcrEngineMode oem, char **configs, int configs_size, - const GenericVector *vars_vec, - const GenericVector *vars_values, - bool set_only_non_debug_params) { +int Tesseract::init_tesseract(const char *arg0, const char *textbase, + const char *language, OcrEngineMode oem, + char **configs, int configs_size, + const GenericVector *vars_vec, + const GenericVector *vars_values, + bool set_only_non_debug_params, + TessdataManager *mgr) { GenericVector langs_to_load; GenericVector langs_not_to_load; ParseLanguageString(language, &langs_to_load, &langs_not_to_load); @@ -314,15 +324,15 @@ int Tesseract::init_tesseract( } int result = tess_to_init->init_tesseract_internal( - arg0, textbase, lang_str, oem, configs, configs_size, - vars_vec, vars_values, set_only_non_debug_params); + arg0, textbase, lang_str, oem, configs, configs_size, vars_vec, + vars_values, set_only_non_debug_params, mgr); + // Forget that language, but keep any reader we were given. 
+ mgr->Clear(); if (!loaded_primary) { if (result < 0) { tprintf("Failed loading language '%s'\n", lang_str); } else { - if (tessdata_manager_debug_level) - tprintf("Loaded language '%s' as main language\n", lang_str); ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(), &langs_to_load, &langs_not_to_load); loaded_primary = true; @@ -332,8 +342,6 @@ int Tesseract::init_tesseract( tprintf("Failed loading language '%s'\n", lang_str); delete tess_to_init; } else { - if (tessdata_manager_debug_level) - tprintf("Loaded language '%s' as secondary language\n", lang_str); sub_langs_.push_back(tess_to_init); // Add any languages that this language requires ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(), @@ -358,16 +366,11 @@ int Tesseract::init_tesseract( this->language_model_->getParamsModel()); } tprintf("Using params model of the primary language\n"); - if (tessdata_manager_debug_level) { - this->language_model_->getParamsModel().Print(); - } } else { this->language_model_->getParamsModel().Clear(); for (int s = 0; s < sub_langs_.size(); ++s) { sub_langs_[s]->language_model_->getParamsModel().Clear(); } - if (tessdata_manager_debug_level) - tprintf("Using default language params\n"); } } @@ -391,33 +394,26 @@ int Tesseract::init_tesseract( // in vars_vec. // If set_only_init_params is true, then only the initialization variables // will be set. 
-int Tesseract::init_tesseract_internal( - const char *arg0, const char *textbase, const char *language, - OcrEngineMode oem, char **configs, int configs_size, - const GenericVector *vars_vec, - const GenericVector *vars_values, - bool set_only_non_debug_params) { +int Tesseract::init_tesseract_internal(const char *arg0, const char *textbase, + const char *language, OcrEngineMode oem, + char **configs, int configs_size, + const GenericVector *vars_vec, + const GenericVector *vars_values, + bool set_only_non_debug_params, + TessdataManager *mgr) { if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs, configs_size, vars_vec, vars_values, - set_only_non_debug_params)) { + set_only_non_debug_params, mgr)) { return -1; } if (tessedit_init_config_only) { - tessdata_manager.End(); return 0; } - // If only Cube will be used, skip loading Tesseract classifier's - // pre-trained templates. - bool init_tesseract_classifier = - (tessedit_ocr_engine_mode == OEM_TESSERACT_ONLY || - tessedit_ocr_engine_mode == OEM_TESSERACT_CUBE_COMBINED); - // If only Cube will be used and if it has its own Unicharset, - // skip initializing permuter and loading Tesseract Dawgs. - bool init_dict = - !(tessedit_ocr_engine_mode == OEM_CUBE_ONLY && - tessdata_manager.SeekToStart(TESSDATA_CUBE_UNICHARSET)); - program_editup(textbase, init_tesseract_classifier, init_dict); - tessdata_manager.End(); + // If only LSTM will be used, skip loading Tesseract classifier's + // pre-trained templates and dictionary. + bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY; + program_editup(textbase, init_tesseract ? mgr : nullptr, + init_tesseract ? 
mgr : nullptr); return 0; //Normal exit } @@ -462,14 +458,14 @@ void Tesseract::SetupUniversalFontIds() { } // init the LM component -int Tesseract::init_tesseract_lm(const char *arg0, - const char *textbase, - const char *language) { +int Tesseract::init_tesseract_lm(const char *arg0, const char *textbase, + const char *language, TessdataManager *mgr) { if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY, - NULL, 0, NULL, NULL, false)) + NULL, 0, NULL, NULL, false, mgr)) return -1; - getDict().Load(Dict::GlobalDawgCache()); - tessdata_manager.End(); + getDict().SetupForLoad(Dict::GlobalDawgCache()); + getDict().Load(lang, mgr); + getDict().FinishLoad(); return 0; } diff --git a/ccmain/tesseract_cube_combiner.cpp b/ccmain/tesseract_cube_combiner.cpp deleted file mode 100644 index e17bd04c..00000000 --- a/ccmain/tesseract_cube_combiner.cpp +++ /dev/null @@ -1,306 +0,0 @@ -/********************************************************************** - * File: tesseract_cube_combiner.h - * Description: Declaration of the Tesseract & Cube results combiner Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -// The TesseractCubeCombiner class provides the functionality of combining -// the recognition results of Tesseract and Cube at the word level - -#include -#include -#include -#include - -#include "tesseract_cube_combiner.h" - -#include "cube_object.h" -#include "cube_reco_context.h" -#include "cube_utils.h" -#include "neural_net.h" -#include "tesseractclass.h" -#include "word_altlist.h" - -namespace tesseract { - -TesseractCubeCombiner::TesseractCubeCombiner(CubeRecoContext *cube_cntxt) { - cube_cntxt_ = cube_cntxt; - combiner_net_ = NULL; -} - -TesseractCubeCombiner::~TesseractCubeCombiner() { - if (combiner_net_ != NULL) { - delete combiner_net_; - combiner_net_ = NULL; - } -} - -bool TesseractCubeCombiner::LoadCombinerNet() { - ASSERT_HOST(cube_cntxt_); - // Compute the path of the combiner net - string data_path; - cube_cntxt_->GetDataFilePath(&data_path); - string net_file_name = data_path + cube_cntxt_->Lang() + - ".tesseract_cube.nn"; - - // Return false if file does not exist - FILE *fp = fopen(net_file_name.c_str(), "rb"); - if (fp == NULL) - return false; - else - fclose(fp); - - // Load and validate net - combiner_net_ = NeuralNet::FromFile(net_file_name); - if (combiner_net_ == NULL) { - tprintf("Could not read combiner net file %s", net_file_name.c_str()); - return false; - } else if (combiner_net_->out_cnt() != 2) { - tprintf("Invalid combiner net file %s! Output count != 2\n", - net_file_name.c_str()); - delete combiner_net_; - combiner_net_ = NULL; - return false; - } - return true; -} - -// Normalize a UTF-8 string. 
Converts the UTF-8 string to UTF32 and optionally -// strips punc and/or normalizes case and then converts back -string TesseractCubeCombiner::NormalizeString(const string &str, - bool remove_punc, - bool norm_case) { - // convert to UTF32 - string_32 str32; - CubeUtils::UTF8ToUTF32(str.c_str(), &str32); - // strip punc and normalize - string_32 new_str32; - for (int idx = 0; idx < str32.length(); idx++) { - // if no punc removal is required or not a punctuation character - if (!remove_punc || iswpunct(str32[idx]) == 0) { - char_32 norm_char = str32[idx]; - // normalize case if required - if (norm_case && iswalpha(norm_char)) { - norm_char = towlower(norm_char); - } - new_str32.push_back(norm_char); - } - } - // convert back to UTF8 - string new_str; - CubeUtils::UTF32ToUTF8(new_str32.c_str(), &new_str); - return new_str; -} - -// Compares 2 strings optionally ignoring punctuation -int TesseractCubeCombiner::CompareStrings(const string &str1, - const string &str2, - bool ignore_punc, - bool ignore_case) { - if (!ignore_punc && !ignore_case) { - return str1.compare(str2); - } - string norm_str1 = NormalizeString(str1, ignore_punc, ignore_case); - string norm_str2 = NormalizeString(str2, ignore_punc, ignore_case); - return norm_str1.compare(norm_str2); -} - -// Check if a string is a valid Tess dict word or not -bool TesseractCubeCombiner::ValidWord(const string &str) { - return (cube_cntxt_->TesseractObject()->getDict().valid_word(str.c_str()) - > 0); -} - -// Public method for computing the combiner features. The agreement -// output parameter will be true if both answers are identical, -// and false otherwise. 
-bool TesseractCubeCombiner::ComputeCombinerFeatures(const string &tess_str, - int tess_confidence, - CubeObject *cube_obj, - WordAltList *cube_alt_list, - vector *features, - bool *agreement) { - features->clear(); - *agreement = false; - if (cube_alt_list == NULL || cube_alt_list->AltCount() <= 0) - return false; - - // Get Cube's best string; return false if empty - char_32 *cube_best_str32 = cube_alt_list->Alt(0); - if (cube_best_str32 == NULL || CubeUtils::StrLen(cube_best_str32) < 1) - return false; - string cube_best_str; - int cube_best_cost = cube_alt_list->AltCost(0); - int cube_best_bigram_cost = 0; - bool cube_best_bigram_cost_valid = true; - if (cube_cntxt_->Bigrams()) - cube_best_bigram_cost = cube_cntxt_->Bigrams()-> - Cost(cube_best_str32, cube_cntxt_->CharacterSet()); - else - cube_best_bigram_cost_valid = false; - CubeUtils::UTF32ToUTF8(cube_best_str32, &cube_best_str); - - // Get Tesseract's UTF32 string - string_32 tess_str32; - CubeUtils::UTF8ToUTF32(tess_str.c_str(), &tess_str32); - - // Compute agreement flag - *agreement = (tess_str.compare(cube_best_str) == 0); - - // Get Cube's second best string; if empty, return false - char_32 *cube_next_best_str32; - string cube_next_best_str; - int cube_next_best_cost = WORST_COST; - if (cube_alt_list->AltCount() > 1) { - cube_next_best_str32 = cube_alt_list->Alt(1); - if (cube_next_best_str32 == NULL || - CubeUtils::StrLen(cube_next_best_str32) == 0) { - return false; - } - cube_next_best_cost = cube_alt_list->AltCost(1); - CubeUtils::UTF32ToUTF8(cube_next_best_str32, &cube_next_best_str); - } - // Rank of Tesseract's top result in Cube's alternate list - int tess_rank = 0; - for (tess_rank = 0; tess_rank < cube_alt_list->AltCount(); tess_rank++) { - string alt_str; - CubeUtils::UTF32ToUTF8(cube_alt_list->Alt(tess_rank), &alt_str); - if (alt_str == tess_str) - break; - } - - // Cube's cost for tesseract's result. 
Note that this modifies the - // state of cube_obj, including its alternate list by calling RecognizeWord() - int tess_cost = cube_obj->WordCost(tess_str.c_str()); - // Cube's bigram cost of Tesseract's string - int tess_bigram_cost = 0; - int tess_bigram_cost_valid = true; - if (cube_cntxt_->Bigrams()) - tess_bigram_cost = cube_cntxt_->Bigrams()-> - Cost(tess_str32.c_str(), cube_cntxt_->CharacterSet()); - else - tess_bigram_cost_valid = false; - - // Tesseract confidence - features->push_back(tess_confidence); - // Cube cost of Tesseract string - features->push_back(tess_cost); - // Cube Rank of Tesseract string - features->push_back(tess_rank); - // length of Tesseract OCR string - features->push_back(tess_str.length()); - // Tesseract OCR string in dictionary - features->push_back(ValidWord(tess_str)); - if (tess_bigram_cost_valid) { - // bigram cost of Tesseract string - features->push_back(tess_bigram_cost); - } - // Cube tess_cost of Cube best string - features->push_back(cube_best_cost); - // Cube tess_cost of Cube next best string - features->push_back(cube_next_best_cost); - // length of Cube string - features->push_back(cube_best_str.length()); - // Cube string in dictionary - features->push_back(ValidWord(cube_best_str)); - if (cube_best_bigram_cost_valid) { - // bigram cost of Cube string - features->push_back(cube_best_bigram_cost); - } - // case-insensitive string comparison, including punctuation - int compare_nocase_punc = CompareStrings(cube_best_str, - tess_str, false, true); - features->push_back(compare_nocase_punc == 0); - // case-sensitive string comparison, ignoring punctuation - int compare_case_nopunc = CompareStrings(cube_best_str, - tess_str, true, false); - features->push_back(compare_case_nopunc == 0); - // case-insensitive string comparison, ignoring punctuation - int compare_nocase_nopunc = CompareStrings(cube_best_str, - tess_str, true, true); - features->push_back(compare_nocase_nopunc == 0); - return true; -} - -// The CubeObject 
parameter is used for 2 purposes: 1) to retrieve -// cube's alt list, and 2) to compute cube's word cost for the -// tesseract result. The call to CubeObject::WordCost() modifies -// the object's alternate list, so previous state will be lost. -float TesseractCubeCombiner::CombineResults(WERD_RES *tess_res, - CubeObject *cube_obj) { - // If no combiner is loaded or the cube object is undefined, - // tesseract wins with probability 1.0 - if (combiner_net_ == NULL || cube_obj == NULL) { - tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): " - "Cube objects not initialized; defaulting to Tesseract\n"); - return 1.0; - } - - // Retrieve the alternate list from the CubeObject's current state. - // If the alt list empty, tesseract wins with probability 1.0 - WordAltList *cube_alt_list = cube_obj->AlternateList(); - if (cube_alt_list == NULL) - cube_alt_list = cube_obj->RecognizeWord(); - if (cube_alt_list == NULL || cube_alt_list->AltCount() <= 0) { - tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): " - "Cube returned no results; defaulting to Tesseract\n"); - return 1.0; - } - return CombineResults(tess_res, cube_obj, cube_alt_list); -} - -// The alt_list parameter is expected to have been extracted from the -// CubeObject that recognized the word to be combined. The cube_obj -// parameter passed may be either same instance or a separate instance to -// be used only by the combiner. In both cases, its alternate -// list will be modified by an internal call to RecognizeWord(). 
-float TesseractCubeCombiner::CombineResults(WERD_RES *tess_res, - CubeObject *cube_obj, - WordAltList *cube_alt_list) { - // If no combiner is loaded or the cube object is undefined, or the - // alt list is empty, tesseract wins with probability 1.0 - if (combiner_net_ == NULL || cube_obj == NULL || - cube_alt_list == NULL || cube_alt_list->AltCount() <= 0) { - tprintf("Cube WARNING (TesseractCubeCombiner::CombineResults): " - "Cube result cannot be retrieved; defaulting to Tesseract\n"); - return 1.0; - } - - // Tesseract result string, tesseract confidence, and cost of - // tesseract result according to cube - string tess_str = tess_res->best_choice->unichar_string().string(); - // Map certainty [-20.0, 0.0] to confidence [0, 100] - int tess_confidence = MIN(100, MAX(1, static_cast( - 100 + (5 * tess_res->best_choice->certainty())))); - - // Compute the combiner features. If feature computation fails or - // answers are identical, tesseract wins with probability 1.0 - vector features; - bool agreement; - bool combiner_success = ComputeCombinerFeatures(tess_str, tess_confidence, - cube_obj, cube_alt_list, - &features, &agreement); - if (!combiner_success || agreement) - return 1.0; - - // Classify combiner feature vector and return output (probability - // of tesseract class). - double net_out[2]; - if (!combiner_net_->FeedForward(&features[0], net_out)) - return 1.0; - return net_out[1]; -} -} diff --git a/ccmain/tesseract_cube_combiner.h b/ccmain/tesseract_cube_combiner.h deleted file mode 100644 index 49a0e2f4..00000000 --- a/ccmain/tesseract_cube_combiner.h +++ /dev/null @@ -1,103 +0,0 @@ -/********************************************************************** - * File: tesseract_cube_combiner.h - * Description: Declaration of the Tesseract & Cube results combiner Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. 
- ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The TesseractCubeCombiner class provides the functionality of combining -// the recognition results of Tesseract and Cube at the word level - -#ifndef TESSERACT_CCMAIN_TESSERACT_CUBE_COMBINER_H -#define TESSERACT_CCMAIN_TESSERACT_CUBE_COMBINER_H - -#include -#include -#include "pageres.h" - -#ifdef _WIN32 -#include -using namespace std; -#endif - -#ifdef USE_STD_NAMESPACE -using std::string; -using std::vector; -#endif - -namespace tesseract { - -class CubeObject; -class NeuralNet; -class CubeRecoContext; -class WordAltList; - -class TesseractCubeCombiner { - public: - explicit TesseractCubeCombiner(CubeRecoContext *cube_cntxt); - virtual ~TesseractCubeCombiner(); - - // There are 2 public methods for combining the results of tesseract - // and cube. Both return the probability that the Tesseract result is - // correct. The difference between the two interfaces is in how the - // passed-in CubeObject is used. - - // The CubeObject parameter is used for 2 purposes: 1) to retrieve - // cube's alt list, and 2) to compute cube's word cost for the - // tesseract result. Both uses may modify the state of the - // CubeObject (including the BeamSearch state) with a call to - // RecognizeWord(). 
- float CombineResults(WERD_RES *tess_res, CubeObject *cube_obj); - - // The alt_list parameter is expected to have been extracted from the - // CubeObject that recognized the word to be combined. The cube_obj - // parameter passed in is a separate instance to be used only by - // the combiner. - float CombineResults(WERD_RES *tess_res, CubeObject *cube_obj, - WordAltList *alt_list); - - // Public method for computing the combiner features. The agreement - // output parameter will be true if both answers are identical, - // false otherwise. Modifies the cube_alt_list, so no assumptions - // should be made about its state upon return. - bool ComputeCombinerFeatures(const string &tess_res, - int tess_confidence, - CubeObject *cube_obj, - WordAltList *cube_alt_list, - vector *features, - bool *agreement); - - // Is the word valid according to Tesseract's language model - bool ValidWord(const string &str); - - // Loads the combiner neural network from file, using cube_cntxt_ - // to find path. - bool LoadCombinerNet(); - private: - // Normalize a UTF-8 string. 
Converts the UTF-8 string to UTF32 and optionally - // strips punc and/or normalizes case and then converts back - string NormalizeString(const string &str, bool remove_punc, bool norm_case); - - // Compares 2 strings after optionally normalizing them and or stripping - // punctuation - int CompareStrings(const string &str1, const string &str2, bool ignore_punc, - bool norm_case); - - NeuralNet *combiner_net_; // pointer to the combiner NeuralNet object - CubeRecoContext *cube_cntxt_; // used for language ID and data paths -}; -} - -#endif // TESSERACT_CCMAIN_TESSERACT_CUBE_COMBINER_H diff --git a/ccmain/tesseractclass.cpp b/ccmain/tesseractclass.cpp index 8db50fbd..2089fce8 100644 --- a/ccmain/tesseractclass.cpp +++ b/ccmain/tesseractclass.cpp @@ -42,14 +42,11 @@ #include "tesseractclass.h" #include "allheaders.h" -#ifndef NO_CUBE_BUILD -#include "cube_reco_context.h" -#endif #include "edgblob.h" #include "equationdetect.h" #include "globals.h" -#ifndef NO_CUBE_BUILD -#include "tesseract_cube_combiner.h" +#ifndef ANDROID_BUILD +#include "lstmrecognizer.h" #endif namespace tesseract { @@ -65,6 +62,9 @@ Tesseract::Tesseract() "Generate training data from boxed chars", this->params()), BOOL_MEMBER(tessedit_make_boxes_from_boxes, false, "Generate more boxes from boxed chars", this->params()), + BOOL_MEMBER(tessedit_train_line_recognizer, false, + "Break input into lines and remap boxes if present", + this->params()), BOOL_MEMBER(tessedit_dump_pageseg_images, false, "Dump intermediate images made during page segmentation", this->params()), @@ -76,11 +76,10 @@ Tesseract::Tesseract() " 5=line, 6=word, 7=char" " (Values from PageSegMode enum in publictypes.h)", this->params()), - INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY, - "Which OCR engine(s) to run (Tesseract, Cube, both)." - " Defaults to loading and running only Tesseract" - " (no Cube,no combiner)." 
- " Values from OcrEngineMode enum in tesseractclass.h)", + INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT, + "Which OCR engine(s) to run (Tesseract, LSTM, both)." + " Defaults to loading and running the most accurate" + " available.", this->params()), STRING_MEMBER(tessedit_char_blacklist, "", "Blacklist of chars not to recognize", this->params()), @@ -215,13 +214,16 @@ Tesseract::Tesseract() BOOL_MEMBER(test_pt, false, "Test for point", this->params()), double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()), double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()), + INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.", + this->params()), INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.", this->params()), BOOL_MEMBER(paragraph_text_based, true, "Run paragraph detection on the post-text-recognition " "(more accurate)", this->params()), - INT_MEMBER(cube_debug_level, 0, "Print cube debug info.", this->params()), + BOOL_MEMBER(lstm_use_matrix, 1, + "Use ratings matrix/beam search with lstm", this->params()), STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines", this->params()), STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines", @@ -265,7 +267,7 @@ Tesseract::Tesseract() this->params()), BOOL_MEMBER(tessedit_debug_quality_metrics, false, "Output data to debug file", this->params()), - BOOL_MEMBER(bland_unrej, false, "unrej potential with no chekcs", + BOOL_MEMBER(bland_unrej, false, "unrej potential with no checks", this->params()), double_MEMBER(quality_rowrej_pc, 1.1, "good_quality_doc gte good char limit", this->params()), @@ -389,6 +391,9 @@ Tesseract::Tesseract() this->params()), BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file", this->params()), + BOOL_MEMBER(textonly_pdf, false, + "Create PDF with only one invisible text layer", + this->params()), STRING_MEMBER(unrecognised_char, "|", "Output char for unidentified blobs", this->params()), 
INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()), @@ -398,8 +403,8 @@ Tesseract::Tesseract() "Don't suspect dict wds longer than this", this->params()), BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected", this->params()), - double_MEMBER(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit", - this->params()), + double_MEMBER(suspect_rating_per_ch, 999.9, + "Don't touch bad rating limit", this->params()), double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit", this->params()), BOOL_MEMBER(tessedit_minimal_rejection, false, @@ -452,7 +457,7 @@ Tesseract::Tesseract() this->params()), INT_MEMBER(tessedit_page_number, -1, "-1 -> All pages" - " , else specifc page to process", + " , else specific page to process", this->params()), BOOL_MEMBER(tessedit_write_images, false, "Capture the image from the IPE", this->params()), @@ -461,10 +466,6 @@ Tesseract::Tesseract() STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()), BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word", this->params()), - INT_MEMBER(tessdata_manager_debug_level, 0, - "Debug level for" - " TessdataManager functions.", - this->params()), STRING_MEMBER(tessedit_load_sublangs, "", "List of languages to load with this one", this->params()), BOOL_MEMBER(tessedit_use_primary_params_model, false, @@ -512,7 +513,6 @@ Tesseract::Tesseract() "Page separator (default is form feed control character)", this->params()), - // The following parameters were deprecated and removed from their // original // locations. 
The parameters are temporarily kept here to give Tesseract @@ -604,8 +604,8 @@ Tesseract::Tesseract() backup_config_file_(NULL), pix_binary_(NULL), - cube_binary_(NULL), pix_grey_(NULL), + pix_original_(NULL), pix_thresholds_(NULL), source_resolution_(0), textord_(this), @@ -616,33 +616,28 @@ Tesseract::Tesseract() reskew_(1.0f, 0.0f), most_recently_used_(this), font_table_size_(0), -#ifndef NO_CUBE_BUILD - cube_cntxt_(NULL), - tess_cube_combiner_(NULL), + equ_detect_(NULL), +#ifndef ANDROID_BUILD + lstm_recognizer_(NULL), #endif - equ_detect_(NULL) { + train_line_page_num_(0) { } Tesseract::~Tesseract() { Clear(); + pixDestroy(&pix_original_); end_tesseract(); sub_langs_.delete_data_pointers(); -#ifndef NO_CUBE_BUILD - // Delete cube objects. - if (cube_cntxt_ != NULL) { - delete cube_cntxt_; - cube_cntxt_ = NULL; - } - if (tess_cube_combiner_ != NULL) { - delete tess_cube_combiner_; - tess_cube_combiner_ = NULL; - } +#ifndef ANDROID_BUILD + delete lstm_recognizer_; + lstm_recognizer_ = NULL; #endif } void Tesseract::Clear() { + STRING debug_name = imagebasename + "_debug.pdf"; + pixa_debug_.WritePDF(debug_name.string()); pixDestroy(&pix_binary_); - pixDestroy(&cube_binary_); pixDestroy(&pix_grey_); pixDestroy(&pix_thresholds_); pixDestroy(&scaled_color_); @@ -692,8 +687,6 @@ void Tesseract::SetBlackAndWhitelist() { // page segmentation. void Tesseract::PrepareForPageseg() { textord_.set_use_cjk_fp_model(textord_use_cjk_fp_model); - pixDestroy(&cube_binary_); - cube_binary_ = pixClone(pix_binary()); // Find the max splitter strategy over all langs. ShiroRekhaSplitter::SplitStrategy max_pageseg_strategy = static_cast( @@ -704,9 +697,6 @@ void Tesseract::PrepareForPageseg() { static_cast(sub_langs_[i]->pageseg_devanagari_split_strategy)); if (pageseg_strategy > max_pageseg_strategy) max_pageseg_strategy = pageseg_strategy; - // Clone the cube image to all the sub langs too. 
- pixDestroy(&sub_langs_[i]->cube_binary_); - sub_langs_[i]->cube_binary_ = pixClone(pix_binary()); pixDestroy(&sub_langs_[i]->pix_binary_); sub_langs_[i]->pix_binary_ = pixClone(pix_binary()); } @@ -714,7 +704,7 @@ void Tesseract::PrepareForPageseg() { // the newly splitted image. splitter_.set_orig_pix(pix_binary()); splitter_.set_pageseg_split_strategy(max_pageseg_strategy); - if (splitter_.Split(true)) { + if (splitter_.Split(true, &pixa_debug_)) { ASSERT_HOST(splitter_.splitted_image()); pixDestroy(&pix_binary_); pix_binary_ = pixClone(splitter_.splitted_image()); @@ -743,7 +733,7 @@ void Tesseract::PrepareForTessOCR(BLOCK_LIST* block_list, splitter_.set_segmentation_block_list(block_list); splitter_.set_ocr_split_strategy(max_ocr_strategy); // Run the splitter for OCR - bool split_for_ocr = splitter_.Split(false); + bool split_for_ocr = splitter_.Split(false, &pixa_debug_); // Restore pix_binary to the binarized original pix for future reference. ASSERT_HOST(splitter_.orig_pix()); pixDestroy(&pix_binary_); diff --git a/ccmain/tesseractclass.h b/ccmain/tesseractclass.h index 91d25bc8..0ac59783 100644 --- a/ccmain/tesseractclass.h +++ b/ccmain/tesseractclass.h @@ -23,22 +23,22 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H__ -#define TESSERACT_CCMAIN_TESSERACTCLASS_H__ +#ifndef TESSERACT_CCMAIN_TESSERACTCLASS_H_ +#define TESSERACT_CCMAIN_TESSERACTCLASS_H_ #include "allheaders.h" #include "control.h" -#include "docqual.h" +#include "debugpixa.h" #include "devanagari_processing.h" +#include "docqual.h" #include "genericvector.h" -#include "params.h" #include "ocrclass.h" +#include "params.h" #include "textord.h" #include "wordrec.h" class BLOB_CHOICE_LIST_CLIST; class BLOCK_LIST; -class CharSamp; struct OSResults; class PAGE_RES; class PAGE_RES_IT; @@ -77,8 +77,7 @@ class WERD_RES; // WordRec (wordrec/wordrec.h) // ^ Members include: WERD*, DENORM* // Tesseract (ccmain/tesseractclass.h) -// 
Members include: Pix*, CubeRecoContext*, -// TesseractCubeCombiner* +// Members include: Pix* // // Other important classes: // @@ -97,16 +96,11 @@ class WERD_RES; namespace tesseract { class ColumnFinder; -#ifndef NO_CUBE_BUILD -class CubeLineObject; -class CubeObject; -class CubeRecoContext; -#endif +class DocumentData; class EquationDetect; +class ImageData; +class LSTMRecognizer; class Tesseract; -#ifndef NO_CUBE_BUILD -class TesseractCubeCombiner; -#endif // A collection of various variables for statistics and debugging. struct TesseractStats { @@ -189,7 +183,7 @@ class Tesseract : public Wordrec { } // Destroy any existing pix and return a pointer to the pointer. Pix** mutable_pix_binary() { - Clear(); + pixDestroy(&pix_binary_); return &pix_binary_; } Pix* pix_binary() const { @@ -202,16 +196,24 @@ class Tesseract : public Wordrec { pixDestroy(&pix_grey_); pix_grey_ = grey_pix; } - // Returns a pointer to a Pix representing the best available image of the - // page. The image will be 8-bit grey if the input was grey or color. Note - // that in grey 0 is black and 255 is white. If the input was binary, then - // the returned Pix will be binary. Note that here black is 1 and white is 0. - // To tell the difference pixGetDepth() will return 8 or 1. - // In either case, the return value is a borrowed Pix, and should not be - // deleted or pixDestroyed. - Pix* BestPix() const { - return pix_grey_ != NULL ? pix_grey_ : pix_binary_; + Pix* pix_original() const { return pix_original_; } + // Takes ownership of the given original_pix. + void set_pix_original(Pix* original_pix) { + pixDestroy(&pix_original_); + pix_original_ = original_pix; + // Clone to sublangs as well. + for (int i = 0; i < sub_langs_.size(); ++i) + sub_langs_[i]->set_pix_original(original_pix ? pixClone(original_pix) + : nullptr); } + // Returns a pointer to a Pix representing the best available (original) image + // of the page. 
Can be of any bit depth, but never color-mapped, as that has + // always been dealt with. Note that in grey and color, 0 is black and 255 is + // white. If the input was binary, then black is 1 and white is 0. + // To tell the difference pixGetDepth() will return 32, 8 or 1. + // In any case, the return value is a borrowed Pix, and should not be + // deleted or pixDestroyed. + Pix* BestPix() const { return pix_original_; } void set_pix_thresholds(Pix* thresholds) { pixDestroy(&pix_thresholds_); pix_thresholds_ = thresholds; @@ -254,11 +256,19 @@ class Tesseract : public Wordrec { Tesseract* get_sub_lang(int index) const { return sub_langs_[index]; } - // Returns true if any language uses Tesseract (as opposed to cube). + // Returns true if any language uses Tesseract (as opposed to LSTM). bool AnyTessLang() const { - if (tessedit_ocr_engine_mode != OEM_CUBE_ONLY) return true; + if (tessedit_ocr_engine_mode != OEM_LSTM_ONLY) return true; for (int i = 0; i < sub_langs_.size(); ++i) { - if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_CUBE_ONLY) + if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) return true; + } + return false; + } + // Returns true if any language uses the LSTM. + bool AnyLSTMLang() const { + if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) return true; + for (int i = 0; i < sub_langs_.size(); ++i) { + if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) return true; } return false; @@ -293,6 +303,46 @@ class Tesseract : public Wordrec { // par_control.cpp void PrerecAllWordsPar(const GenericVector& words); + //// linerec.cpp + // Generates training data for training a line recognizer, eg LSTM. + // Breaks the page into lines, according to the boxes, and writes them to a + // serialized DocumentData based on output_basename. + void TrainLineRecognizer(const STRING& input_imagename, + const STRING& output_basename, + BLOCK_LIST *block_list); + // Generates training data for training a line recognizer, eg LSTM. 
+ // Breaks the boxes into lines, normalizes them, converts to ImageData and + // appends them to the given training_data. + void TrainFromBoxes(const GenericVector& boxes, + const GenericVector& texts, + BLOCK_LIST *block_list, + DocumentData* training_data); + + // Returns an Imagedata containing the image of the given textline, + // and ground truth boxes/truth text if available in the input. + // The image is not normalized in any way. + ImageData* GetLineData(const TBOX& line_box, + const GenericVector& boxes, + const GenericVector& texts, + int start_box, int end_box, + const BLOCK& block); + // Helper gets the image of a rectangle, using the block.re_rotation() if + // needed to get to the image, and rotating the result back to horizontal + // layout. (CJK characters will be on their left sides) The vertical text flag + // is set in the returned ImageData if the text was originally vertical, which + // can be used to invoke a different CJK recognition engine. The revised_box + // is also returned to enable calculation of output bounding boxes. + ImageData* GetRectImage(const TBOX& box, const BLOCK& block, int padding, + TBOX* revised_box) const; + // Recognizes a word or group of words, converting to WERD_RES in *words. + // Analogous to classify_word_pass1, but can handle a group of words as well. + void LSTMRecognizeWord(const BLOCK& block, ROW *row, WERD_RES *word, + PointerVector* words); + // Apply segmentation search to the given set of words, within the constraints + // of the existing ratings matrix. If there is already a best_choice on a word + // leaves it untouched and just sets the done/accepted etc flags. 
+ void SearchWords(PointerVector* words); + //// control.h ///////////////////////////////////////////////////////// bool ProcessTargetWord(const TBOX& word_box, const TBOX& target_word_box, const char* word_config, int pass); @@ -324,9 +374,8 @@ class Tesseract : public Wordrec { // Helper to recognize the word using the given (language-specific) tesseract. // Returns positive if this recognizer found more new best words than the // number kept from best_words. - int RetryWithLanguage(const WordData& word_data, - WordRecognizer recognizer, - WERD_RES** in_word, + int RetryWithLanguage(const WordData& word_data, WordRecognizer recognizer, + bool debug, WERD_RES** in_word, PointerVector* best_words); // Moves good-looking "noise"/diacritics from the reject list to the main // blob list on the current word. Returns true if anything was done, and @@ -428,34 +477,6 @@ class Tesseract : public Wordrec { int *left_ok, int *right_ok) const; - //// cube_control.cpp /////////////////////////////////////////////////// -#ifndef NO_CUBE_BUILD - bool init_cube_objects(bool load_combiner, - TessdataManager *tessdata_manager); - // Iterates through tesseract's results and calls cube on each word, - // combining the results with the existing tesseract result. - void run_cube_combiner(PAGE_RES *page_res); - // Recognizes a single word using (only) cube. Compatible with - // Tesseract's classify_word_pass1/classify_word_pass2. - void cube_word_pass1(BLOCK* block, ROW *row, WERD_RES *word); - // Cube recognizer to recognize a single word as with classify_word_pass1 - // but also returns the cube object in case the combiner is needed. - CubeObject* cube_recognize_word(BLOCK* block, WERD_RES* word); - // Combines the cube and tesseract results for a single word, leaving the - // result in tess_word. - void cube_combine_word(CubeObject* cube_obj, WERD_RES* cube_word, - WERD_RES* tess_word); - // Call cube on the current word, and write the result to word. 
- // Sets up a fake result and returns false if something goes wrong. - bool cube_recognize(CubeObject *cube_obj, BLOCK* block, WERD_RES *word); - void fill_werd_res(const BoxWord& cube_box_word, - const char* cube_best_str, - WERD_RES* tess_werd_res); - bool extract_cube_state(CubeObject* cube_obj, int* num_chars, - Boxa** char_boxes, CharSamp*** char_samples); - bool create_cube_box_word(Boxa *char_boxes, int num_chars, - TBOX word_box, BoxWord* box_word); -#endif //// output.h ////////////////////////////////////////////////////////// void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box); @@ -475,20 +496,17 @@ class Tesseract : public Wordrec { // string and recursively any additional languages required by any language // traineddata file (via tessedit_load_sublangs in its config) that is loaded. // See init_tesseract_internal for args. - int init_tesseract(const char *arg0, - const char *textbase, - const char *language, - OcrEngineMode oem, - char **configs, - int configs_size, - const GenericVector *vars_vec, - const GenericVector *vars_values, - bool set_only_init_params); + int init_tesseract(const char* arg0, const char* textbase, + const char* language, OcrEngineMode oem, char** configs, + int configs_size, const GenericVector* vars_vec, + const GenericVector* vars_values, + bool set_only_init_params, TessdataManager* mgr); int init_tesseract(const char *datapath, const char *language, OcrEngineMode oem) { - return init_tesseract(datapath, NULL, language, oem, - NULL, 0, NULL, NULL, false); + TessdataManager mgr; + return init_tesseract(datapath, NULL, language, oem, NULL, 0, NULL, NULL, + false, &mgr); } // Common initialization for a single language. // arg0 is the datapath for the tessdata directory, which could be the @@ -506,36 +524,30 @@ class Tesseract : public Wordrec { // in vars_vec. // If set_only_init_params is true, then only the initialization variables // will be set. 
- int init_tesseract_internal(const char *arg0, - const char *textbase, - const char *language, - OcrEngineMode oem, - char **configs, - int configs_size, - const GenericVector *vars_vec, - const GenericVector *vars_values, - bool set_only_init_params); + int init_tesseract_internal(const char* arg0, const char* textbase, + const char* language, OcrEngineMode oem, + char** configs, int configs_size, + const GenericVector* vars_vec, + const GenericVector* vars_values, + bool set_only_init_params, TessdataManager* mgr); // Set the universal_id member of each font to be unique among all // instances of the same font loaded. void SetupUniversalFontIds(); - int init_tesseract_lm(const char *arg0, - const char *textbase, - const char *language); + int init_tesseract_lm(const char* arg0, const char* textbase, + const char* language, TessdataManager* mgr); void recognize_page(STRING& image_name); void end_tesseract(); - bool init_tesseract_lang_data(const char *arg0, - const char *textbase, - const char *language, - OcrEngineMode oem, - char **configs, - int configs_size, - const GenericVector *vars_vec, - const GenericVector *vars_values, - bool set_only_init_params); + bool init_tesseract_lang_data(const char* arg0, const char* textbase, + const char* language, OcrEngineMode oem, + char** configs, int configs_size, + const GenericVector* vars_vec, + const GenericVector* vars_values, + bool set_only_init_params, + TessdataManager* mgr); void ParseLanguageString(const char* lang_str, GenericVector* to_load, @@ -783,16 +795,17 @@ class Tesseract : public Wordrec { "Generate training data from boxed chars"); BOOL_VAR_H(tessedit_make_boxes_from_boxes, false, "Generate more boxes from boxed chars"); + BOOL_VAR_H(tessedit_train_line_recognizer, false, + "Break input into lines and remap boxes if present"); BOOL_VAR_H(tessedit_dump_pageseg_images, false, "Dump intermediate images made during page segmentation"); INT_VAR_H(tessedit_pageseg_mode, PSM_SINGLE_BLOCK, "Page seg mode: 
0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," " 5=line, 6=word, 7=char" " (Values from PageSegMode enum in publictypes.h)"); - INT_VAR_H(tessedit_ocr_engine_mode, tesseract::OEM_TESSERACT_ONLY, - "Which OCR engine(s) to run (Tesseract, Cube, both). Defaults" - " to loading and running only Tesseract (no Cube, no combiner)." - " (Values from OcrEngineMode enum in tesseractclass.h)"); + INT_VAR_H(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT, + "Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults" + " to loading and running the most accurate available."); STRING_VAR_H(tessedit_char_blacklist, "", "Blacklist of chars not to recognize"); STRING_VAR_H(tessedit_char_whitelist, "", @@ -886,11 +899,12 @@ class Tesseract : public Wordrec { BOOL_VAR_H(test_pt, false, "Test for point"); double_VAR_H(test_pt_x, 99999.99, "xcoord"); double_VAR_H(test_pt_y, 99999.99, "ycoord"); + INT_VAR_H(multilang_debug_level, 0, "Print multilang debug info."); INT_VAR_H(paragraph_debug_level, 0, "Print paragraph debug info."); BOOL_VAR_H(paragraph_text_based, true, "Run paragraph detection on the post-text-recognition " "(more accurate)"); - INT_VAR_H(cube_debug_level, 1, "Print cube debug info."); + BOOL_VAR_H(lstm_use_matrix, 1, "Use ratings matrix/beam searct with lstm"); STRING_VAR_H(outlines_odd, "%| ", "Non standard number of outlines"); STRING_VAR_H(outlines_2, "ij!?%\":;", "Non standard number of outlines"); BOOL_VAR_H(docqual_excuse_outline_errs, false, @@ -926,7 +940,7 @@ class Tesseract : public Wordrec { BOOL_VAR_H(tessedit_debug_doc_rejection, false, "Page stats"); BOOL_VAR_H(tessedit_debug_quality_metrics, false, "Output data to debug file"); - BOOL_VAR_H(bland_unrej, false, "unrej potential with no chekcs"); + BOOL_VAR_H(bland_unrej, false, "unrej potential with no checks"); double_VAR_H(quality_rowrej_pc, 1.1, "good_quality_doc gte good char limit"); BOOL_VAR_H(unlv_tilde_crunching, true, @@ -1005,13 +1019,14 @@ class Tesseract : public Wordrec { 
BOOL_VAR_H(tessedit_create_hocr, false, "Write .html hOCR output file"); BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file"); BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file"); + BOOL_VAR_H(textonly_pdf, false, + "Create PDF with only one invisible text layer"); STRING_VAR_H(unrecognised_char, "|", "Output char for unidentified blobs"); INT_VAR_H(suspect_level, 99, "Suspect marker level"); INT_VAR_H(suspect_space_level, 100, "Min suspect level for rejecting spaces"); - INT_VAR_H(suspect_short_words, 2, - "Don't Suspect dict wds longer than this"); + INT_VAR_H(suspect_short_words, 2, "Don't Suspect dict wds longer than this"); BOOL_VAR_H(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected"); double_VAR_H(suspect_rating_per_ch, 999.9, "Don't touch bad rating limit"); double_VAR_H(suspect_accept_rating, -999.9, "Accept good rating limit"); @@ -1045,13 +1060,11 @@ class Tesseract : public Wordrec { INT_VAR_H(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this"); BOOL_VAR_H(tessedit_create_boxfile, false, "Output text with boxes"); INT_VAR_H(tessedit_page_number, -1, - "-1 -> All pages, else specifc page to process"); + "-1 -> All pages, else specific page to process"); BOOL_VAR_H(tessedit_write_images, false, "Capture the image from the IPE"); BOOL_VAR_H(interactive_display_mode, false, "Run interactively?"); STRING_VAR_H(file_type, ".tif", "Filename extension"); BOOL_VAR_H(tessedit_override_permuter, true, "According to dict_word"); - INT_VAR_H(tessdata_manager_debug_level, 0, - "Debug level for TessdataManager functions."); STRING_VAR_H(tessedit_load_sublangs, "", "List of languages to load with this one"); BOOL_VAR_H(tessedit_use_primary_params_model, false, @@ -1157,10 +1170,6 @@ class Tesseract : public Wordrec { PAGE_RES_IT* pr_it, FILE *output_file); -#ifndef NO_CUBE_BUILD - inline CubeRecoContext *GetCubeRecoContext() { return cube_cntxt_; } -#endif - private: // The filename of a backup config file. 
If not null, then we currently // have a temporary debug config file loaded, and backup_config_file_ @@ -1171,12 +1180,14 @@ class Tesseract : public Wordrec { // Image used for input to layout analysis and tesseract recognition. // May be modified by the ShiroRekhaSplitter to eliminate the top-line. Pix* pix_binary_; - // Unmodified image used for input to cube. Always valid. - Pix* cube_binary_; // Grey-level input image if the input was not binary, otherwise NULL. Pix* pix_grey_; + // Original input image. Color if the input was color. + Pix* pix_original_; // Thresholds that were used to generate the thresholded image from grey. Pix* pix_thresholds_; + // Debug images. If non-empty, will be written on destruction. + DebugPixa pixa_debug_; // Input image resolution after any scaling. The resolution is not well // transmitted by operations on Pix, so we keep an independent record here. int source_resolution_; @@ -1199,16 +1210,14 @@ class Tesseract : public Wordrec { Tesseract* most_recently_used_; // The size of the font table, ie max possible font id + 1. int font_table_size_; -#ifndef NO_CUBE_BUILD - // Cube objects. - CubeRecoContext* cube_cntxt_; - TesseractCubeCombiner *tess_cube_combiner_; -#endif // Equation detector. Note: this pointer is NOT owned by the class. EquationDetect* equ_detect_; + // LSTM recognizer, if available. + LSTMRecognizer* lstm_recognizer_; + // Output "page" number (actually line number) using TrainLineRecognizer. 
+ int train_line_page_num_; }; } // namespace tesseract - -#endif // TESSERACT_CCMAIN_TESSERACTCLASS_H__ +#endif // TESSERACT_CCMAIN_TESSERACTCLASS_H_ diff --git a/ccmain/thresholder.cpp b/ccmain/thresholder.cpp index df6abd01..77069bc9 100644 --- a/ccmain/thresholder.cpp +++ b/ccmain/thresholder.cpp @@ -152,19 +152,27 @@ void ImageThresholder::SetImage(const Pix* pix) { int depth; pixGetDimensions(src, &image_width_, &image_height_, &depth); // Convert the image as necessary so it is one of binary, plain RGB, or - // 8 bit with no colormap. - if (depth > 1 && depth < 8) { + // 8 bit with no colormap. Guarantee that we always end up with our own copy, + // not just a clone of the input. + if (pixGetColormap(src)) { + Pix* tmp = pixRemoveColormap(src, REMOVE_CMAP_BASED_ON_SRC); + depth = pixGetDepth(tmp); + if (depth > 1 && depth < 8) { + pix_ = pixConvertTo8(tmp, false); + pixDestroy(&tmp); + } else { + pix_ = tmp; + } + } else if (depth > 1 && depth < 8) { pix_ = pixConvertTo8(src, false); - } else if (pixGetColormap(src)) { - pix_ = pixRemoveColormap(src, REMOVE_CMAP_BASED_ON_SRC); } else { - pix_ = pixClone(src); + pix_ = pixCopy(NULL, src); } depth = pixGetDepth(pix_); pix_channels_ = depth / 8; pix_wpl_ = pixGetWpl(pix_); scale_ = 1; - estimated_res_ = yres_ = pixGetYRes(src); + estimated_res_ = yres_ = pixGetYRes(pix_); Init(); } @@ -173,8 +181,11 @@ void ImageThresholder::SetImage(const Pix* pix) { // Caller must use pixDestroy to free the created Pix. void ImageThresholder::ThresholdToPix(PageSegMode pageseg_mode, Pix** pix) { if (pix_channels_ == 0) { - // We have a binary image, so it just has to be cloned. - *pix = GetPixRect(); + // We have a binary image, but it still has to be copied, as this API + // allows the caller to modify the output. 
+ Pix* original = GetPixRect(); + *pix = pixCopy(nullptr, original); + pixDestroy(&original); } else { OtsuThresholdRectToPix(pix_, pix); } @@ -257,10 +268,10 @@ void ImageThresholder::OtsuThresholdRectToPix(Pix* src_pix, OpenclDevice od; if ((num_channels == 4 || num_channels == 1) && od.selectedDeviceIsOpenCL() && rect_top_ == 0 && rect_left_ == 0 ) { - od.ThresholdRectToPixOCL((const unsigned char*)pixGetData(src_pix), - num_channels, pixGetWpl(src_pix) * 4, - thresholds, hi_values, out_pix /*pix_OCL*/, - rect_height_, rect_width_, rect_top_, rect_left_); + od.ThresholdRectToPixOCL((unsigned char*)pixGetData(src_pix), num_channels, + pixGetWpl(src_pix) * 4, thresholds, hi_values, + out_pix /*pix_OCL*/, rect_height_, rect_width_, + rect_top_, rect_left_); } else { #endif ThresholdRectToPix(src_pix, num_channels, thresholds, hi_values, out_pix); diff --git a/ccmain/thresholder.h b/ccmain/thresholder.h index 92e2de5f..0ff8af22 100644 --- a/ccmain/thresholder.h +++ b/ccmain/thresholder.h @@ -17,8 +17,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CCMAIN_THRESHOLDER_H__ -#define TESSERACT_CCMAIN_THRESHOLDER_H__ +#ifndef TESSERACT_CCMAIN_THRESHOLDER_H_ +#define TESSERACT_CCMAIN_THRESHOLDER_H_ #include "platform.h" #include "publictypes.h" @@ -186,4 +186,4 @@ class TESS_API ImageThresholder { } // namespace tesseract. 
-#endif // TESSERACT_CCMAIN_THRESHOLDER_H__ +#endif // TESSERACT_CCMAIN_THRESHOLDER_H_ diff --git a/ccstruct/Makefile.am b/ccstruct/Makefile.am index 8a26a684..2f9dc036 100644 --- a/ccstruct/Makefile.am +++ b/ccstruct/Makefile.am @@ -12,7 +12,7 @@ endif include_HEADERS = publictypes.h noinst_HEADERS = \ blamer.h blckerr.h blobbox.h blobs.h blread.h boxread.h boxword.h ccstruct.h coutln.h crakedge.h \ - detlinefit.h dppoint.h fontinfo.h genblob.h hpdsizes.h \ + debugpixa.h detlinefit.h dppoint.h fontinfo.h genblob.h hpdsizes.h \ imagedata.h \ ipoints.h \ linlsq.h matrix.h mod128.h normalis.h \ diff --git a/ccstruct/blamer.cpp b/ccstruct/blamer.cpp index 5d2837d0..4573e9b3 100644 --- a/ccstruct/blamer.cpp +++ b/ccstruct/blamer.cpp @@ -317,7 +317,7 @@ void BlamerBundle::SetChopperBlame(const WERD_RES* word, bool debug) { int num_blobs = word->chopped_word->blobs.size(); int box_index = 0; int blob_index = 0; - inT16 truth_x; + inT16 truth_x = -1; while (box_index < truth_word_.length() && blob_index < num_blobs) { truth_x = norm_truth_word_.BlobBox(box_index).right(); TBLOB * curr_blob = word->chopped_word->blobs[blob_index]; diff --git a/ccstruct/blobbox.cpp b/ccstruct/blobbox.cpp index 280096b5..3ffb9dc9 100644 --- a/ccstruct/blobbox.cpp +++ b/ccstruct/blobbox.cpp @@ -1,8 +1,8 @@ /********************************************************************** * File: blobbox.cpp (Formerly blobnbox.c) * Description: Code for the textord blob class. - * Author: Ray Smith - * Created: Thu Jul 30 09:08:51 BST 1992 + * Author: Ray Smith + * Created: Thu Jul 30 09:08:51 BST 1992 * * (C) Copyright 1992, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); @@ -31,7 +31,9 @@ #define PROJECTION_MARGIN 10 //arbitrary #define EXTERN -ELISTIZE (BLOBNBOX) ELIST2IZE (TO_ROW) ELISTIZE (TO_BLOCK) +ELISTIZE(BLOBNBOX) +ELIST2IZE(TO_ROW) +ELISTIZE(TO_BLOCK) // Up to 30 degrees is allowed for rotations of diacritic blobs. 
const double kCosSmallAngle = 0.866; @@ -176,7 +178,7 @@ void BLOBNBOX::NeighbourGaps(int gaps[BND_COUNT]) const { gaps[dir] = MAX_INT16; BLOBNBOX* neighbour = neighbours_[dir]; if (neighbour != NULL) { - TBOX n_box = neighbour->bounding_box(); + const TBOX& n_box = neighbour->bounding_box(); if (dir == BND_LEFT || dir == BND_RIGHT) { gaps[dir] = box.x_gap(n_box); } else { diff --git a/ccstruct/blobs.cpp b/ccstruct/blobs.cpp index ad499407..f5b427ec 100644 --- a/ccstruct/blobs.cpp +++ b/ccstruct/blobs.cpp @@ -815,12 +815,10 @@ void TWERD::BLNormalize(const BLOCK* block, const ROW* row, Pix* pix, float input_y_offset = 0.0f; float final_y_offset = static_cast(kBlnBaselineOffset); float scale = kBlnXHeight / x_height; - if (hint == tesseract::OEM_CUBE_ONLY || row == NULL) { + if (row == NULL) { word_middle = word_box.left(); input_y_offset = word_box.bottom(); final_y_offset = 0.0f; - if (hint == tesseract::OEM_CUBE_ONLY) - scale = 1.0f; } else { input_y_offset = row->base_line(word_middle) + baseline_shift; } @@ -834,7 +832,7 @@ void TWERD::BLNormalize(const BLOCK* block, const ROW* row, Pix* pix, baseline = blob_box.bottom(); blob_scale = ClipToRange(kBlnXHeight * 4.0f / (3 * blob_box.height()), scale, scale * 1.5f); - } else if (row != NULL && hint != tesseract::OEM_CUBE_ONLY) { + } else if (row != NULL) { baseline = row->base_line(mid_x) + baseline_shift; } // The image will be 8-bit grey if the input was grey or color. 
Note that in diff --git a/ccstruct/boxread.cpp b/ccstruct/boxread.cpp index f4aedca5..d6ceebb4 100644 --- a/ccstruct/boxread.cpp +++ b/ccstruct/boxread.cpp @@ -34,8 +34,7 @@ FILE* OpenBoxFile(const STRING& fname) { STRING filename = BoxFileName(fname); FILE* box_file = NULL; if (!(box_file = fopen(filename.string(), "rb"))) { - CANTOPENFILE.error("read_next_box", TESSEXIT, - "Can't open box file %s", + CANTOPENFILE.error("read_next_box", TESSEXIT, "Can't open box file %s", filename.string()); } return box_file; @@ -56,6 +55,8 @@ bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING& filename, GenericVector box_data; if (!tesseract::LoadDataFromFile(BoxFileName(filename), &box_data)) return false; + // Convert the array of bytes to a string, so it can be used by the parser. + box_data.push_back('\0'); return ReadMemBoxes(target_page, skip_blanks, &box_data[0], boxes, texts, box_texts, pages); } diff --git a/ccstruct/boxread.h b/ccstruct/boxread.h index f12853df..b5c7bb47 100644 --- a/ccstruct/boxread.h +++ b/ccstruct/boxread.h @@ -17,8 +17,8 @@ * **********************************************************************/ -#ifndef TESSERACT_CCUTIL_BOXREAD_H__ -#define TESSERACT_CCUTIL_BOXREAD_H__ +#ifndef TESSERACT_CCUTIL_BOXREAD_H_ +#define TESSERACT_CCUTIL_BOXREAD_H_ #include #include "genericvector.h" @@ -82,4 +82,4 @@ bool ParseBoxFileStr(const char* boxfile_str, int* page_number, void MakeBoxFileStr(const char* unichar_str, const TBOX& box, int page_num, STRING* box_str); -#endif // TESSERACT_CCUTIL_BOXREAD_H__ +#endif // TESSERACT_CCUTIL_BOXREAD_H_ diff --git a/ccstruct/boxword.h b/ccstruct/boxword.h index 742bbb8e..86022a5e 100644 --- a/ccstruct/boxword.h +++ b/ccstruct/boxword.h @@ -17,8 +17,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CSTRUCT_BOXWORD_H__ -#define TESSERACT_CSTRUCT_BOXWORD_H__ +#ifndef TESSERACT_CSTRUCT_BOXWORD_H_ +#define TESSERACT_CSTRUCT_BOXWORD_H_ #include "genericvector.h" 
#include "rect.h" @@ -82,9 +82,7 @@ class BoxWord { const TBOX& bounding_box() const { return bbox_; } - int length() const { - return length_; - } + int length() const { return length_; } const TBOX& BlobBox(int index) const { return boxes_[index]; } @@ -99,5 +97,4 @@ class BoxWord { } // namespace tesseract. - -#endif // TESSERACT_CSTRUCT_BOXWORD_H__ +#endif // TESSERACT_CSTRUCT_BOXWORD_H_ diff --git a/ccstruct/ccstruct.h b/ccstruct/ccstruct.h index 3f92122d..2c8aac16 100644 --- a/ccstruct/ccstruct.h +++ b/ccstruct/ccstruct.h @@ -16,8 +16,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CCSTRUCT_CCSTRUCT_H__ -#define TESSERACT_CCSTRUCT_CCSTRUCT_H__ +#ifndef TESSERACT_CCSTRUCT_CCSTRUCT_H_ +#define TESSERACT_CCSTRUCT_CCSTRUCT_H_ #include "cutil.h" @@ -40,5 +40,4 @@ class CCStruct : public CUtil { class Tesseract; } // namespace tesseract - -#endif // TESSERACT_CCSTRUCT_CCSTRUCT_H__ +#endif // TESSERACT_CCSTRUCT_CCSTRUCT_H_ diff --git a/ccstruct/coutln.cpp b/ccstruct/coutln.cpp index bc2b119d..238272d2 100644 --- a/ccstruct/coutln.cpp +++ b/ccstruct/coutln.cpp @@ -48,9 +48,9 @@ ICOORD C_OUTLINE::step_coords[4] = { * @param length length of loop */ -C_OUTLINE::C_OUTLINE (CRACKEDGE * startpt, ICOORD bot_left, - ICOORD top_right, inT16 length) - : box (bot_left, top_right), start (startpt->pos), offsets(NULL) { +C_OUTLINE::C_OUTLINE(CRACKEDGE* startpt, ICOORD bot_left, ICOORD top_right, + inT16 length) + : box(bot_left, top_right), start(startpt->pos), offsets(NULL) { inT16 stepindex; //index to step CRACKEDGE *edgept; //current point @@ -71,7 +71,6 @@ C_OUTLINE::C_OUTLINE (CRACKEDGE * startpt, ICOORD bot_left, } } - /** * @name C_OUTLINE::C_OUTLINE * @@ -139,7 +138,7 @@ inT16 length //length of loop * @param rotation rotate to coord */ -C_OUTLINE::C_OUTLINE(C_OUTLINE *srcline, FCOORD rotation) : offsets(NULL) { +C_OUTLINE::C_OUTLINE(C_OUTLINE* srcline, FCOORD rotation) : offsets(NULL) { TBOX new_box; //easy bounding 
inT16 stepindex; //index to step inT16 dirdiff; //direction change @@ -300,7 +299,6 @@ inT32 C_OUTLINE::perimeter() const { return total_steps; } - /** * @name C_OUTLINE::outer_area * @@ -332,7 +330,6 @@ inT32 C_OUTLINE::outer_area() const { return total; } - /** * @name C_OUTLINE::count_transitions * @@ -459,7 +456,6 @@ inT32 C_OUTLINE::count_transitions(inT32 threshold) { return total; } - /** * @name C_OUTLINE::operator< * @@ -468,8 +464,7 @@ inT32 C_OUTLINE::count_transitions(inT32 threshold) { */ BOOL8 -C_OUTLINE::operator< (const C_OUTLINE & other) const -{ +C_OUTLINE::operator<(const C_OUTLINE& other) const { inT16 count = 0; //winding count ICOORD pos; //position of point inT32 stepindex; //index to cstep @@ -495,7 +490,6 @@ C_OUTLINE::operator< (const C_OUTLINE & other) const return count != 0; } - /** * @name C_OUTLINE::winding_number * @@ -534,7 +528,6 @@ inT16 C_OUTLINE::winding_number(ICOORD point) const { return count; //winding number } - /** * C_OUTLINE::turn_direction * @@ -563,7 +556,6 @@ inT16 C_OUTLINE::turn_direction() const { //winding number return count; //winding number } - /** * @name C_OUTLINE::reverse * @@ -586,7 +578,6 @@ void C_OUTLINE::reverse() { //reverse drection } } - /** * @name C_OUTLINE::move * @@ -661,14 +652,27 @@ static void ComputeGradient(const l_uint32* data, int wpl, int x, int y, int width, int height, ICOORD* gradient) { const l_uint32* line = data + y * wpl; - int pix_x_y = x < width && y < height ? - GET_DATA_BYTE(const_cast (reinterpret_cast(line)), x) : 255; - int pix_x_prevy = x < width && y > 0 ? - GET_DATA_BYTE(const_cast (reinterpret_cast(line - wpl)), x) : 255; - int pix_prevx_prevy = x > 0 && y > 0 ? - GET_DATA_BYTE(const_cast (reinterpret_cast(line - wpl)), x - 1) : 255; - int pix_prevx_y = x > 0 && y < height ? - GET_DATA_BYTE(const_cast (reinterpret_cast(line)), x - 1) : 255; + int pix_x_y = + x < width && y < height + ? 
GET_DATA_BYTE( + const_cast(reinterpret_cast(line)), x) + : 255; + int pix_x_prevy = + x < width && y > 0 + ? GET_DATA_BYTE( + const_cast(reinterpret_cast(line - wpl)), x) + : 255; + int pix_prevx_prevy = + x > 0 && y > 0 + ? GET_DATA_BYTE( + const_cast(reinterpret_cast(line - wpl)), + x - 1) + : 255; + int pix_prevx_y = + x > 0 && y < height + ? GET_DATA_BYTE( + const_cast(reinterpret_cast(line)), x - 1) + : 255; gradient->set_x(pix_x_y + pix_x_prevy - (pix_prevx_y + pix_prevx_prevy)); gradient->set_y(pix_x_prevy + pix_prevx_prevy - (pix_x_y + pix_prevx_y)); } @@ -684,8 +688,10 @@ static bool EvaluateVerticalDiff(const l_uint32* data, int wpl, int diff_sign, if (y <= 0 || y >= height) return false; const l_uint32* line = data + y * wpl; - int pixel1 = GET_DATA_BYTE(const_cast (reinterpret_cast(line - wpl)), x); - int pixel2 = GET_DATA_BYTE(const_cast (reinterpret_cast(line)), x); + int pixel1 = GET_DATA_BYTE( + const_cast(reinterpret_cast(line - wpl)), x); + int pixel2 = + GET_DATA_BYTE(const_cast(reinterpret_cast(line)), x); int diff = (pixel2 - pixel1) * diff_sign; if (diff > *best_diff) { *best_diff = diff; @@ -705,8 +711,10 @@ static bool EvaluateHorizontalDiff(const l_uint32* line, int diff_sign, int* best_diff, int* best_sum, int* best_x) { if (x <= 0 || x >= width) return false; - int pixel1 = GET_DATA_BYTE(const_cast (reinterpret_cast(line)), x - 1); - int pixel2 = GET_DATA_BYTE(const_cast (reinterpret_cast(line)), x); + int pixel1 = GET_DATA_BYTE( + const_cast(reinterpret_cast(line)), x - 1); + int pixel2 = + GET_DATA_BYTE(const_cast(reinterpret_cast(line)), x); int diff = (pixel2 - pixel1) * diff_sign; if (diff > *best_diff) { *best_diff = diff; @@ -954,8 +962,7 @@ void C_OUTLINE::render_outline(int left, int top, Pix* pix) const { */ #ifndef GRAPHICS_DISABLED -void C_OUTLINE::plot(ScrollView* window, - ScrollView::Color colour) const { +void C_OUTLINE::plot(ScrollView* window, ScrollView::Color colour) const { inT16 stepindex; // index to cstep ICOORD 
pos; // current position DIR128 stepdir; // direction of step @@ -1016,7 +1023,6 @@ void C_OUTLINE::plot_normed(const DENORM& denorm, ScrollView::Color colour, } #endif - /** * @name C_OUTLINE::operator= * @@ -1024,7 +1030,7 @@ void C_OUTLINE::plot_normed(const DENORM& denorm, ScrollView::Color colour, * @param source assign from this */ -C_OUTLINE & C_OUTLINE::operator= (const C_OUTLINE & source) { +C_OUTLINE& C_OUTLINE::operator=(const C_OUTLINE& source) { box = source.box; start = source.start; if (steps != NULL) diff --git a/ccstruct/debugpixa.h b/ccstruct/debugpixa.h new file mode 100644 index 00000000..37e63215 --- /dev/null +++ b/ccstruct/debugpixa.h @@ -0,0 +1,52 @@ +#ifndef TESSERACT_CCSTRUCT_DEBUGPIXA_H_ +#define TESSERACT_CCSTRUCT_DEBUGPIXA_H_ + +#include "allheaders.h" + +namespace tesseract { + +// Class to hold a Pixa collection of debug images with captions and save them +// to a PDF file. +class DebugPixa { + public: + // TODO(rays) add another constructor with size control. + DebugPixa() { + pixa_ = pixaCreate(0); + fonts_ = bmfCreate(nullptr, 14); + } + // If the filename_ has been set and there are any debug images, they are + // written to the set filename_. + ~DebugPixa() { + pixaDestroy(&pixa_); + bmfDestroy(&fonts_); + } + + // Adds the given pix to the set of pages in the PDF file, with the given + // caption added to the top. + void AddPix(const Pix* pix, const char* caption) { + int depth = pixGetDepth(const_cast(pix)); + int color = depth < 8 ? 1 : (depth > 8 ? 0x00ff0000 : 0x80); + Pix* pix_debug = pixAddSingleTextblock( + const_cast(pix), fonts_, caption, color, L_ADD_BELOW, nullptr); + pixaAddPix(pixa_, pix_debug, L_INSERT); + } + + // Sets the destination filename and enables images to be written to a PDF + // on destruction. 
+ void WritePDF(const char* filename) { + if (pixaGetCount(pixa_) > 0) { + pixaConvertToPdf(pixa_, 300, 1.0f, 0, 0, "AllDebugImages", filename); + pixaClear(pixa_); + } + } + + private: + // The collection of images to put in the PDF. + Pixa* pixa_; + // The fonts used to draw text captions. + L_Bmf* fonts_; +}; + +} // namespace tesseract + +#endif // TESSERACT_CCSTRUCT_DEBUGPIXA_H_ diff --git a/ccstruct/dppoint.h b/ccstruct/dppoint.h index fd87bb91..d7f75dba 100644 --- a/ccstruct/dppoint.h +++ b/ccstruct/dppoint.h @@ -17,8 +17,8 @@ * **********************************************************************/ -#ifndef TESSERACT_CCSTRUCT_DPPOINT_H__ -#define TESSERACT_CCSTRUCT_DPPOINT_H__ +#ifndef TESSERACT_CCSTRUCT_DPPOINT_H_ +#define TESSERACT_CCSTRUCT_DPPOINT_H_ #include "host.h" @@ -98,5 +98,4 @@ class DPPoint { } // namespace tesseract. -#endif // TESSERACT_CCSTRUCT_DPPOINT_H__ - +#endif // TESSERACT_CCSTRUCT_DPPOINT_H_ diff --git a/ccstruct/fontinfo.cpp b/ccstruct/fontinfo.cpp index d3e6f375..c3cda825 100644 --- a/ccstruct/fontinfo.cpp +++ b/ccstruct/fontinfo.cpp @@ -31,7 +31,7 @@ bool FontInfo::Serialize(FILE* fp) const { } // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. -bool FontInfo::DeSerialize(bool swap, FILE* fp) { +bool FontInfo::DeSerialize(bool swap, TFile* fp) { if (!read_info(fp, this, swap)) return false; if (!read_spacing_info(fp, this, swap)) return false; return true; @@ -51,7 +51,7 @@ bool FontInfoTable::Serialize(FILE* fp) const { } // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. 
-bool FontInfoTable::DeSerialize(bool swap, FILE* fp) {
+bool FontInfoTable::DeSerialize(bool swap, TFile* fp) {
   truncate(0);
   return this->DeSerializeClasses(swap, fp);
 }
@@ -149,19 +149,15 @@ void FontSetDeleteCallback(FontSet fs) {
 /*---------------------------------------------------------------------------*/
 // Callbacks used by UnicityTable to read/write FontInfo/FontSet structures.
-bool read_info(FILE* f, FontInfo* fi, bool swap) {
+bool read_info(TFile* f, FontInfo* fi, bool swap) {
   inT32 size;
-  if (fread(&size, sizeof(size), 1, f) != 1) return false;
-  if (swap)
-    Reverse32(&size);
+  if (f->FReadEndian(&size, sizeof(size), 1, swap) != 1) return false;
   char* font_name = new char[size + 1];
   fi->name = font_name;
-  if (static_cast<int>(fread(font_name, sizeof(*font_name), size, f)) != size)
-    return false;
+  if (f->FRead(font_name, sizeof(*font_name), size) != size) return false;
   font_name[size] = '\0';
-  if (fread(&fi->properties, sizeof(fi->properties), 1, f) != 1) return false;
-  if (swap)
-    Reverse32(&fi->properties);
+  if (f->FReadEndian(&fi->properties, sizeof(fi->properties), 1, swap) != 1)
+    return false;
   return true;
 }
@@ -174,26 +170,22 @@ bool write_info(FILE* f, const FontInfo& fi) {
   return true;
 }
-bool read_spacing_info(FILE *f, FontInfo* fi, bool swap) {
+bool read_spacing_info(TFile* f, FontInfo* fi, bool swap) {
   inT32 vec_size, kern_size;
-  if (fread(&vec_size, sizeof(vec_size), 1, f) != 1) return false;
-  if (swap) Reverse32(&vec_size);
+  if (f->FReadEndian(&vec_size, sizeof(vec_size), 1, swap) != 1) return false;
   ASSERT_HOST(vec_size >= 0);
   if (vec_size == 0) return true;
   fi->init_spacing(vec_size);
   for (int i = 0; i < vec_size; ++i) {
     FontSpacingInfo *fs = new FontSpacingInfo();
-    if (fread(&fs->x_gap_before, sizeof(fs->x_gap_before), 1, f) != 1 ||
-        fread(&fs->x_gap_after, sizeof(fs->x_gap_after), 1, f) != 1 ||
-        fread(&kern_size, sizeof(kern_size), 1, f) != 1) {
+    if (f->FReadEndian(&fs->x_gap_before, sizeof(fs->x_gap_before), 1, swap) !=
+            1 ||
+        f->FReadEndian(&fs->x_gap_after, sizeof(fs->x_gap_after), 1, swap) !=
+            1 ||
+        f->FReadEndian(&kern_size, sizeof(kern_size), 1, swap) != 1) {
       delete fs;
       return false;
     }
-    if (swap) {
-      ReverseN(&(fs->x_gap_before), sizeof(fs->x_gap_before));
-      ReverseN(&(fs->x_gap_after), sizeof(fs->x_gap_after));
-      Reverse32(&kern_size);
-    }
     if (kern_size < 0) {  // indication of a NULL entry in fi->spacing_vec
       delete fs;
       continue;
@@ -237,16 +229,12 @@ bool write_spacing_info(FILE* f, const FontInfo& fi) {
   return true;
 }
-bool read_set(FILE* f, FontSet* fs, bool swap) {
-  if (fread(&fs->size, sizeof(fs->size), 1, f) != 1) return false;
-  if (swap)
-    Reverse32(&fs->size);
+bool read_set(TFile* f, FontSet* fs, bool swap) {
+  if (f->FReadEndian(&fs->size, sizeof(fs->size), 1, swap) != 1) return false;
   fs->configs = new int[fs->size];
-  for (int i = 0; i < fs->size; ++i) {
-    if (fread(&fs->configs[i], sizeof(fs->configs[i]), 1, f) != 1) return false;
-    if (swap)
-      Reverse32(&fs->configs[i]);
-  }
+  if (f->FReadEndian(fs->configs, sizeof(fs->configs[0]), fs->size, swap) !=
+      fs->size)
+    return false;
   return true;
 }
diff --git a/ccstruct/fontinfo.h b/ccstruct/fontinfo.h
index 5f2d4208..abeaa096 100644
--- a/ccstruct/fontinfo.h
+++ b/ccstruct/fontinfo.h
@@ -67,7 +67,7 @@ struct FontInfo {
   bool Serialize(FILE* fp) const;
   // Reads from the given file. Returns false in case of error.
   // If swap is true, assumes a big/little-endian swap is needed.
-  bool DeSerialize(bool swap, FILE* fp);
+  bool DeSerialize(bool swap, TFile* fp);
   // Reserves unicharset_size spots in spacing_vec.
   void init_spacing(int unicharset_size) {
@@ -152,7 +152,7 @@ class FontInfoTable : public GenericVector<FontInfo> {
   bool Serialize(FILE* fp) const;
   // Reads from the given file. Returns false in case of error.
   // If swap is true, assumes a big/little-endian swap is needed.
- bool DeSerialize(bool swap, FILE* fp); + bool DeSerialize(bool swap, TFile* fp); // Returns true if the given set of fonts includes one with the same // properties as font_id. @@ -177,11 +177,11 @@ void FontInfoDeleteCallback(FontInfo f); void FontSetDeleteCallback(FontSet fs); // Callbacks used by UnicityTable to read/write FontInfo/FontSet structures. -bool read_info(FILE* f, FontInfo* fi, bool swap); +bool read_info(TFile* f, FontInfo* fi, bool swap); bool write_info(FILE* f, const FontInfo& fi); -bool read_spacing_info(FILE *f, FontInfo* fi, bool swap); +bool read_spacing_info(TFile* f, FontInfo* fi, bool swap); bool write_spacing_info(FILE* f, const FontInfo& fi); -bool read_set(FILE* f, FontSet* fs, bool swap); +bool read_set(TFile* f, FontSet* fs, bool swap); bool write_set(FILE* f, const FontSet& fs); } // namespace tesseract. diff --git a/ccstruct/hpdsizes.h b/ccstruct/hpdsizes.h index 2670e21b..f4d886a0 100644 --- a/ccstruct/hpdsizes.h +++ b/ccstruct/hpdsizes.h @@ -1,3 +1,12 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
 #ifndef HPDSIZES_H
 #define HPDSIZES_H
diff --git a/ccstruct/imagedata.cpp b/ccstruct/imagedata.cpp
index 3c244c77..ce185a98 100644
--- a/ccstruct/imagedata.cpp
+++ b/ccstruct/imagedata.cpp
@@ -24,12 +24,22 @@
 #include "imagedata.h"
+#if defined(__MINGW32__)
+#include <unistd.h>
+#else
+#include <thread>
+#endif
+
 #include "allheaders.h"
 #include "boxread.h"
 #include "callcpp.h"
 #include "helpers.h"
 #include "tprintf.h"
+// Number of documents to read ahead while training. Doesn't need to be very
+// large.
+const int kMaxReadAhead = 8;
+
 namespace tesseract {
 WordFeature::WordFeature() : x_(0), y_(0), dir_(0) {
@@ -182,6 +192,19 @@ bool ImageData::DeSerialize(bool swap, TFile* fp) {
   return true;
 }
+// As DeSerialize, but only seeks past the data - hence a static method.
+bool ImageData::SkipDeSerialize(bool swap, TFile* fp) {
+  if (!STRING::SkipDeSerialize(swap, fp)) return false;
+  inT32 page_number;
+  if (fp->FRead(&page_number, sizeof(page_number), 1) != 1) return false;
+  if (!GenericVector<char>::SkipDeSerialize(swap, fp)) return false;
+  if (!STRING::SkipDeSerialize(swap, fp)) return false;
+  if (!GenericVector<TBOX>::SkipDeSerialize(swap, fp)) return false;
+  if (!GenericVector<STRING>::SkipDeSerializeClasses(swap, fp)) return false;
+  inT8 vertical = 0;
+  return fp->FRead(&vertical, sizeof(vertical), 1) == 1;
+}
+
 // Saves the given Pix as a PNG-encoded string and destroys it.
 void ImageData::SetPix(Pix* pix) {
   SetPixInternal(pix, &image_data_);
@@ -195,37 +218,34 @@ Pix* ImageData::GetPix() const {
 // Gets anything and everything with a non-NULL pointer, prescaled to a
 // given target_height (if 0, then the original image height), and aligned.
 // Also returns (if not NULL) the width and height of the scaled image.
-// The return value is the scale factor that was applied to the image to
-// achieve the target_height.
-float ImageData::PreScale(int target_height, Pix** pix, - int* scaled_width, int* scaled_height, - GenericVector* boxes) const { +// The return value is the scaled Pix, which must be pixDestroyed after use, +// and scale_factor (if not NULL) is set to the scale factor that was applied +// to the image to achieve the target_height. +Pix* ImageData::PreScale(int target_height, int max_height, float* scale_factor, + int* scaled_width, int* scaled_height, + GenericVector* boxes) const { int input_width = 0; int input_height = 0; Pix* src_pix = GetPix(); ASSERT_HOST(src_pix != NULL); input_width = pixGetWidth(src_pix); input_height = pixGetHeight(src_pix); - if (target_height == 0) - target_height = input_height; + if (target_height == 0) { + target_height = MIN(input_height, max_height); + } float im_factor = static_cast(target_height) / input_height; if (scaled_width != NULL) *scaled_width = IntCastRounded(im_factor * input_width); if (scaled_height != NULL) *scaled_height = target_height; - if (pix != NULL) { - // Get the scaled image. - pixDestroy(pix); - *pix = pixScale(src_pix, im_factor, im_factor); - if (*pix == NULL) { - tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n", - input_width, input_height, im_factor); - } - if (scaled_width != NULL) - *scaled_width = pixGetWidth(*pix); - if (scaled_height != NULL) - *scaled_height = pixGetHeight(*pix); + // Get the scaled image. + Pix* pix = pixScale(src_pix, im_factor, im_factor); + if (pix == NULL) { + tprintf("Scaling pix of size %d, %d by factor %g made null pix!!\n", + input_width, input_height, im_factor); } + if (scaled_width != NULL) *scaled_width = pixGetWidth(pix); + if (scaled_height != NULL) *scaled_height = pixGetHeight(pix); pixDestroy(&src_pix); if (boxes != NULL) { // Get the boxes. 
@@ -241,7 +261,8 @@ float ImageData::PreScale(int target_height, Pix** pix, boxes->push_back(box); } } - return im_factor; + if (scale_factor != NULL) *scale_factor = im_factor; + return pix; } int ImageData::MemoryUsed() const { @@ -266,19 +287,20 @@ void ImageData::Display() const { // Draw the boxes. win->Pen(ScrollView::RED); win->Brush(ScrollView::NONE); - win->TextAttributes("Arial", kTextSize, false, false, false); - for (int b = 0; b < boxes_.size(); ++b) { - boxes_[b].plot(win); - win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].string()); - TBOX scaled(boxes_[b]); - scaled.scale(256.0 / height); - scaled.plot(win); + int text_size = kTextSize; + if (!boxes_.empty() && boxes_[0].height() * 2 < text_size) + text_size = boxes_[0].height() * 2; + win->TextAttributes("Arial", text_size, false, false, false); + if (!boxes_.empty()) { + for (int b = 0; b < boxes_.size(); ++b) { + boxes_[b].plot(win); + win->Text(boxes_[b].left(), height + kTextSize, box_texts_[b].string()); + } + } else { + // The full transcription. + win->Pen(ScrollView::CYAN); + win->Text(0, height + kTextSize * 2, transcription_.string()); } - // The full transcription. - win->Pen(ScrollView::CYAN); - win->Text(0, height + kTextSize * 2, transcription_.string()); - // Add the features. - win->Pen(ScrollView::GREEN); win->Update(); window_wait(win); #endif @@ -340,27 +362,51 @@ bool ImageData::AddBoxes(const char* box_text) { return false; } -DocumentData::DocumentData(const STRING& name) - : document_name_(name), pages_offset_(0), total_pages_(0), - memory_used_(0), max_memory_(0), reader_(NULL) {} +// Thread function to call ReCachePages. 
+void* ReCachePagesFunc(void* data) { + DocumentData* document_data = reinterpret_cast(data); + document_data->ReCachePages(); + return NULL; +} -DocumentData::~DocumentData() {} +DocumentData::DocumentData(const STRING& name) + : document_name_(name), + pages_offset_(-1), + total_pages_(-1), + memory_used_(0), + max_memory_(0), + reader_(NULL) {} + +DocumentData::~DocumentData() { + SVAutoLock lock_p(&pages_mutex_); + SVAutoLock lock_g(&general_mutex_); +} // Reads all the pages in the given lstmf filename to the cache. The reader // is used to read the file. bool DocumentData::LoadDocument(const char* filename, const char* lang, int start_page, inT64 max_memory, FileReader reader) { + SetDocument(filename, lang, max_memory, reader); + pages_offset_ = start_page; + return ReCachePages(); +} + +// Sets up the document, without actually loading it. +void DocumentData::SetDocument(const char* filename, const char* lang, + inT64 max_memory, FileReader reader) { + SVAutoLock lock_p(&pages_mutex_); + SVAutoLock lock(&general_mutex_); document_name_ = filename; lang_ = lang; - pages_offset_ = start_page; + pages_offset_ = -1; max_memory_ = max_memory; reader_ = reader; - return ReCachePages(); } // Writes all the pages to the given filename. Returns false on error. bool DocumentData::SaveDocument(const char* filename, FileWriter writer) { + SVAutoLock lock(&pages_mutex_); TFile fp; fp.OpenWrite(NULL); if (!pages_.Serialize(&fp) || !fp.CloseWrite(filename, writer)) { @@ -370,112 +416,184 @@ bool DocumentData::SaveDocument(const char* filename, FileWriter writer) { return true; } bool DocumentData::SaveToBuffer(GenericVector* buffer) { + SVAutoLock lock(&pages_mutex_); TFile fp; fp.OpenWrite(buffer); return pages_.Serialize(&fp); } -// Returns a pointer to the page with the given index, modulo the total -// number of pages, recaching if needed. 
-const ImageData* DocumentData::GetPage(int index) { - index = Modulo(index, total_pages_); - if (index < pages_offset_ || index >= pages_offset_ + pages_.size()) { - pages_offset_ = index; - if (!ReCachePages()) return NULL; - } - return pages_[index - pages_offset_]; +// Adds the given page data to this document, counting up memory. +void DocumentData::AddPageToDocument(ImageData* page) { + SVAutoLock lock(&pages_mutex_); + pages_.push_back(page); + set_memory_used(memory_used() + page->MemoryUsed()); } -// Loads as many pages can fit in max_memory_ starting at index pages_offset_. +// If the given index is not currently loaded, loads it using a separate +// thread. +void DocumentData::LoadPageInBackground(int index) { + ImageData* page = NULL; + if (IsPageAvailable(index, &page)) return; + SVAutoLock lock(&pages_mutex_); + if (pages_offset_ == index) return; + pages_offset_ = index; + pages_.clear(); + SVSync::StartThread(ReCachePagesFunc, this); +} + +// Returns a pointer to the page with the given index, modulo the total +// number of pages. Blocks until the background load is completed. +const ImageData* DocumentData::GetPage(int index) { + ImageData* page = NULL; + while (!IsPageAvailable(index, &page)) { + // If there is no background load scheduled, schedule one now. + pages_mutex_.Lock(); + bool needs_loading = pages_offset_ != index; + pages_mutex_.Unlock(); + if (needs_loading) LoadPageInBackground(index); + // We can't directly load the page, or the background load will delete it + // while the caller is using it, so give it a chance to work. +#if defined(__MINGW32__) + sleep(1); +#else + std::this_thread::sleep_for(std::chrono::seconds(1)); +#endif + } + return page; +} + +// Returns true if the requested page is available, and provides a pointer, +// which may be NULL if the document is empty. May block, even though it +// doesn't guarantee to return true. 
+bool DocumentData::IsPageAvailable(int index, ImageData** page) { + SVAutoLock lock(&pages_mutex_); + int num_pages = NumPages(); + if (num_pages == 0 || index < 0) { + *page = NULL; // Empty Document. + return true; + } + if (num_pages > 0) { + index = Modulo(index, num_pages); + if (pages_offset_ <= index && index < pages_offset_ + pages_.size()) { + *page = pages_[index - pages_offset_]; // Page is available already. + return true; + } + } + return false; +} + +// Removes all pages from memory and frees the memory, but does not forget +// the document metadata. +inT64 DocumentData::UnCache() { + SVAutoLock lock(&pages_mutex_); + inT64 memory_saved = memory_used(); + pages_.clear(); + pages_offset_ = -1; + set_total_pages(-1); + set_memory_used(0); + tprintf("Unloaded document %s, saving %d memory\n", document_name_.string(), + memory_saved); + return memory_saved; +} + +// Shuffles all the pages in the document. +void DocumentData::Shuffle() { + TRand random; + // Different documents get shuffled differently, but the same for the same + // name. + random.set_seed(document_name_.string()); + int num_pages = pages_.size(); + // Execute one random swap for each page in the document. + for (int i = 0; i < num_pages; ++i) { + int src = random.IntRand() % num_pages; + int dest = random.IntRand() % num_pages; + std::swap(pages_[src], pages_[dest]); + } +} + +// Locks the pages_mutex_ and Loads as many pages can fit in max_memory_ +// starting at index pages_offset_. bool DocumentData::ReCachePages() { + SVAutoLock lock(&pages_mutex_); // Read the file. 
+ set_total_pages(0); + set_memory_used(0); + int loaded_pages = 0; + pages_.truncate(0); TFile fp; - if (!fp.Open(document_name_, reader_)) return false; - memory_used_ = 0; - if (!pages_.DeSerialize(false, &fp)) { - tprintf("Deserialize failed: %s\n", document_name_.string()); - pages_.truncate(0); + if (!fp.Open(document_name_, reader_) || + !PointerVector::DeSerializeSize(false, &fp, &loaded_pages) || + loaded_pages <= 0) { + tprintf("Deserialize header failed: %s\n", document_name_.string()); return false; } - total_pages_ = pages_.size(); - pages_offset_ %= total_pages_; - // Delete pages before the first one we want, and relocate the rest. + pages_offset_ %= loaded_pages; + // Skip pages before the first one we want, and load the rest until max + // memory and skip the rest after that. int page; - for (page = 0; page < pages_.size(); ++page) { - if (page < pages_offset_) { - delete pages_[page]; - pages_[page] = NULL; + for (page = 0; page < loaded_pages; ++page) { + if (page < pages_offset_ || + (max_memory_ > 0 && memory_used() > max_memory_)) { + if (!PointerVector::DeSerializeSkip(false, &fp)) break; } else { - ImageData* image_data = pages_[page]; - if (max_memory_ > 0 && page > pages_offset_ && - memory_used_ + image_data->MemoryUsed() > max_memory_) - break; // Don't go over memory quota unless the first image. 
+ if (!pages_.DeSerializeElement(false, &fp)) break; + ImageData* image_data = pages_.back(); if (image_data->imagefilename().length() == 0) { image_data->set_imagefilename(document_name_); image_data->set_page_number(page); } image_data->set_language(lang_); - memory_used_ += image_data->MemoryUsed(); - if (pages_offset_ != 0) { - pages_[page - pages_offset_] = image_data; - pages_[page] = NULL; - } + set_memory_used(memory_used() + image_data->MemoryUsed()); } } - pages_.truncate(page - pages_offset_); - tprintf("Loaded %d/%d pages (%d-%d) of document %s\n", - pages_.size(), total_pages_, pages_offset_, - pages_offset_ + pages_.size(), document_name_.string()); + if (page < loaded_pages) { + tprintf("Deserialize failed: %s read %d/%d pages\n", + document_name_.string(), page, loaded_pages); + pages_.truncate(0); + } else { + tprintf("Loaded %d/%d pages (%d-%d) of document %s\n", pages_.size(), + loaded_pages, pages_offset_ + 1, pages_offset_ + pages_.size(), + document_name_.string()); + } + set_total_pages(loaded_pages); return !pages_.empty(); } -// Adds the given page data to this document, counting up memory. -void DocumentData::AddPageToDocument(ImageData* page) { - pages_.push_back(page); - memory_used_ += page->MemoryUsed(); -} - // A collection of DocumentData that knows roughly how much memory it is using. DocumentCache::DocumentCache(inT64 max_memory) - : total_pages_(0), memory_used_(0), max_memory_(max_memory) {} + : num_pages_per_doc_(0), max_memory_(max_memory) {} DocumentCache::~DocumentCache() {} // Adds all the documents in the list of filenames, counting memory. // The reader is used to read the files. 
bool DocumentCache::LoadDocuments(const GenericVector& filenames, - const char* lang, FileReader reader) { - inT64 fair_share_memory = max_memory_ / filenames.size(); + const char* lang, + CachingStrategy cache_strategy, + FileReader reader) { + cache_strategy_ = cache_strategy; + inT64 fair_share_memory = 0; + // In the round-robin case, each DocumentData handles restricting its content + // to its fair share of memory. In the sequential case, DocumentCache + // determines which DocumentDatas are held entirely in memory. + if (cache_strategy_ == CS_ROUND_ROBIN) + fair_share_memory = max_memory_ / filenames.size(); for (int arg = 0; arg < filenames.size(); ++arg) { STRING filename = filenames[arg]; DocumentData* document = new DocumentData(filename); - if (document->LoadDocument(filename.string(), lang, 0, - fair_share_memory, reader)) { - AddToCache(document); - } else { - tprintf("Failed to load image %s!\n", filename.string()); - delete document; - } + document->SetDocument(filename.string(), lang, fair_share_memory, reader); + AddToCache(document); } - tprintf("Loaded %d pages, total %gMB\n", - total_pages_, memory_used_ / 1048576.0); - return total_pages_ > 0; + if (!documents_.empty()) { + // Try to get the first page now to verify the list of filenames. + if (GetPageBySerial(0) != NULL) return true; + tprintf("Load of page 0 failed!\n"); + } + return false; } -// Adds document to the cache, throwing out other documents if needed. +// Adds document to the cache. bool DocumentCache::AddToCache(DocumentData* data) { - inT64 new_memory = data->memory_used(); - memory_used_ += new_memory; documents_.push_back(data); - total_pages_ += data->NumPages(); - // Delete the first item in the array, and other pages of the same name - // while memory is full. 
- while (memory_used_ >= max_memory_ && max_memory_ > 0) { - tprintf("Memory used=%lld vs max=%lld, discarding doc of size %lld\n", - memory_used_ , max_memory_, documents_[0]->memory_used()); - memory_used_ -= documents_[0]->memory_used(); - total_pages_ -= documents_[0]->NumPages(); - documents_.remove(0); - } return true; } @@ -488,11 +606,104 @@ DocumentData* DocumentCache::FindDocument(const STRING& document_name) const { return NULL; } +// Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache +// strategy, could take a long time. +int DocumentCache::TotalPages() { + if (cache_strategy_ == CS_SEQUENTIAL) { + // In sequential mode, we assume each doc has the same number of pages + // whether it is true or not. + if (num_pages_per_doc_ == 0) GetPageSequential(0); + return num_pages_per_doc_ * documents_.size(); + } + int total_pages = 0; + int num_docs = documents_.size(); + for (int d = 0; d < num_docs; ++d) { + // We have to load a page to make NumPages() valid. + documents_[d]->GetPage(0); + total_pages += documents_[d]->NumPages(); + } + return total_pages; +} + // Returns a page by serial number, selecting them in a round-robin fashion -// from all the documents. -const ImageData* DocumentCache::GetPageBySerial(int serial) { - int document_index = serial % documents_.size(); - return documents_[document_index]->GetPage(serial / documents_.size()); +// from all the documents. Highly disk-intensive, but doesn't need samples +// to be shuffled between files to begin with. 
+const ImageData* DocumentCache::GetPageRoundRobin(int serial) { + int num_docs = documents_.size(); + int doc_index = serial % num_docs; + const ImageData* doc = documents_[doc_index]->GetPage(serial / num_docs); + for (int offset = 1; offset <= kMaxReadAhead && offset < num_docs; ++offset) { + doc_index = (serial + offset) % num_docs; + int page = (serial + offset) / num_docs; + documents_[doc_index]->LoadPageInBackground(page); + } + return doc; +} + +// Returns a page by serial number, selecting them in sequence from each file. +// Requires the samples to be shuffled between the files to give a random or +// uniform distribution of data. Less disk-intensive than GetPageRoundRobin. +const ImageData* DocumentCache::GetPageSequential(int serial) { + int num_docs = documents_.size(); + ASSERT_HOST(num_docs > 0); + if (num_pages_per_doc_ == 0) { + // Use the pages in the first doc as the number of pages in each doc. + documents_[0]->GetPage(0); + num_pages_per_doc_ = documents_[0]->NumPages(); + if (num_pages_per_doc_ == 0) { + tprintf("First document cannot be empty!!\n"); + ASSERT_HOST(num_pages_per_doc_ > 0); + } + // Get rid of zero now if we don't need it. + if (serial / num_pages_per_doc_ % num_docs > 0) documents_[0]->UnCache(); + } + int doc_index = serial / num_pages_per_doc_ % num_docs; + const ImageData* doc = + documents_[doc_index]->GetPage(serial % num_pages_per_doc_); + // Count up total memory. Background loading makes it more complicated to + // keep a running count. + inT64 total_memory = 0; + for (int d = 0; d < num_docs; ++d) { + total_memory += documents_[d]->memory_used(); + } + if (total_memory >= max_memory_) { + // Find something to un-cache. + // If there are more than 3 in front, then serial is from the back reader + // of a pair of readers. If we un-cache from in-front-2 to 2-ahead, then + // we create a hole between them and then un-caching the backmost occupied + // will work for both. 
+ int num_in_front = CountNeighbourDocs(doc_index, 1); + for (int offset = num_in_front - 2; + offset > 1 && total_memory >= max_memory_; --offset) { + int next_index = (doc_index + offset) % num_docs; + total_memory -= documents_[next_index]->UnCache(); + } + // If that didn't work, the best solution is to un-cache from the back. If + // we take away the document that a 2nd reader is using, it will put it + // back and make a hole between. + int num_behind = CountNeighbourDocs(doc_index, -1); + for (int offset = num_behind; offset < 0 && total_memory >= max_memory_; + ++offset) { + int next_index = (doc_index + offset + num_docs) % num_docs; + total_memory -= documents_[next_index]->UnCache(); + } + } + int next_index = (doc_index + 1) % num_docs; + if (!documents_[next_index]->IsCached() && total_memory < max_memory_) { + documents_[next_index]->LoadPageInBackground(0); + } + return doc; +} + +// Helper counts the number of adjacent cached neighbours of index looking in +// direction dir, ie index+dir, index+2*dir etc. +int DocumentCache::CountNeighbourDocs(int index, int dir) { + int num_docs = documents_.size(); + for (int offset = dir; abs(offset) < num_docs; offset += dir) { + int offset_index = (index + offset + num_docs) % num_docs; + if (!documents_[offset_index]->IsCached()) return offset - dir; + } + return num_docs; } } // namespace tesseract. diff --git a/ccstruct/imagedata.h b/ccstruct/imagedata.h index 6321f121..45cb65a6 100644 --- a/ccstruct/imagedata.h +++ b/ccstruct/imagedata.h @@ -25,6 +25,7 @@ #include "normalis.h" #include "rect.h" #include "strngs.h" +#include "svutil.h" struct Pix; @@ -34,8 +35,22 @@ namespace tesseract { const int kFeaturePadding = 2; // Number of pixels to pad around text boxes. const int kImagePadding = 4; -// Number of training images to combine into a mini-batch for training. -const int kNumPagesPerMiniBatch = 100; + +// Enum to determine the caching and data sequencing strategy. 
+enum CachingStrategy { + // Reads all of one file before moving on to the next. Requires samples to be + // shuffled across files. Uses the count of samples in the first file as + // the count in all the files to achieve high-speed random access. As a + // consequence, if subsequent files are smaller, they get entries used more + // than once, and if subsequent files are larger, some entries are not used. + // Best for larger data sets that don't fit in memory. + CS_SEQUENTIAL, + // Reads one sample from each file in rotation. Does not require shuffled + // samples, but is extremely disk-intensive. Samples in smaller files also + // get used more often than samples in larger files. + // Best for smaller data sets that mostly fit in memory. + CS_ROUND_ROBIN, +}; class WordFeature { public: @@ -103,6 +118,8 @@ class ImageData { // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. bool DeSerialize(bool swap, TFile* fp); + // As DeSerialize, but only seeks past the data - hence a static method. + static bool SkipDeSerialize(bool swap, tesseract::TFile* fp); // Other accessors. const STRING& imagefilename() const { @@ -145,11 +162,12 @@ class ImageData { // Gets anything and everything with a non-NULL pointer, prescaled to a // given target_height (if 0, then the original image height), and aligned. // Also returns (if not NULL) the width and height of the scaled image. - // The return value is the scale factor that was applied to the image to - // achieve the target_height. - float PreScale(int target_height, Pix** pix, - int* scaled_width, int* scaled_height, - GenericVector* boxes) const; + // The return value is the scaled Pix, which must be pixDestroyed after use, + // and scale_factor (if not NULL) is set to the scale factor that was applied + // to the image to achieve the target_height. 
+ Pix* PreScale(int target_height, int max_height, float* scale_factor, + int* scaled_width, int* scaled_height, + GenericVector* boxes) const; int MemoryUsed() const; @@ -184,6 +202,8 @@ class ImageData { // A collection of ImageData that knows roughly how much memory it is using. class DocumentData { + friend void* ReCachePagesFunc(void* data); + public: explicit DocumentData(const STRING& name); ~DocumentData(); @@ -192,6 +212,9 @@ class DocumentData { // is used to read the file. bool LoadDocument(const char* filename, const char* lang, int start_page, inT64 max_memory, FileReader reader); + // Sets up the document, without actually loading it. + void SetDocument(const char* filename, const char* lang, inT64 max_memory, + FileReader reader); // Writes all the pages to the given filename. Returns false on error. bool SaveDocument(const char* filename, FileWriter writer); bool SaveToBuffer(GenericVector* buffer); @@ -200,26 +223,64 @@ class DocumentData { void AddPageToDocument(ImageData* page); const STRING& document_name() const { + SVAutoLock lock(&general_mutex_); return document_name_; } int NumPages() const { + SVAutoLock lock(&general_mutex_); return total_pages_; } inT64 memory_used() const { + SVAutoLock lock(&general_mutex_); return memory_used_; } + // If the given index is not currently loaded, loads it using a separate + // thread. Note: there are 4 cases: + // Document uncached: IsCached() returns false, total_pages_ < 0. + // Required page is available: IsPageAvailable returns true. In this case, + // total_pages_ > 0 and + // pages_offset_ <= index%total_pages_ <= pages_offset_+pages_.size() + // Pages are loaded, but the required one is not. + // The requested page is being loaded by LoadPageInBackground. In this case, + // index == pages_offset_. Once the loading starts, the pages lock is held + // until it completes, at which point IsPageAvailable will unblock and return + // true. 
+ void LoadPageInBackground(int index); // Returns a pointer to the page with the given index, modulo the total - // number of pages, recaching if needed. + // number of pages. Blocks until the background load is completed. const ImageData* GetPage(int index); + // Returns true if the requested page is available, and provides a pointer, + // which may be NULL if the document is empty. May block, even though it + // doesn't guarantee to return true. + bool IsPageAvailable(int index, ImageData** page); // Takes ownership of the given page index. The page is made NULL in *this. ImageData* TakePage(int index) { + SVAutoLock lock(&pages_mutex_); ImageData* page = pages_[index]; pages_[index] = NULL; return page; } + // Returns true if the document is currently loaded or in the process of + // loading. + bool IsCached() const { return NumPages() >= 0; } + // Removes all pages from memory and frees the memory, but does not forget + // the document metadata. Returns the memory saved. + inT64 UnCache(); + // Shuffles all the pages in the document. + void Shuffle(); private: - // Loads as many pages can fit in max_memory_ starting at index pages_offset_. + // Sets the value of total_pages_ behind a mutex. + void set_total_pages(int total) { + SVAutoLock lock(&general_mutex_); + total_pages_ = total; + } + void set_memory_used(inT64 memory_used) { + SVAutoLock lock(&general_mutex_); + memory_used_ = memory_used; + } + // Locks the pages_mutex_ and Loads as many pages can fit in max_memory_ + // starting at index pages_offset_. bool ReCachePages(); private: @@ -239,43 +300,77 @@ class DocumentData { inT64 max_memory_; // Saved reader from LoadDocument to allow re-caching. FileReader reader_; + // Mutex that protects pages_ and pages_offset_ against multiple parallel + // loads, and provides a wait for page. + SVMutex pages_mutex_; + // Mutex that protects other data members that callers want to access without + // waiting for a load operation. 
+ mutable SVMutex general_mutex_; }; // A collection of DocumentData that knows roughly how much memory it is using. +// Note that while it supports background read-ahead, it assumes that a single +// thread is accessing documents, ie it is not safe for multiple threads to +// access different documents in parallel, as one may de-cache the other's +// content. class DocumentCache { public: explicit DocumentCache(inT64 max_memory); ~DocumentCache(); + // Deletes all existing documents from the cache. + void Clear() { + documents_.clear(); + num_pages_per_doc_ = 0; + } // Adds all the documents in the list of filenames, counting memory. // The reader is used to read the files. bool LoadDocuments(const GenericVector& filenames, const char* lang, - FileReader reader); + CachingStrategy cache_strategy, FileReader reader); - // Adds document to the cache, throwing out other documents if needed. + // Adds document to the cache. bool AddToCache(DocumentData* data); // Finds and returns a document by name. DocumentData* FindDocument(const STRING& document_name) const; - // Returns a page by serial number, selecting them in a round-robin fashion - // from all the documents. - const ImageData* GetPageBySerial(int serial); + // Returns a page by serial number using the current cache_strategy_ to + // determine the mapping from serial number to page. + const ImageData* GetPageBySerial(int serial) { + if (cache_strategy_ == CS_SEQUENTIAL) + return GetPageSequential(serial); + else + return GetPageRoundRobin(serial); + } const PointerVector& documents() const { return documents_; } - int total_pages() const { - return total_pages_; - } + // Returns the total number of pages in an epoch. For CS_ROUND_ROBIN cache + // strategy, could take a long time. + int TotalPages(); private: + // Returns a page by serial number, selecting them in a round-robin fashion + // from all the documents. Highly disk-intensive, but doesn't need samples + // to be shuffled between files to begin with. 
+ const ImageData* GetPageRoundRobin(int serial); + // Returns a page by serial number, selecting them in sequence from each file. + // Requires the samples to be shuffled between the files to give a random or + // uniform distribution of data. Less disk-intensive than GetPageRoundRobin. + const ImageData* GetPageSequential(int serial); + + // Helper counts the number of adjacent cached neighbour documents_ of index + // looking in direction dir, ie index+dir, index+2*dir etc. + int CountNeighbourDocs(int index, int dir); + // A group of pages that corresponds in some loose way to a document. PointerVector documents_; - // Total of all pages. - int total_pages_; - // Total of all memory used by the cache. - inT64 memory_used_; + // Strategy to use for caching and serializing data samples. + CachingStrategy cache_strategy_; + // Number of pages in the first document, used as a divisor in + // GetPageSequential to determine the document index. + int num_pages_per_doc_; // Max memory allowed in this cache. inT64 max_memory_; }; diff --git a/ccstruct/matrix.h b/ccstruct/matrix.h index e13ef318..8e0442ae 100644 --- a/ccstruct/matrix.h +++ b/ccstruct/matrix.h @@ -1,8 +1,12 @@ /* -*-C-*- ****************************************************************************** + * File: matrix.h (Formerly matrix.h) + * Description: Generic 2-d array/matrix and banded triangular matrix class. + * Author: Ray Smith + * TODO(rays) Separate from ratings matrix, which it also contains: * - * File: matrix.h (Formerly matrix.h) - * Description: Ratings matrix code. (Used by associator) + * Descrition: Ratings matrix class (specialization of banded matrix). + * Segmentation search matrix of lists of BLOB_CHOICE. * Author: Mark Seaman, OCR Technology * Created: Wed May 16 13:22:06 1990 * Modified: Tue Mar 19 16:00:20 1991 (Mark Seaman) marks@hpgrlt @@ -22,12 +26,16 @@ ** limitations under the License. 
* *********************************************************************************/ -#ifndef TESSERACT_CCSTRUCT_MATRIX_H__ -#define TESSERACT_CCSTRUCT_MATRIX_H__ +#ifndef TESSERACT_CCSTRUCT_MATRIX_H_ +#define TESSERACT_CCSTRUCT_MATRIX_H_ +#include #include "kdpair.h" +#include "points.h" +#include "serialis.h" #include "unicharset.h" +class BLOB_CHOICE; class BLOB_CHOICE_LIST; #define NOT_CLASSIFIED reinterpret_cast(0) @@ -44,34 +52,60 @@ class GENERIC_2D_ARRAY { // either pass the memory in, or allocate after by calling Resize(). GENERIC_2D_ARRAY(int dim1, int dim2, const T& empty, T* array) : empty_(empty), dim1_(dim1), dim2_(dim2), array_(array) { + size_allocated_ = dim1 * dim2; } // Original constructor for a full rectangular matrix DOES allocate memory // and initialize it to empty. GENERIC_2D_ARRAY(int dim1, int dim2, const T& empty) : empty_(empty), dim1_(dim1), dim2_(dim2) { - array_ = new T[dim1_ * dim2_]; - for (int x = 0; x < dim1_; x++) - for (int y = 0; y < dim2_; y++) - this->put(x, y, empty_); + int new_size = dim1 * dim2; + array_ = new T[new_size]; + size_allocated_ = new_size; + for (int i = 0; i < size_allocated_; ++i) + array_[i] = empty_; + } + // Default constructor for array allocation. Use Resize to set the size. + GENERIC_2D_ARRAY() + : array_(NULL), empty_(static_cast(0)), dim1_(0), dim2_(0), + size_allocated_(0) { + } + GENERIC_2D_ARRAY(const GENERIC_2D_ARRAY& src) + : array_(NULL), empty_(static_cast(0)), dim1_(0), dim2_(0), + size_allocated_(0) { + *this = src; } virtual ~GENERIC_2D_ARRAY() { delete[] array_; } + void operator=(const GENERIC_2D_ARRAY& src) { + ResizeNoInit(src.dim1(), src.dim2()); + memcpy(array_, src.array_, num_elements() * sizeof(array_[0])); + } + + // Reallocate the array to the given size. Does not keep old data, but does + // not initialize the array either. 
+ void ResizeNoInit(int size1, int size2) { + int new_size = size1 * size2; + if (new_size > size_allocated_) { + delete [] array_; + array_ = new T[new_size]; + size_allocated_ = new_size; + } + dim1_ = size1; + dim2_ = size2; + } + // Reallocate the array to the given size. Does not keep old data. void Resize(int size1, int size2, const T& empty) { empty_ = empty; - if (size1 != dim1_ || size2 != dim2_) { - dim1_ = size1; - dim2_ = size2; - delete [] array_; - array_ = new T[dim1_ * dim2_]; - } + ResizeNoInit(size1, size2); Clear(); } // Reallocate the array to the given size, keeping old data. void ResizeWithCopy(int size1, int size2) { if (size1 != dim1_ || size2 != dim2_) { - T* new_array = new T[size1 * size2]; + int new_size = size1 * size2; + T* new_array = new T[new_size]; for (int col = 0; col < size1; ++col) { for (int row = 0; row < size2; ++row) { int old_index = col * dim2() + row; @@ -87,6 +121,7 @@ class GENERIC_2D_ARRAY { array_ = new_array; dim1_ = size1; dim2_ = size2; + size_allocated_ = new_size; } } @@ -106,9 +141,16 @@ class GENERIC_2D_ARRAY { if (fwrite(array_, sizeof(*array_), size, fp) != size) return false; return true; } + bool Serialize(tesseract::TFile* fp) const { + if (!SerializeSize(fp)) return false; + if (fp->FWrite(&empty_, sizeof(empty_), 1) != 1) return false; + int size = num_elements(); + if (fp->FWrite(array_, sizeof(*array_), size) != size) return false; + return true; + } // Reads from the given file. Returns false in case of error. - // Only works with bitwise-serializeable typ + // Only works with bitwise-serializeable types! // If swap is true, assumes a big/little-endian swap is needed. 
bool DeSerialize(bool swap, FILE* fp) { if (!DeSerializeSize(swap, fp)) return false; @@ -122,6 +164,18 @@ class GENERIC_2D_ARRAY { } return true; } + bool DeSerialize(bool swap, tesseract::TFile* fp) { + if (!DeSerializeSize(swap, fp)) return false; + if (fp->FRead(&empty_, sizeof(empty_), 1) != 1) return false; + if (swap) ReverseN(&empty_, sizeof(empty_)); + int size = num_elements(); + if (fp->FRead(array_, sizeof(*array_), size) != size) return false; + if (swap) { + for (int i = 0; i < size; ++i) + ReverseN(&array_[i], sizeof(array_[i])); + } + return true; + } // Writes to the given file. Returns false in case of error. // Assumes a T::Serialize(FILE*) const function. @@ -163,11 +217,17 @@ class GENERIC_2D_ARRAY { } // Put a list element into the matrix at a specific location. + void put(ICOORD pos, const T& thing) { + array_[this->index(pos.x(), pos.y())] = thing; + } void put(int column, int row, const T& thing) { array_[this->index(column, row)] = thing; } // Get the item at a specified location from the matrix. + T get(ICOORD pos) const { + return array_[this->index(pos.x(), pos.y())]; + } T get(int column, int row) const { return array_[this->index(column, row)]; } @@ -187,6 +247,207 @@ class GENERIC_2D_ARRAY { return &array_[this->index(column, 0)]; } + // Adds addend to *this, element-by-element. + void operator+=(const GENERIC_2D_ARRAY& addend) { + if (dim2_ == addend.dim2_) { + // Faster if equal size in the major dimension. + int size = MIN(num_elements(), addend.num_elements()); + for (int i = 0; i < size; ++i) { + array_[i] += addend.array_[i]; + } + } else { + for (int x = 0; x < dim1_; x++) { + for (int y = 0; y < dim2_; y++) { + (*this)(x, y) += addend(x, y); + } + } + } + } + // Subtracts minuend from *this, element-by-element. + void operator-=(const GENERIC_2D_ARRAY& minuend) { + if (dim2_ == minuend.dim2_) { + // Faster if equal size in the major dimension. 
+ int size = MIN(num_elements(), minuend.num_elements()); + for (int i = 0; i < size; ++i) { + array_[i] -= minuend.array_[i]; + } + } else { + for (int x = 0; x < dim1_; x++) { + for (int y = 0; y < dim2_; y++) { + (*this)(x, y) -= minuend(x, y); + } + } + } + } + // Adds addend to all elements. + void operator+=(const T& addend) { + int size = num_elements(); + for (int i = 0; i < size; ++i) { + array_[i] += addend; + } + } + // Multiplies *this by factor, element-by-element. + void operator*=(const T& factor) { + int size = num_elements(); + for (int i = 0; i < size; ++i) { + array_[i] *= factor; + } + } + // Clips *this to the given range. + void Clip(const T& rangemin, const T& rangemax) { + int size = num_elements(); + for (int i = 0; i < size; ++i) { + array_[i] = ClipToRange(array_[i], rangemin, rangemax); + } + } + // Returns true if all elements of *this are within the given range. + // Only uses operator< + bool WithinBounds(const T& rangemin, const T& rangemax) const { + int size = num_elements(); + for (int i = 0; i < size; ++i) { + const T& value = array_[i]; + if (value < rangemin || rangemax < value) + return false; + } + return true; + } + // Normalize the whole array. + double Normalize() { + int size = num_elements(); + if (size <= 0) return 0.0; + // Compute the mean. + double mean = 0.0; + for (int i = 0; i < size; ++i) { + mean += array_[i]; + } + mean /= size; + // Subtract the mean and compute the standard deviation. + double sd = 0.0; + for (int i = 0; i < size; ++i) { + double normed = array_[i] - mean; + array_[i] = normed; + sd += normed * normed; + } + sd = sqrt(sd / size); + if (sd > 0.0) { + // Divide by the sd. + for (int i = 0; i < size; ++i) { + array_[i] /= sd; + } + } + return sd; + } + + // Returns the maximum value of the array. + T Max() const { + int size = num_elements(); + if (size <= 0) return empty_; + // Compute the max. 
+ T max_value = array_[0]; + for (int i = 1; i < size; ++i) { + const T& value = array_[i]; + if (value > max_value) max_value = value; + } + return max_value; + } + + // Returns the maximum absolute value of the array. + T MaxAbs() const { + int size = num_elements(); + if (size <= 0) return empty_; + // Compute the max. + T max_abs = static_cast(0); + for (int i = 0; i < size; ++i) { + T value = static_cast(fabs(array_[i])); + if (value > max_abs) max_abs = value; + } + return max_abs; + } + + // Accumulates the element-wise sums of squares of src into *this. + void SumSquares(const GENERIC_2D_ARRAY& src) { + int size = num_elements(); + for (int i = 0; i < size; ++i) { + array_[i] += src.array_[i] * src.array_[i]; + } + } + + // Scales each element using the ada-grad algorithm, ie array_[i] by + // sqrt(num_samples/max(1,sqsum[i])). + void AdaGradScaling(const GENERIC_2D_ARRAY& sqsum, int num_samples) { + int size = num_elements(); + for (int i = 0; i < size; ++i) { + array_[i] *= sqrt(num_samples / MAX(1.0, sqsum.array_[i])); + } + } + + void AssertFinite() const { + int size = num_elements(); + for (int i = 0; i < size; ++i) { + ASSERT_HOST(isfinite(array_[i])); + } + } + + // REGARDLESS OF THE CURRENT DIMENSIONS, treats the data as a + // num_dims-dimensional array/tensor with dimensions given by dims, (ordered + // from most significant to least significant, the same as standard C arrays) + // and moves src_dim to dest_dim, with the initial dest_dim and any dimensions + // in between shifted towards the hole left by src_dim. Example: + // Current data content: array_=[0, 1, 2, ....119] + // perhaps *this may be of dim[40, 3], with values [[0, 1, 2][3, 4, 5]... + // but the current dimensions are irrelevant. + // num_dims = 4, dims=[5, 4, 3, 2] + // src_dim=3, dest_dim=1 + // tensor=[[[[0, 1][2, 3][4, 5]] + // [[6, 7][8, 9][10, 11]] + // [[12, 13][14, 15][16, 17]] + // [[18, 19][20, 21][22, 23]]] + // [[[24, 25]... 
+ // output dims =[5, 2, 4, 3] + // output tensor=[[[[0, 2, 4][6, 8, 10][12, 14, 16][18, 20, 22]] + // [[1, 3, 5][7, 9, 11][13, 15, 17][19, 21, 23]]] + // [[[24, 26, 28]... + // which is stored in the array_ as: + // [0, 2, 4, 6, 8, 10, 12, 14, 16, 18, 20, 22, 1, 3, 5, 7, 9, 11, 13...] + // NOTE: the 2 stored matrix dimensions are simply copied from *this. To + // change the dimensions after the transpose, use ResizeNoInit. + // Higher dimensions above 2 are strictly the responsibility of the caller. + void RotatingTranspose(const int* dims, int num_dims, int src_dim, + int dest_dim, GENERIC_2D_ARRAY* result) const { + int max_d = MAX(src_dim, dest_dim); + int min_d = MIN(src_dim, dest_dim); + // In a tensor of shape [d0, d1... min_d, ... max_d, ... dn-2, dn-1], the + // ends outside of min_d and max_d are unaffected, with [max_d +1, dn-1] + // being contiguous blocks of data that will move together, and + // [d0, min_d -1] being replicas of the transpose operation. + // num_replicas represents the large dimensions unchanged by the operation. + // move_size represents the small dimensions unchanged by the operation. + // src_step represents the stride in the src between each adjacent group + // in the destination. + int num_replicas = 1, move_size = 1, src_step = 1; + for (int d = 0; d < min_d; ++d) num_replicas *= dims[d]; + for (int d = max_d + 1; d < num_dims; ++d) move_size *= dims[d]; + for (int d = src_dim + 1; d < num_dims; ++d) src_step *= dims[d]; + if (src_dim > dest_dim) src_step *= dims[src_dim]; + // wrap_size is the size of a single replica, being the amount that is + // handled num_replicas times. 
+ int wrap_size = move_size; + for (int d = min_d; d <= max_d; ++d) wrap_size *= dims[d]; + result->ResizeNoInit(dim1_, dim2_); + result->empty_ = empty_; + const T* src = array_; + T* dest = result->array_; + for (int replica = 0; replica < num_replicas; ++replica) { + for (int start = 0; start < src_step; start += move_size) { + for (int pos = start; pos < wrap_size; pos += src_step) { + memcpy(dest, src + pos, sizeof(*dest) * move_size); + dest += move_size; + } + } + src += wrap_size; + } + } + // Delete objects pointed to by array_[i]. void delete_matrix_pointers() { int size = num_elements(); @@ -206,6 +467,13 @@ class GENERIC_2D_ARRAY { if (fwrite(&size, sizeof(size), 1, fp) != 1) return false; return true; } + bool SerializeSize(tesseract::TFile* fp) const { + inT32 size = dim1_; + if (fp->FWrite(&size, sizeof(size), 1) != 1) return false; + size = dim2_; + if (fp->FWrite(&size, sizeof(size), 1) != 1) return false; + return true; + } // Factored helper to deserialize the size. // If swap is true, assumes a big/little-endian swap is needed. bool DeSerializeSize(bool swap, FILE* fp) { @@ -219,11 +487,26 @@ class GENERIC_2D_ARRAY { Resize(size1, size2, empty_); return true; } + bool DeSerializeSize(bool swap, tesseract::TFile* fp) { + inT32 size1, size2; + if (fp->FRead(&size1, sizeof(size1), 1) != 1) return false; + if (fp->FRead(&size2, sizeof(size2), 1) != 1) return false; + if (swap) { + ReverseN(&size1, sizeof(size1)); + ReverseN(&size2, sizeof(size2)); + } + Resize(size1, size2, empty_); + return true; + } T* array_; T empty_; // The unused cell. int dim1_; // Size of the 1st dimension in indexing functions. int dim2_; // Size of the 2nd dimension in indexing functions. + // The total size to which the array can be expanded before a realloc is + // needed. If Resize is used, memory is retained so it can be re-expanded + // without a further alloc, and this stores the allocated size. 
+ int size_allocated_; }; // A generic class to store a banded triangular matrix with entries of type T. @@ -349,4 +632,4 @@ struct MATRIX_COORD { // The MatrixCoordPair contains a MATRIX_COORD and its priority. typedef tesseract::KDPairInc MatrixCoordPair; -#endif // TESSERACT_CCSTRUCT_MATRIX_H__ +#endif // TESSERACT_CCSTRUCT_MATRIX_H_ diff --git a/ccstruct/mod128.cpp b/ccstruct/mod128.cpp index ee4aa6c3..4e5f4bd3 100644 --- a/ccstruct/mod128.cpp +++ b/ccstruct/mod128.cpp @@ -1,8 +1,8 @@ /********************************************************************** * File: mod128.c (Formerly dir128.c) * Description: Code to convert a DIR128 to an ICOORD. - * Author: Ray Smith - * Created: Tue Oct 22 11:56:09 BST 1991 + * Author: Ray Smith + * Created: Tue Oct 22 11:56:09 BST 1991 * * (C) Copyright 1991, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); @@ -86,16 +86,3 @@ DIR128::DIR128( //from fcoord while (high - low > 1); dir = low; } - - -/********************************************************************** - * dir_to_gradient - * - * Convert a direction to a vector. - **********************************************************************/ - -#if 0 // code is buggy for negative dir and unused -ICOORD DIR128::vector() const { //convert to vector - return dirtab[dir]; //easy really -} -#endif diff --git a/ccstruct/mod128.h b/ccstruct/mod128.h index 592264ba..c0e71a42 100644 --- a/ccstruct/mod128.h +++ b/ccstruct/mod128.h @@ -1,8 +1,8 @@ /********************************************************************** * File: mod128.h (Formerly dir128.h) * Description: Header for class which implements modulo arithmetic. - * Author: Ray Smith - * Created: Tue Mar 26 17:48:13 GMT 1991 + * Author: Ray Smith + * Created: Tue Mar 26 17:48:13 GMT 1991 * * (C) Copyright 1991, Hewlett-Packard Ltd. 
** Licensed under the Apache License, Version 2.0 (the "License"); @@ -77,7 +77,6 @@ class DLLSYM DIR128 inT8 get_dir() const { //access function return dir; } - ICOORD vector() const; //turn to vector private: inT8 dir; //a direction diff --git a/ccstruct/otsuthr.cpp b/ccstruct/otsuthr.cpp index 071b0d23..8822ce29 100644 --- a/ccstruct/otsuthr.cpp +++ b/ccstruct/otsuthr.cpp @@ -51,23 +51,16 @@ int OtsuThreshold(Pix* src_pix, int left, int top, int width, int height, // only use opencl if compiled w/ OpenCL and selected device is opencl #ifdef USE_OPENCL - // all of channel 0 then all of channel 1... - int *histogramAllChannels = new int[kHistogramSize * num_channels]; + // all of channel 0 then all of channel 1... + int* histogramAllChannels = new int[kHistogramSize * num_channels]; - // Calculate Histogram on GPU - OpenclDevice od; - if (od.selectedDeviceIsOpenCL() && - (num_channels == 1 || num_channels == 4) && top == 0 && left == 0 ) { - od.HistogramRectOCL( - (const unsigned char*)pixGetData(src_pix), - num_channels, - pixGetWpl(src_pix) * 4, - left, - top, - width, - height, - kHistogramSize, - histogramAllChannels); + // Calculate Histogram on GPU + OpenclDevice od; + if (od.selectedDeviceIsOpenCL() && (num_channels == 1 || num_channels == 4) && + top == 0 && left == 0) { + od.HistogramRectOCL((unsigned char*)pixGetData(src_pix), num_channels, + pixGetWpl(src_pix) * 4, left, top, width, height, + kHistogramSize, histogramAllChannels); // Calculate Threshold from Histogram on cpu for (int ch = 0; ch < num_channels; ++ch) { @@ -143,7 +136,6 @@ int OtsuThreshold(Pix* src_pix, int left, int top, int width, int height, delete[] histogramAllChannels; #endif // USE_OPENCL - if (!any_good_hivalue) { // Use the best of the ones that were not good enough. 
(*hi_values)[best_hi_index] = best_hi_value; diff --git a/ccstruct/otsuthr.h b/ccstruct/otsuthr.h index 7e7d2817..dd35d23f 100644 --- a/ccstruct/otsuthr.h +++ b/ccstruct/otsuthr.h @@ -17,8 +17,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CCMAIN_OTSUTHR_H__ -#define TESSERACT_CCMAIN_OTSUTHR_H__ +#ifndef TESSERACT_CCMAIN_OTSUTHR_H_ +#define TESSERACT_CCMAIN_OTSUTHR_H_ struct Pix; @@ -53,4 +53,4 @@ int OtsuStats(const int* histogram, int* H_out, int* omega0_out); } // namespace tesseract. -#endif // TESSERACT_CCMAIN_OTSUTHR_H__ +#endif // TESSERACT_CCMAIN_OTSUTHR_H_ diff --git a/ccstruct/pageres.cpp b/ccstruct/pageres.cpp index b66e5636..894102cd 100644 --- a/ccstruct/pageres.cpp +++ b/ccstruct/pageres.cpp @@ -303,8 +303,9 @@ bool WERD_RES::SetupForRecognition(const UNICHARSET& unicharset_in, static_cast(norm_mode); tesseract = tess; POLY_BLOCK* pb = block != NULL ? block->poly_block() : NULL; - if ((norm_mode_hint != tesseract::OEM_CUBE_ONLY && - word->cblob_list()->empty()) || (pb != NULL && !pb->IsText())) { + if ((norm_mode_hint != tesseract::OEM_LSTM_ONLY && + word->cblob_list()->empty()) || + (pb != NULL && !pb->IsText())) { // Empty words occur when all the blobs have been moved to the rej_blobs // list, which seems to occur frequently in junk. 
SetupFake(unicharset_in); @@ -528,13 +529,12 @@ void WERD_RES::FilterWordChoices(int debug_level) { if (choice->unichar_id(i) != best_choice->unichar_id(j) && choice->certainty(i) - best_choice->certainty(j) < threshold) { if (debug_level >= 2) { - STRING label; - label.add_str_int("\nDiscarding bad choice #", index); - choice->print(label.string()); - tprintf("i %d j %d Chunk %d Choice->Blob[i].Certainty %.4g" - " BestChoice->ChunkCertainty[Chunk] %g Threshold %g\n", - i, j, chunk, choice->certainty(i), - best_choice->certainty(j), threshold); + choice->print("WorstCertaintyDiffWorseThan"); + tprintf( + "i %d j %d Choice->Blob[i].Certainty %.4g" + " WorstOtherChoiceCertainty %g Threshold %g\n", + i, j, choice->certainty(i), best_choice->certainty(j), threshold); + tprintf("Discarding bad choice #%d\n", index); } delete it.extract(); break; @@ -882,17 +882,18 @@ void WERD_RES::FakeClassifyWord(int blob_count, BLOB_CHOICE** choices) { choice_it.add_after_then_move(choices[c]); ratings->put(c, c, choice_list); } - FakeWordFromRatings(); + FakeWordFromRatings(TOP_CHOICE_PERM); reject_map.initialise(blob_count); + best_state.init_to_size(blob_count, 1); done = true; } // Creates a WERD_CHOICE for the word using the top choices from the leading // diagonal of the ratings matrix. 
-void WERD_RES::FakeWordFromRatings() { +void WERD_RES::FakeWordFromRatings(PermuterType permuter) { int num_blobs = ratings->dimension(); WERD_CHOICE* word_choice = new WERD_CHOICE(uch_set, num_blobs); - word_choice->set_permuter(TOP_CHOICE_PERM); + word_choice->set_permuter(permuter); for (int b = 0; b < num_blobs; ++b) { UNICHAR_ID unichar_id = UNICHAR_SPACE; float rating = MAX_INT32; @@ -1105,6 +1106,7 @@ void WERD_RES::InitNonPointers() { x_height = 0.0; caps_height = 0.0; baseline_shift = 0.0f; + space_certainty = 0.0f; guessed_x_ht = TRUE; guessed_caps_ht = TRUE; combination = FALSE; diff --git a/ccstruct/pageres.h b/ccstruct/pageres.h index 7329bc89..33b9f4cb 100644 --- a/ccstruct/pageres.h +++ b/ccstruct/pageres.h @@ -1,7 +1,7 @@ /********************************************************************** * File: pageres.h (Formerly page_res.h) * Description: Results classes used by control.c - * Author: Phil Cheatle + * Author: Phil Cheatle * Created: Tue Sep 22 08:42:49 BST 1992 * * (C) Copyright 1992, Hewlett-Packard Ltd. @@ -295,6 +295,9 @@ class WERD_RES : public ELIST_LINK { float x_height; // post match estimate float caps_height; // post match estimate float baseline_shift; // post match estimate. + // Certainty score for the spaces either side of this word (LSTM mode). + // MIN this value with the actual word certainty. + float space_certainty; /* To deal with fuzzy spaces we need to be able to combine "words" to form @@ -327,7 +330,7 @@ class WERD_RES : public ELIST_LINK { } // Deep copies everything except the ratings MATRIX. // To get that use deep_copy below. - WERD_RES(const WERD_RES &source) : ELIST_LINK(source) { + WERD_RES(const WERD_RES& source) : ELIST_LINK(source) { InitPointers(); *this = source; // see operator= } @@ -590,7 +593,7 @@ class WERD_RES : public ELIST_LINK { // Creates a WERD_CHOICE for the word using the top choices from the leading // diagonal of the ratings matrix. 
- void FakeWordFromRatings(); + void FakeWordFromRatings(PermuterType permuter); // Copies the best_choice strings to the correct_text for adaption/training. void BestChoiceToCorrectText(); @@ -630,7 +633,7 @@ class WERD_RES : public ELIST_LINK { static WERD_RES* deep_copy(const WERD_RES* src) { WERD_RES* result = new WERD_RES(*src); // That didn't copy the ratings, but we want a copy if there is one to - // begin width. + // begin with. if (src->ratings != NULL) result->ratings = src->ratings->DeepCopy(); return result; diff --git a/ccstruct/params_training_featdef.h b/ccstruct/params_training_featdef.h index ff76480b..6e021f0b 100644 --- a/ccstruct/params_training_featdef.h +++ b/ccstruct/params_training_featdef.h @@ -126,7 +126,7 @@ typedef GenericVector ParamsTrainingHypothesisList; // explored on PASS1, PASS2, fix xheight pass, etc). class ParamsTrainingBundle { public: - ParamsTrainingBundle() {}; + ParamsTrainingBundle() {} // Starts a new hypothesis list. // Should be called at the beginning of a new run of the segmentation search. 
void StartHypothesisList() { diff --git a/ccstruct/pdblock.h b/ccstruct/pdblock.h index b64eff36..cf29b782 100644 --- a/ccstruct/pdblock.h +++ b/ccstruct/pdblock.h @@ -29,90 +29,74 @@ struct Pix; CLISTIZEH (PDBLK) ///page block -class PDBLK -{ +class PDBLK { friend class BLOCK_RECT_IT; //< block iterator - public: - ///empty constructor - PDBLK() { - hand_poly = NULL; - index_ = 0; - } - ///simple constructor - PDBLK(inT16 xmin, //< bottom left - inT16 ymin, - inT16 xmax, //< top right - inT16 ymax); + public: + /// empty constructor + PDBLK() { + hand_poly = NULL; + index_ = 0; + } + /// simple constructor + PDBLK(inT16 xmin, //< bottom left + inT16 ymin, + inT16 xmax, //< top right + inT16 ymax); - ///set vertex lists - ///@param left list of left vertices - ///@param right list of right vertices - void set_sides(ICOORDELT_LIST *left, - ICOORDELT_LIST *right); + /// set vertex lists + ///@param left list of left vertices + ///@param right list of right vertices + void set_sides(ICOORDELT_LIST *left, ICOORDELT_LIST *right); - ///destructor - ~PDBLK () { - if (hand_poly) delete hand_poly; - } + /// destructor + ~PDBLK() { delete hand_poly; } - POLY_BLOCK *poly_block() const { - return hand_poly; - } - ///set the poly block - void set_poly_block(POLY_BLOCK *blk) { - hand_poly = blk; - } - ///get box - void bounding_box(ICOORD &bottom_left, //bottom left - ICOORD &top_right) const { //topright - bottom_left = box.botleft (); - top_right = box.topright (); - } - ///get real box - const TBOX &bounding_box() const { - return box; - } + POLY_BLOCK *poly_block() const { return hand_poly; } + /// set the poly block + void set_poly_block(POLY_BLOCK *blk) { hand_poly = blk; } + /// get box + void bounding_box(ICOORD &bottom_left, // bottom left + ICOORD &top_right) const { // topright + bottom_left = box.botleft(); + top_right = box.topright(); + } + /// get real box + const TBOX &bounding_box() const { return box; } - int index() const { - return index_; - } - void 
set_index(int value) { - index_ = value; - } + int index() const { return index_; } + void set_index(int value) { index_ = value; } - ///is pt inside block - BOOL8 contains(ICOORD pt); + /// is pt inside block + BOOL8 contains(ICOORD pt); - /// reposition block - void move(const ICOORD vec); // by vector + /// reposition block + void move(const ICOORD vec); // by vector - // Returns a binary Pix mask with a 1 pixel for every pixel within the - // block. Rotates the coordinate system by rerotation prior to rendering. - // If not NULL, mask_box is filled with the position box of the returned - // mask image. - Pix *render_mask(const FCOORD &rerotation, TBOX *mask_box); + // Returns a binary Pix mask with a 1 pixel for every pixel within the + // block. Rotates the coordinate system by rerotation prior to rendering. + // If not NULL, mask_box is filled with the position box of the returned + // mask image. + Pix *render_mask(const FCOORD &rerotation, TBOX *mask_box); - #ifndef GRAPHICS_DISABLED - ///draw histogram - ///@param window window to draw in - ///@param serial serial number - ///@param colour colour to draw in - void plot(ScrollView* window, - inT32 serial, - ScrollView::Color colour); - #endif // GRAPHICS_DISABLED +#ifndef GRAPHICS_DISABLED + /// draw histogram + ///@param window window to draw in + ///@param serial serial number + ///@param colour colour to draw in + void plot(ScrollView *window, inT32 serial, ScrollView::Color colour); +#endif // GRAPHICS_DISABLED - ///assignment - ///@param source from this - PDBLK & operator= (const PDBLK & source); + /// assignment + ///@param source from this + PDBLK &operator=(const PDBLK &source); - protected: - POLY_BLOCK *hand_poly; //< weird as well - ICOORDELT_LIST leftside; //< left side vertices - ICOORDELT_LIST rightside; //< right side vertices - TBOX box; //< bounding box - int index_; //< Serial number of this block. 
+ protected: + POLY_BLOCK *hand_poly; //< weird as well + ICOORDELT_LIST leftside; //< left side vertices + ICOORDELT_LIST rightside; //< right side vertices + TBOX box; //< bounding box + int index_; //< Serial number of this block. }; class DLLSYM BLOCK_RECT_IT //rectangle iterator diff --git a/ccstruct/polyaprx.cpp b/ccstruct/polyaprx.cpp index 81b8500a..75973495 100644 --- a/ccstruct/polyaprx.cpp +++ b/ccstruct/polyaprx.cpp @@ -214,7 +214,7 @@ EDGEPT edgepts[] //output is array void fix2( //polygonal approx EDGEPT *start, /*loop to approimate */ int area) { - EDGEPT *edgept; /*current point */ + EDGEPT *edgept; /*current point */ EDGEPT *edgept1; EDGEPT *loopstart; /*modified start of loop */ EDGEPT *linestart; /*start of line segment */ diff --git a/ccstruct/polyblk.cpp b/ccstruct/polyblk.cpp index 15837885..b5ca2e12 100644 --- a/ccstruct/polyblk.cpp +++ b/ccstruct/polyblk.cpp @@ -1,7 +1,7 @@ /********************************************************************** * File: polyblk.c (Formerly poly_block.c) * Description: Polygonal blocks - * Author: Sheelagh Lloyd? + * Author: Sheelagh Lloyd? * Created: * * (C) Copyright 1993, Hewlett-Packard Ltd. @@ -294,6 +294,8 @@ void POLY_BLOCK::fill(ScrollView* window, ScrollView::Color colour) { } } } + + delete lines; } #endif diff --git a/ccstruct/publictypes.h b/ccstruct/publictypes.h index 6cb9f3ba..a3428658 100644 --- a/ccstruct/publictypes.h +++ b/ccstruct/publictypes.h @@ -17,8 +17,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H__ -#define TESSERACT_CCSTRUCT_PUBLICTYPES_H__ +#ifndef TESSERACT_CCSTRUCT_PUBLICTYPES_H_ +#define TESSERACT_CCSTRUCT_PUBLICTYPES_H_ // This file contains types that are used both by the API and internally // to Tesseract. 
In order to decouple the API from Tesseract and prevent cyclic @@ -213,7 +213,7 @@ enum PageIteratorLevel { }; /** - * JUSTIFICATION_UNKNONW + * JUSTIFICATION_UNKNOWN * The alignment is not clearly one of the other options. This could happen * for example if there are only one or two lines of text or the text looks * like source code or poetry. @@ -235,7 +235,7 @@ enum PageIteratorLevel { * * JUSTIFICATION_RIGHT * Each line, except possibly the first, is flush to the same right tab stop. -*/ + */ enum ParagraphJustification { JUSTIFICATION_UNKNOWN, JUSTIFICATION_LEFT, @@ -255,17 +255,20 @@ enum ParagraphJustification { */ enum OcrEngineMode { OEM_TESSERACT_ONLY, // Run Tesseract only - fastest - OEM_CUBE_ONLY, // Run Cube only - better accuracy, but slower - OEM_TESSERACT_CUBE_COMBINED, // Run both and combine results - best accuracy - OEM_DEFAULT // Specify this mode when calling init_*(), + OEM_LSTM_ONLY, // Run just the LSTM line recognizer. + OEM_TESSERACT_LSTM_COMBINED, // Run the LSTM recognizer, but allow fallback + // to Tesseract when things get difficult. + OEM_DEFAULT, // Specify this mode when calling init_*(), // to indicate that any of the above modes // should be automatically inferred from the // variables in the language-specific config, // command-line configs, or if not specified // in any of the above should be set to the // default OEM_TESSERACT_ONLY. + OEM_CUBE_ONLY, // Run Cube only - better accuracy, but slower + OEM_TESSERACT_CUBE_COMBINED, // Run both and combine results - best accuracy }; } // namespace tesseract. -#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H__ +#endif // TESSERACT_CCSTRUCT_PUBLICTYPES_H_ diff --git a/ccstruct/quspline.cpp b/ccstruct/quspline.cpp index f50cfe50..82107e1e 100644 --- a/ccstruct/quspline.cpp +++ b/ccstruct/quspline.cpp @@ -1,8 +1,8 @@ /********************************************************************** * File: quspline.cpp (Formerly qspline.c) * Description: Code for the QSPLINE class. 
- * Author: Ray Smith - * Created: Tue Oct 08 17:16:12 BST 1991 + * Author: Ray Smith + * Created: Tue Oct 08 17:16:12 BST 1991 * * (C) Copyright 1991, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/ccstruct/ratngs.h b/ccstruct/ratngs.h index 31b27cfb..408968b0 100644 --- a/ccstruct/ratngs.h +++ b/ccstruct/ratngs.h @@ -288,7 +288,8 @@ class WERD_CHOICE : public ELIST_LINK { src_certainty, src_permuter); } WERD_CHOICE(const char *src_string, const UNICHARSET &unicharset); - WERD_CHOICE(const WERD_CHOICE &word) : ELIST_LINK(word), unicharset_(word.unicharset_) { + WERD_CHOICE(const WERD_CHOICE &word) + : ELIST_LINK(word), unicharset_(word.unicharset_) { this->init(word.length()); this->operator=(word); } @@ -507,6 +508,20 @@ class WERD_CHOICE : public ELIST_LINK { } return word_str; } + // Returns true if any unichar_id in the word is a non-space-delimited char. + bool ContainsAnyNonSpaceDelimited() const { + for (int i = 0; i < length_; ++i) { + if (!unicharset_->IsSpaceDelimited(unichar_ids_[i])) return true; + } + return false; + } + // Returns true if the word is all spaces. + bool IsAllSpaces() const { + for (int i = 0; i < length_; ++i) { + if (unichar_ids_[i] != UNICHAR_SPACE) return false; + } + return true; + } // Call this to override the default (strict left to right graphemes) // with the fact that some engine produces a "reading order" set of diff --git a/ccstruct/rect.cpp b/ccstruct/rect.cpp index 22417485..4a9fe00b 100644 --- a/ccstruct/rect.cpp +++ b/ccstruct/rect.cpp @@ -1,8 +1,8 @@ /********************************************************************** * File: rect.c (Formerly box.c) * Description: Bounding box class definition. - * Author: Phil Cheatle - * Created: Wed Oct 16 15:18:45 BST 1991 + * Author: Phil Cheatle + * Created: Wed Oct 16 15:18:45 BST 1991 * * (C) Copyright 1991, Hewlett-Packard Ltd. 
** Licensed under the Apache License, Version 2.0 (the "License"); @@ -29,10 +29,10 @@ * **********************************************************************/ -TBOX::TBOX( //constructor - const ICOORD pt1, //one corner - const ICOORD pt2 //the other corner - ) { +TBOX::TBOX( // constructor + const ICOORD pt1, // one corner + const ICOORD pt2 // the other corner + ) { if (pt1.x () <= pt2.x ()) { if (pt1.y () <= pt2.y ()) { bot_left = pt1; diff --git a/ccstruct/rect.h b/ccstruct/rect.h index d9b90642..f31247a1 100644 --- a/ccstruct/rect.h +++ b/ccstruct/rect.h @@ -1,8 +1,8 @@ /********************************************************************** * File: rect.h (Formerly box.h) * Description: Bounding box class definition. - * Author: Phil Cheatle - * Created: Wed Oct 16 15:18:45 BST 1991 + * Author: Phil Cheatle + * Created: Wed Oct 16 15:18:45 BST 1991 * * (C) Copyright 1991, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); @@ -307,9 +307,9 @@ class DLLSYM TBOX { // bounding box * **********************************************************************/ -inline TBOX::TBOX( // constructor - const FCOORD pt // floating centre - ) { +inline TBOX::TBOX( // constructor + const FCOORD pt // floating centre + ) { bot_left = ICOORD ((inT16) floor (pt.x ()), (inT16) floor (pt.y ())); top_right = ICOORD ((inT16) ceil (pt.x ()), (inT16) ceil (pt.y ())); } diff --git a/ccstruct/rejctmap.cpp b/ccstruct/rejctmap.cpp index 9c9ff2e5..2123230e 100644 --- a/ccstruct/rejctmap.cpp +++ b/ccstruct/rejctmap.cpp @@ -267,10 +267,10 @@ void REJ::full_print(FILE *fp) { //The REJMAP class has been hacked to use alloc_struct instead of new []. //This is to reduce memory fragmentation only as it is rather kludgy. -//alloc_struct by-passes the call to the contsructor of REJ on each -//array element. Although the constructor is empty, the BITS16 members -//do have a constructor which sets all the flags to 0. The memset -//replaces this functionality. 
+// alloc_struct by-passes the call to the constructor of REJ on each +// array element. Although the constructor is empty, the BITS16 members +// do have a constructor which sets all the flags to 0. The memset +// replaces this functionality. REJMAP::REJMAP( //classwise copy const REJMAP &source) { diff --git a/ccstruct/rejctmap.h b/ccstruct/rejctmap.h index d945dda1..009ba58a 100644 --- a/ccstruct/rejctmap.h +++ b/ccstruct/rejctmap.h @@ -1,8 +1,8 @@ /********************************************************************** * File: rejctmap.h (Formerly rejmap.h) * Description: REJ and REJMAP class functions. - * Author: Phil Cheatle - * Created: Thu Jun 9 13:46:38 BST 1994 + * Author: Phil Cheatle + * Created: Thu Jun 9 13:46:38 BST 1994 * * (C) Copyright 1994, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); @@ -48,46 +48,45 @@ OF THIS IMPLIED TEMPORAL ORDERING OF THE FLAGS!!!! #include "bits16.h" #include "params.h" -enum REJ_FLAGS -{ +enum REJ_FLAGS { /* Reject modes which are NEVER overridden */ - R_TESS_FAILURE, // PERM Tess didn't classify - R_SMALL_XHT, // PERM Xht too small - R_EDGE_CHAR, // PERM Too close to edge of image - R_1IL_CONFLICT, // PERM 1Il confusion - R_POSTNN_1IL, // PERM 1Il unrejected by NN - R_REJ_CBLOB, // PERM Odd blob - R_MM_REJECT, // PERM Matrix match rejection (m's) - R_BAD_REPETITION, // TEMP Repeated char which doesn't match trend + R_TESS_FAILURE, // PERM Tess didn't classify + R_SMALL_XHT, // PERM Xht too small + R_EDGE_CHAR, // PERM Too close to edge of image + R_1IL_CONFLICT, // PERM 1Il confusion + R_POSTNN_1IL, // PERM 1Il unrejected by NN + R_REJ_CBLOB, // PERM Odd blob + R_MM_REJECT, // PERM Matrix match rejection (m's) + R_BAD_REPETITION, // TEMP Repeated char which doesn't match trend /* Initial reject modes (pre NN_ACCEPT) */ - R_POOR_MATCH, // TEMP Ray's original heuristic (Not used) - R_NOT_TESS_ACCEPTED, // TEMP Tess didn't accept WERD - R_CONTAINS_BLANKS, // TEMP Tess failed on 
other chs in WERD - R_BAD_PERMUTER, // POTENTIAL Bad permuter for WERD + R_POOR_MATCH, // TEMP Ray's original heuristic (Not used) + R_NOT_TESS_ACCEPTED, // TEMP Tess didn't accept WERD + R_CONTAINS_BLANKS, // TEMP Tess failed on other chs in WERD + R_BAD_PERMUTER, // POTENTIAL Bad permuter for WERD /* Reject modes generated after NN_ACCEPT but before MM_ACCEPT */ - R_HYPHEN, // TEMP Post NN dodgy hyphen or full stop - R_DUBIOUS, // TEMP Post NN dodgy chars - R_NO_ALPHANUMS, // TEMP No alphanumerics in word after NN - R_MOSTLY_REJ, // TEMP Most of word rejected so rej the rest - R_XHT_FIXUP, // TEMP Xht tests unsure + R_HYPHEN, // TEMP Post NN dodgy hyphen or full stop + R_DUBIOUS, // TEMP Post NN dodgy chars + R_NO_ALPHANUMS, // TEMP No alphanumerics in word after NN + R_MOSTLY_REJ, // TEMP Most of word rejected so rej the rest + R_XHT_FIXUP, // TEMP Xht tests unsure /* Reject modes generated after MM_ACCEPT but before QUALITY_ACCEPT */ - R_BAD_QUALITY, // TEMP Quality metrics bad for WERD + R_BAD_QUALITY, // TEMP Quality metrics bad for WERD /* Reject modes generated after QUALITY_ACCEPT but before MINIMAL_REJ accep*/ - R_DOC_REJ, // TEMP Document rejection - R_BLOCK_REJ, // TEMP Block rejection - R_ROW_REJ, // TEMP Row rejection - R_UNLV_REJ, // TEMP ~ turned to - or ^ turned to space + R_DOC_REJ, // TEMP Document rejection + R_BLOCK_REJ, // TEMP Block rejection + R_ROW_REJ, // TEMP Row rejection + R_UNLV_REJ, // TEMP ~ turned to - or ^ turned to space /* Accept modes which occur between the above rejection groups */ - R_NN_ACCEPT, //NN acceptance - R_HYPHEN_ACCEPT, //Hyphen acceptance - R_MM_ACCEPT, //Matrix match acceptance - R_QUALITY_ACCEPT, //Accept word in good quality doc - R_MINIMAL_REJ_ACCEPT //Accept EVERYTHING except tess failures + R_NN_ACCEPT, // NN acceptance + R_HYPHEN_ACCEPT, // Hyphen acceptance + R_MM_ACCEPT, // Matrix match acceptance + R_QUALITY_ACCEPT, // Accept word in good quality doc + R_MINIMAL_REJ_ACCEPT // Accept EVERYTHING except tess 
failures }; /* REJECT MAP VALUES */ diff --git a/ccstruct/statistc.cpp b/ccstruct/statistc.cpp index 39d5edd1..8b1ba8c9 100644 --- a/ccstruct/statistc.cpp +++ b/ccstruct/statistc.cpp @@ -1,8 +1,8 @@ /********************************************************************** * File: statistc.c (Formerly stats.c) * Description: Simple statistical package for integer values. - * Author: Ray Smith - * Created: Mon Feb 04 16:56:05 GMT 1991 + * Author: Ray Smith + * Created: Mon Feb 04 16:56:05 GMT 1991 * * (C) Copyright 1991, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); @@ -215,7 +215,6 @@ inT32 STATS::min_bucket() const { // Find min return rangemin_ + min; } - /********************************************************************** * STATS::max_bucket * diff --git a/ccutil/Makefile.am b/ccutil/Makefile.am index 76012006..9d3d83b2 100644 --- a/ccutil/Makefile.am +++ b/ccutil/Makefile.am @@ -14,11 +14,11 @@ endif include_HEADERS = \ basedir.h errcode.h fileerr.h genericvector.h helpers.h host.h memry.h \ ndminx.h params.h ocrclass.h platform.h serialis.h strngs.h \ - tesscallback.h unichar.h unicharmap.h unicharset.h + tesscallback.h unichar.h unicharcompress.h unicharmap.h unicharset.h noinst_HEADERS = \ ambigs.h bits16.h bitvector.h ccutil.h clst.h doubleptr.h elst2.h \ - elst.h genericheap.h globaloc.h hashfn.h indexmapbidi.h kdpair.h lsterr.h \ + elst.h genericheap.h globaloc.h indexmapbidi.h kdpair.h lsterr.h \ nwmain.h object_cache.h qrsequence.h sorthelper.h stderr.h \ scanutils.h tessdatamanager.h tprintf.h unicity_table.h unicodes.h \ universalambigs.h @@ -38,7 +38,7 @@ libtesseract_ccutil_la_SOURCES = \ mainblk.cpp memry.cpp \ serialis.cpp strngs.cpp scanutils.cpp \ tessdatamanager.cpp tprintf.cpp \ - unichar.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \ + unichar.cpp unicharcompress.cpp unicharmap.cpp unicharset.cpp unicodes.cpp \ params.cpp universalambigs.cpp if T_WIN diff --git a/ccutil/ambigs.h b/ccutil/ambigs.h 
index b278f9f3..faab2198 100644 --- a/ccutil/ambigs.h +++ b/ccutil/ambigs.h @@ -59,17 +59,18 @@ class UnicharIdArrayUtils { // less than length of array2, if any array1[i] is less than array2[i]. // Returns 0 if the arrays are equal, 1 otherwise. // The function assumes that the arrays are terminated by INVALID_UNICHAR_ID. - static inline int compare(const UNICHAR_ID array1[], - const UNICHAR_ID array2[]) { - const UNICHAR_ID *ptr1 = array1; - const UNICHAR_ID *ptr2 = array2; - while (*ptr1 != INVALID_UNICHAR_ID && *ptr2 != INVALID_UNICHAR_ID) { - if (*ptr1 != *ptr2) return *ptr1 < *ptr2 ? -1 : 1; - ++ptr1; - ++ptr2; + static inline int compare(const UNICHAR_ID *ptr1, const UNICHAR_ID *ptr2) { + for (;;) { + const UNICHAR_ID val1 = *ptr1++; + const UNICHAR_ID val2 = *ptr2++; + if (val1 != val2) { + if (val1 == INVALID_UNICHAR_ID) return -1; + if (val2 == INVALID_UNICHAR_ID) return 1; + if (val1 < val2) return -1; + return 1; + } + if (val1 == INVALID_UNICHAR_ID) return 0; } - if (*ptr1 == INVALID_UNICHAR_ID && *ptr2 == INVALID_UNICHAR_ID) return 0; - return *ptr1 == INVALID_UNICHAR_ID ? -1 : 1; } // Look uid in the vector of uids. 
If found, the index of the matched diff --git a/ccutil/bits16.h b/ccutil/bits16.h index 352b48be..6bbec4c0 100644 --- a/ccutil/bits16.h +++ b/ccutil/bits16.h @@ -37,12 +37,12 @@ class DLLSYM BITS16 void turn_on_bit( // flip specified bit uinT8 bit_num) { // bit to flip 0..7 val = val | 01 << bit_num; - }; + } void turn_off_bit( // flip specified bit uinT8 bit_num) { // bit to flip 0..7 val = val & ~(01 << bit_num); - }; + } void set_bit( // flip specified bit uinT8 bit_num, // bit to flip 0..7 @@ -51,11 +51,11 @@ class DLLSYM BITS16 val = val | 01 << bit_num; else val = val & ~(01 << bit_num); - }; + } BOOL8 bit( // access bit uinT8 bit_num) const { // bit to access return (val >> bit_num) & 01; - }; + } }; #endif diff --git a/ccutil/bitvector.h b/ccutil/bitvector.h index 5e748077..32c3f413 100644 --- a/ccutil/bitvector.h +++ b/ccutil/bitvector.h @@ -19,9 +19,8 @@ // /////////////////////////////////////////////////////////////////////// - -#ifndef TESSERACT_CCUTIL_BITVECTOR_H__ -#define TESSERACT_CCUTIL_BITVECTOR_H__ +#ifndef TESSERACT_CCUTIL_BITVECTOR_H_ +#define TESSERACT_CCUTIL_BITVECTOR_H_ #include #include @@ -140,4 +139,4 @@ class BitVector { } // namespace tesseract. -#endif // TESSERACT_CCUTIL_BITVECTOR_H__ +#endif // TESSERACT_CCUTIL_BITVECTOR_H_ diff --git a/ccutil/ccutil.cpp b/ccutil/ccutil.cpp index ecf2cb5e..8f965bb8 100644 --- a/ccutil/ccutil.cpp +++ b/ccutil/ccutil.cpp @@ -1,5 +1,14 @@ // Copyright 2008 Google Inc. All Rights Reserved. // Author: scharron@google.com (Samuel Charron) +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. #include "ccutil.h" diff --git a/ccutil/ccutil.h b/ccutil/ccutil.h index 974ef7d2..9f695b25 100644 --- a/ccutil/ccutil.h +++ b/ccutil/ccutil.h @@ -16,8 +16,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CCUTIL_CCUTIL_H__ -#define TESSERACT_CCUTIL_CCUTIL_H__ +#ifndef TESSERACT_CCUTIL_CCUTIL_H_ +#define TESSERACT_CCUTIL_CCUTIL_H_ #include "ambigs.h" #include "errcode.h" @@ -26,9 +26,7 @@ #include "params.h" #include "unicharset.h" -#ifdef _WIN32 -#include -#else +#ifndef _WIN32 #include #include #endif @@ -68,7 +66,6 @@ class CCUtil { STRING imagebasename; // name of image STRING lang; STRING language_data_path_prefix; - TessdataManager tessdata_manager; UNICHARSET unicharset; UnicharAmbigs unichar_ambigs; STRING imagefile; // image file name @@ -96,4 +93,4 @@ class CCUtil { extern CCUtilMutex tprintfMutex; // should remain global } // namespace tesseract -#endif // TESSERACT_CCUTIL_CCUTIL_H__ +#endif // TESSERACT_CCUTIL_CCUTIL_H_ diff --git a/ccutil/clst.cpp b/ccutil/clst.cpp index fbbb561f..52caadf3 100644 --- a/ccutil/clst.cpp +++ b/ccutil/clst.cpp @@ -26,7 +26,7 @@ **********************************************************************/ /*********************************************************************** - * CLIST::internal_deep_clear + * CLIST::internal_deep_clear * * Used by the "deep_clear" member function of derived list * classes to destroy all the elements on the list. @@ -56,9 +56,8 @@ void (*zapper) (void *)) { //ptr to zapper functn } } - /*********************************************************************** - * CLIST::shallow_clear + * CLIST::shallow_clear * * Used by the destructor and the "shallow_clear" member function of derived * list classes to destroy the list. 
@@ -83,7 +82,7 @@ void CLIST::shallow_clear() { //destroy all links } /*********************************************************************** - * CLIST::assign_to_sublist + * CLIST::assign_to_sublist * * The list is set to a sublist of another list. "This" list must be empty * before this function is invoked. The two iterators passed must refer to @@ -107,9 +106,8 @@ void CLIST::assign_to_sublist( //to this list last = start_it->extract_sublist (end_it); } - /*********************************************************************** - * CLIST::length + * CLIST::length * * Return count of elements on list **********************************************************************/ @@ -123,9 +121,8 @@ inT32 CLIST::length() const { //count elements return count; } - /*********************************************************************** - * CLIST::sort + * CLIST::sort * * Sort elements on list **********************************************************************/ @@ -239,7 +236,7 @@ void CLIST::set_subtract(int comparator(const void*, const void*), **********************************************************************/ /*********************************************************************** - * CLIST_ITERATOR::forward + * CLIST_ITERATOR::forward * * Move the iterator to the next element of the list. * REMEMBER: ALL LISTS ARE CIRCULAR. @@ -276,9 +273,8 @@ void *CLIST_ITERATOR::forward() { return current->data; } - /*********************************************************************** - * CLIST_ITERATOR::data_relative + * CLIST_ITERATOR::data_relative * * Return the data pointer to the element "offset" elements from current. * "offset" must not be less than -1. @@ -312,9 +308,8 @@ void *CLIST_ITERATOR::data_relative( //get data + or - ... return ptr->data; } - /*********************************************************************** - * CLIST_ITERATOR::move_to_last() + * CLIST_ITERATOR::move_to_last() * * Move current so that it is set to the end of the list. 
* Return data just in case anyone wants it. @@ -336,9 +331,8 @@ void *CLIST_ITERATOR::move_to_last() { return current->data; } - /*********************************************************************** - * CLIST_ITERATOR::exchange() + * CLIST_ITERATOR::exchange() * * Given another iterator, whose current element is a different element on * the same list list OR an element of another list, exchange the two current @@ -434,9 +428,8 @@ void CLIST_ITERATOR::exchange( //positions of 2 link other_it->current = old_current; } - /*********************************************************************** - * CLIST_ITERATOR::extract_sublist() + * CLIST_ITERATOR::extract_sublist() * * This is a private member, used only by CLIST::assign_to_sublist. * Given another iterator for the same list, extract the links from THIS to @@ -478,7 +471,7 @@ CLIST_LINK *CLIST_ITERATOR::extract_sublist( //from temp_it.mark_cycle_pt (); do { //walk sublist - if (temp_it.cycled_list ()) //can't find end pt + if (temp_it.cycled_list()) // can't find end pt BAD_SUBLIST.error ("CLIST_ITERATOR.extract_sublist", ABORT, NULL); if (temp_it.at_last ()) { diff --git a/ccutil/clst.h b/ccutil/clst.h index a5a42a6e..f93d75af 100644 --- a/ccutil/clst.h +++ b/ccutil/clst.h @@ -28,9 +28,9 @@ class CLIST_ITERATOR; /********************************************************************** - * CLASS - CLIST_LINK + * CLASS - CLIST_LINK * - * Generic link class for singly linked CONS cell lists + * Generic link class for singly linked CONS cell lists * * Note: No destructor - elements are assumed to be destroyed EITHER after * they have been extracted from a list OR by the CLIST destructor which @@ -50,13 +50,13 @@ class DLLSYM CLIST_LINK data = next = NULL; } - CLIST_LINK( //copy constructor - const CLIST_LINK &) { //don't copy link + CLIST_LINK( // copy constructor + const CLIST_LINK &) { // don't copy link data = next = NULL; } - void operator= ( //don't copy links - const CLIST_LINK &) { + void operator=( // don't 
copy links + const CLIST_LINK &) { data = next = NULL; } }; @@ -89,8 +89,8 @@ class DLLSYM CLIST void internal_deep_clear ( //destroy all links void (*zapper) (void *)); //ptr to zapper functn - void shallow_clear(); //clear list but don't - //delete data elements + void shallow_clear(); // clear list but don't + // delete data elements bool empty() const { //is list empty? return !last; @@ -136,9 +136,10 @@ class DLLSYM CLIST }; /*********************************************************************** - * CLASS - CLIST_ITERATOR + * CLASS - CLIST_ITERATOR * - * Generic iterator class for singly linked lists with embedded links + * Generic iterator class for singly linked lists with embedded + *links **********************************************************************/ class DLLSYM CLIST_ITERATOR @@ -231,8 +232,8 @@ class DLLSYM CLIST_ITERATOR BOOL8 cycled_list(); //Completed a cycle? - void add_to_end( //add at end & - void *new_data); //don't move + void add_to_end( // add at end & + void *new_data); // don't move void exchange( //positions of 2 links CLIST_ITERATOR *other_it); //other iterator @@ -246,7 +247,7 @@ class DLLSYM CLIST_ITERATOR }; /*********************************************************************** - * CLIST_ITERATOR::set_to_list + * CLIST_ITERATOR::set_to_list * * (Re-)initialise the iterator to point to the start of the list_to_iterate * over. 
@@ -270,9 +271,8 @@ inline void CLIST_ITERATOR::set_to_list( //change list ex_current_was_cycle_pt = FALSE; } - /*********************************************************************** - * CLIST_ITERATOR::CLIST_ITERATOR + * CLIST_ITERATOR::CLIST_ITERATOR * * CONSTRUCTOR - set iterator to specified list; **********************************************************************/ @@ -281,9 +281,8 @@ inline CLIST_ITERATOR::CLIST_ITERATOR(CLIST *list_to_iterate) { set_to_list(list_to_iterate); } - /*********************************************************************** - * CLIST_ITERATOR::add_after_then_move + * CLIST_ITERATOR::add_after_then_move * * Add a new element to the list after the current element and move the * iterator to the new element. @@ -329,9 +328,8 @@ inline void CLIST_ITERATOR::add_after_then_move( // element to add current = new_element; } - /*********************************************************************** - * CLIST_ITERATOR::add_after_stay_put + * CLIST_ITERATOR::add_after_stay_put * * Add a new element to the list after the current element but do not move * the iterator to the new element. @@ -380,9 +378,8 @@ inline void CLIST_ITERATOR::add_after_stay_put( // element to add } } - /*********************************************************************** - * CLIST_ITERATOR::add_before_then_move + * CLIST_ITERATOR::add_before_then_move * * Add a new element to the list before the current element and move the * iterator to the new element. @@ -425,9 +422,8 @@ inline void CLIST_ITERATOR::add_before_then_move( // element to add current = new_element; } - /*********************************************************************** - * CLIST_ITERATOR::add_before_stay_put + * CLIST_ITERATOR::add_before_stay_put * * Add a new element to the list before the current element but don't move the * iterator to the new element. 
@@ -471,11 +467,11 @@ inline void CLIST_ITERATOR::add_before_stay_put( // element to add } } - /*********************************************************************** - * CLIST_ITERATOR::add_list_after + * CLIST_ITERATOR::add_list_after * - * Insert another list to this list after the current element but don't move the + * Insert another list to this list after the current element but don't move + *the * iterator. **********************************************************************/ @@ -518,9 +514,8 @@ inline void CLIST_ITERATOR::add_list_after(CLIST *list_to_add) { } } - /*********************************************************************** - * CLIST_ITERATOR::add_list_before + * CLIST_ITERATOR::add_list_before * * Insert another list to this list before the current element. Move the * iterator to the start of the inserted elements @@ -563,9 +558,8 @@ inline void CLIST_ITERATOR::add_list_before(CLIST *list_to_add) { } } - /*********************************************************************** - * CLIST_ITERATOR::extract + * CLIST_ITERATOR::extract * * Do extraction by removing current from the list, deleting the cons cell * and returning the data to the caller, but NOT updating the iterator. (So @@ -606,9 +600,8 @@ inline void *CLIST_ITERATOR::extract() { return extracted_data; } - /*********************************************************************** - * CLIST_ITERATOR::move_to_first() + * CLIST_ITERATOR::move_to_first() * * Move current so that it is set to the start of the list. * Return data just in case anyone wants it. @@ -626,9 +619,8 @@ inline void *CLIST_ITERATOR::move_to_first() { return current != NULL ? current->data : NULL; } - /*********************************************************************** - * CLIST_ITERATOR::mark_cycle_pt() + * CLIST_ITERATOR::mark_cycle_pt() * * Remember the current location so that we can tell whether we've returned * to this point later. 
@@ -651,9 +643,8 @@ inline void CLIST_ITERATOR::mark_cycle_pt() { started_cycling = FALSE; } - /*********************************************************************** - * CLIST_ITERATOR::at_first() + * CLIST_ITERATOR::at_first() * * Are we at the start of the list? * @@ -671,9 +662,8 @@ inline BOOL8 CLIST_ITERATOR::at_first() { !ex_current_was_last)); //first and last } - /*********************************************************************** - * CLIST_ITERATOR::at_last() + * CLIST_ITERATOR::at_last() * * Are we at the end of the list? * @@ -691,9 +681,8 @@ inline BOOL8 CLIST_ITERATOR::at_last() { ex_current_was_last)); //first and last } - /*********************************************************************** - * CLIST_ITERATOR::cycled_list() + * CLIST_ITERATOR::cycled_list() * * Have we returned to the cycle_pt since it was set? * @@ -709,9 +698,8 @@ inline BOOL8 CLIST_ITERATOR::cycled_list() { } - /*********************************************************************** - * CLIST_ITERATOR::length() + * CLIST_ITERATOR::length() * * Return the length of the list * @@ -726,9 +714,8 @@ inline inT32 CLIST_ITERATOR::length() { return list->length (); } - /*********************************************************************** - * CLIST_ITERATOR::sort() + * CLIST_ITERATOR::sort() * * Sort the elements of the list, then reposition at the start. * @@ -747,9 +734,8 @@ const void *, const void *)) { move_to_first(); } - /*********************************************************************** - * CLIST_ITERATOR::add_to_end + * CLIST_ITERATOR::add_to_end * * Add a new element to the end of the list without moving the iterator. 
* This is provided because a single linked list cannot move to the last as @@ -811,7 +797,7 @@ The macro generates: - An element deletion function: CLASSNAME##_c1_zapper - An element copier function: CLASSNAME##_c1_copier - - A CLIST subclass: CLASSNAME##_CLIST + - A CLIST subclass: CLASSNAME##_CLIST - A CLIST_ITERATOR subclass: CLASSNAME##_C_IT @@ -830,114 +816,116 @@ CLISTIZEH is a concatenation of 3 fragments CLISTIZEH_A, CLISTIZEH_B and CLISTIZEH_C. ***********************************************************************/ -#define CLISTIZEH_A( CLASSNAME ) \ - \ -extern DLLSYM void CLASSNAME##_c1_zapper( /*delete a link*/ \ -void* link); /*link to delete*/ \ - \ -extern DLLSYM void* CLASSNAME##_c1_copier( /*deep copy a link*/ \ -void* old_element); /*source link */ +#define CLISTIZEH_A(CLASSNAME) \ + \ + extern DLLSYM void CLASSNAME##_c1_zapper( /*delete a link*/ \ + void *link); /*link to delete*/ \ + \ + extern DLLSYM void \ + *CLASSNAME##_c1_copier( /*deep copy a link*/ \ + void *old_element); /*source link */ -#define CLISTIZEH_B( CLASSNAME ) \ - \ -/*********************************************************************** \ -* CLASS - CLASSNAME##_CLIST \ -* \ -* List class for class CLASSNAME \ -* \ -**********************************************************************/ \ - \ -class DLLSYM CLASSNAME##_CLIST : public CLIST \ -{ \ -public: \ - CLASSNAME##_CLIST():CLIST() {} \ - /* constructor */ \ - \ - CLASSNAME##_CLIST( /* don't construct */ \ - const CLASSNAME##_CLIST&) /*by initial assign*/ \ - { DONT_CONSTRUCT_LIST_BY_COPY.error( QUOTE_IT( CLASSNAME##_CLIST ), \ - ABORT, NULL ); } \ - \ -void deep_clear() /* delete elements */ \ - { CLIST::internal_deep_clear( &CLASSNAME##_c1_zapper ); } \ - \ -void operator=( /* prevent assign */ \ - const CLASSNAME##_CLIST&) \ - { DONT_ASSIGN_LISTS.error( QUOTE_IT( CLASSNAME##_CLIST ), \ - ABORT, NULL ); } +#define CLISTIZEH_B(CLASSNAME) \ + \ + /*********************************************************************** \ + 
* CLASS - \ + *CLASSNAME##_CLIST \ + * \ + * List class for class \ + *CLASSNAME \ + * \ + **********************************************************************/ \ + \ + class DLLSYM CLASSNAME##_CLIST : public CLIST { \ + public: \ + CLASSNAME##_CLIST() : CLIST() {} \ + /* constructor */ \ + \ + CLASSNAME##_CLIST( /* don't construct */ \ + const CLASSNAME##_CLIST &) /*by initial assign*/ \ + { \ + DONT_CONSTRUCT_LIST_BY_COPY.error(QUOTE_IT(CLASSNAME##_CLIST), ABORT, \ + NULL); \ + } \ + \ + void deep_clear() /* delete elements */ \ + { \ + CLIST::internal_deep_clear(&CLASSNAME##_c1_zapper); \ + } \ + \ + void operator=(/* prevent assign */ \ + const CLASSNAME##_CLIST &) { \ + DONT_ASSIGN_LISTS.error(QUOTE_IT(CLASSNAME##_CLIST), ABORT, NULL); \ + } -#define CLISTIZEH_C( CLASSNAME ) \ - \ -}; \ - \ - \ - \ -/*********************************************************************** \ -* CLASS - CLASSNAME##_C_IT \ -* \ -* Iterator class for class CLASSNAME##_CLIST \ -* \ -* Note: We don't need to coerce pointers to member functions input \ -* parameters as these are automatically converted to the type of the base \ -* type. 
("A ptr to a class may be converted to a pointer to a public base \ -* class of that class") \ -**********************************************************************/ \ - \ -class DLLSYM CLASSNAME##_C_IT : public CLIST_ITERATOR \ -{ \ -public: \ - CLASSNAME##_C_IT():CLIST_ITERATOR(){} \ - \ - CLASSNAME##_C_IT( \ - CLASSNAME##_CLIST* list):CLIST_ITERATOR(list){} \ - \ - CLASSNAME* data() \ - { return (CLASSNAME*) CLIST_ITERATOR::data(); } \ - \ - CLASSNAME* data_relative( \ - inT8 offset) \ - { return (CLASSNAME*) CLIST_ITERATOR::data_relative( offset ); } \ - \ - CLASSNAME* forward() \ - { return (CLASSNAME*) CLIST_ITERATOR::forward(); } \ - \ - CLASSNAME* extract() \ - { return (CLASSNAME*) CLIST_ITERATOR::extract(); } \ - \ - CLASSNAME* move_to_first() \ - { return (CLASSNAME*) CLIST_ITERATOR::move_to_first(); } \ - \ - CLASSNAME* move_to_last() \ - { return (CLASSNAME*) CLIST_ITERATOR::move_to_last(); } \ -}; +#define CLISTIZEH_C(CLASSNAME) \ + } \ + ; \ + \ + /*********************************************************************** \ + * CLASS - CLASSNAME##_C_IT \ + * \ + * Iterator class for class CLASSNAME##_CLIST \ + * \ + * Note: We don't need to coerce pointers to member functions input \ + * parameters as these are automatically converted to the type of the base \ + * type. 
("A ptr to a class may be converted to a pointer to a public base \ + * class of that class") \ + **********************************************************************/ \ + \ + class DLLSYM CLASSNAME##_C_IT : public CLIST_ITERATOR { \ + public: \ + CLASSNAME##_C_IT() : CLIST_ITERATOR() {} \ + \ + CLASSNAME##_C_IT(CLASSNAME##_CLIST *list) : CLIST_ITERATOR(list) {} \ + \ + CLASSNAME *data() { return (CLASSNAME *)CLIST_ITERATOR::data(); } \ + \ + CLASSNAME *data_relative(inT8 offset) { \ + return (CLASSNAME *)CLIST_ITERATOR::data_relative(offset); \ + } \ + \ + CLASSNAME *forward() { return (CLASSNAME *)CLIST_ITERATOR::forward(); } \ + \ + CLASSNAME *extract() { return (CLASSNAME *)CLIST_ITERATOR::extract(); } \ + \ + CLASSNAME *move_to_first() { \ + return (CLASSNAME *)CLIST_ITERATOR::move_to_first(); \ + } \ + \ + CLASSNAME *move_to_last() { \ + return (CLASSNAME *)CLIST_ITERATOR::move_to_last(); \ + } \ + }; -#define CLISTIZEH( CLASSNAME ) \ - \ -CLISTIZEH_A( CLASSNAME ) \ - \ -CLISTIZEH_B( CLASSNAME ) \ - \ -CLISTIZEH_C( CLASSNAME ) +#define CLISTIZEH(CLASSNAME) \ + \ + CLISTIZEH_A(CLASSNAME) \ + \ + CLISTIZEH_B(CLASSNAME) \ + \ + CLISTIZEH_C(CLASSNAME) /*********************************************************************** CLISTIZE( CLASSNAME ) MACRO ***********************************************************************/ -#define CLISTIZE( CLASSNAME ) \ - \ -/*********************************************************************** \ -* CLASSNAME##_c1_zapper \ -* \ -* A function which can delete a CLASSNAME element. This is passed to the \ -* generic deep_clear list member function so that when a list is cleared the \ -* elements on the list are properly destroyed from the base class, even \ -* though we don't use a virtual destructor function. 
\ -**********************************************************************/ \ - \ -DLLSYM void CLASSNAME##_c1_zapper( /*delete a link*/ \ -void* link) /*link to delete*/ \ -{ \ -delete (CLASSNAME *) link; \ -} \ +#define CLISTIZE(CLASSNAME) \ + \ + /*********************************************************************** \ + * CLASSNAME##_c1_zapper \ + * \ + * A function which can delete a CLASSNAME element. This is passed to the \ + * generic deep_clear list member function so that when a list is cleared \ + *the \ + * elements on the list are properly destroyed from the base class, even \ + * though we don't use a virtual destructor function. \ + **********************************************************************/ \ + \ + DLLSYM void CLASSNAME##_c1_zapper( /*delete a link*/ \ + void *link) /*link to delete*/ \ + { \ + delete (CLASSNAME *)link; \ + } #endif diff --git a/ccutil/elst.cpp b/ccutil/elst.cpp index 8ad999b5..2d2c9ad6 100644 --- a/ccutil/elst.cpp +++ b/ccutil/elst.cpp @@ -26,7 +26,7 @@ **********************************************************************/ /*********************************************************************** - * ELIST::internal_clear + * ELIST::internal_clear * * Used by the destructor and the "clear" member function of derived list * classes to destroy all the elements on the list. @@ -57,7 +57,7 @@ void (*zapper) (ELIST_LINK *)) { } /*********************************************************************** - * ELIST::assign_to_sublist + * ELIST::assign_to_sublist * * The list is set to a sublist of another list. "This" list must be empty * before this function is invoked. 
The two iterators passed must refer to @@ -81,9 +81,8 @@ void ELIST::assign_to_sublist( //to this list last = start_it->extract_sublist (end_it); } - /*********************************************************************** - * ELIST::length + * ELIST::length * * Return count of elements on list **********************************************************************/ @@ -97,9 +96,8 @@ inT32 ELIST::length() const { // count elements return count; } - /*********************************************************************** - * ELIST::sort + * ELIST::sort * * Sort elements on list * NB If you don't like the const declarations in the comparator, coerce yours: @@ -187,7 +185,7 @@ ELIST_LINK *ELIST::add_sorted_and_find( **********************************************************************/ /*********************************************************************** - * ELIST_ITERATOR::forward + * ELIST_ITERATOR::forward * * Move the iterator to the next element of the list. * REMEMBER: ALL LISTS ARE CIRCULAR. @@ -224,9 +222,8 @@ ELIST_LINK *ELIST_ITERATOR::forward() { return current; } - /*********************************************************************** - * ELIST_ITERATOR::data_relative + * ELIST_ITERATOR::data_relative * * Return the data pointer to the element "offset" elements from current. * "offset" must not be less than -1. @@ -260,9 +257,8 @@ ELIST_LINK *ELIST_ITERATOR::data_relative( //get data + or - ... return ptr; } - /*********************************************************************** - * ELIST_ITERATOR::move_to_last() + * ELIST_ITERATOR::move_to_last() * * Move current so that it is set to the end of the list. * Return data just in case anyone wants it. 
@@ -281,9 +277,8 @@ ELIST_LINK *ELIST_ITERATOR::move_to_last() { return current; } - /*********************************************************************** - * ELIST_ITERATOR::exchange() + * ELIST_ITERATOR::exchange() * * Given another iterator, whose current element is a different element on * the same list list OR an element of another list, exchange the two current @@ -379,9 +374,8 @@ void ELIST_ITERATOR::exchange( //positions of 2 link other_it->current = old_current; } - /*********************************************************************** - * ELIST_ITERATOR::extract_sublist() + * ELIST_ITERATOR::extract_sublist() * * This is a private member, used only by ELIST::assign_to_sublist. * Given another iterator for the same list, extract the links from THIS to @@ -425,7 +419,7 @@ ELIST_LINK *ELIST_ITERATOR::extract_sublist( //from temp_it.mark_cycle_pt (); do { //walk sublist - if (temp_it.cycled_list ()) //can't find end pt + if (temp_it.cycled_list()) // can't find end pt BAD_SUBLIST.error ("ELIST_ITERATOR.extract_sublist", ABORT, NULL); if (temp_it.at_last ()) { diff --git a/ccutil/elst.h b/ccutil/elst.h index e2395774..d53a7c34 100644 --- a/ccutil/elst.h +++ b/ccutil/elst.h @@ -98,8 +98,8 @@ class DLLSYM ELIST_LINK next = NULL; } - void operator= ( //don't copy links - const ELIST_LINK &) { + void operator=( // don't copy links + const ELIST_LINK &) { next = NULL; } }; @@ -273,8 +273,8 @@ class DLLSYM ELIST_ITERATOR bool cycled_list(); //Completed a cycle? 
- void add_to_end( //add at end & - ELIST_LINK *new_link); //don't move + void add_to_end( // add at end & + ELIST_LINK *new_link); // don't move void exchange( //positions of 2 links ELIST_ITERATOR *other_it); //other iterator @@ -458,7 +458,6 @@ inline void ELIST_ITERATOR::add_before_then_move( // element to add current = new_element; } - /*********************************************************************** * ELIST_ITERATOR::add_before_stay_put * @@ -501,11 +500,11 @@ inline void ELIST_ITERATOR::add_before_stay_put( // element to add } } - /*********************************************************************** * ELIST_ITERATOR::add_list_after * - * Insert another list to this list after the current element but don't move the + * Insert another list to this list after the current element but don't move + *the * iterator. **********************************************************************/ @@ -959,30 +958,29 @@ ELISTIZEH_C( CLASSNAME ) ELISTIZE( CLASSNAME ) MACRO ***********************************************************************/ -#define ELISTIZE(CLASSNAME) \ - \ -/*********************************************************************** \ -* CLASSNAME##_zapper \ -* \ -* A function which can delete a CLASSNAME element. This is passed to the \ -* generic clear list member function so that when a list is cleared the \ -* elements on the list are properly destroyed from the base class, even \ -* though we don't use a virtual destructor function. 
\ -**********************************************************************/ \ - \ -DLLSYM void CLASSNAME##_zapper(ELIST_LINK* link) { \ - delete reinterpret_cast(link); \ -} \ - \ -/* Become a deep copy of src_list*/ \ -void CLASSNAME##_LIST::deep_copy(const CLASSNAME##_LIST* src_list, \ - CLASSNAME* (*copier)(const CLASSNAME*)) { \ - \ - CLASSNAME##_IT from_it(const_cast(src_list)); \ - CLASSNAME##_IT to_it(this); \ - \ - for (from_it.mark_cycle_pt(); !from_it.cycled_list(); from_it.forward()) \ - to_it.add_after_then_move((*copier)(from_it.data())); \ -} +#define ELISTIZE(CLASSNAME) \ + \ + /*********************************************************************** \ + * CLASSNAME##_zapper \ + * \ + * A function which can delete a CLASSNAME element. This is passed to the \ + * generic clear list member function so that when a list is cleared the \ + * elements on the list are properly destroyed from the base class, even \ + * though we don't use a virtual destructor function. \ + **********************************************************************/ \ + \ + DLLSYM void CLASSNAME##_zapper(ELIST_LINK *link) { \ + delete reinterpret_cast(link); \ + } \ + \ + /* Become a deep copy of src_list*/ \ + void CLASSNAME##_LIST::deep_copy(const CLASSNAME##_LIST *src_list, \ + CLASSNAME *(*copier)(const CLASSNAME *)) { \ + CLASSNAME##_IT from_it(const_cast(src_list)); \ + CLASSNAME##_IT to_it(this); \ + \ + for (from_it.mark_cycle_pt(); !from_it.cycled_list(); from_it.forward()) \ + to_it.add_after_then_move((*copier)(from_it.data())); \ + } #endif diff --git a/ccutil/elst2.cpp b/ccutil/elst2.cpp index 30cedec1..0d4960ed 100644 --- a/ccutil/elst2.cpp +++ b/ccutil/elst2.cpp @@ -27,7 +27,7 @@ **********************************************************************/ /*********************************************************************** - * ELIST2::internal_clear + * ELIST2::internal_clear * * Used by the destructor and the "clear" member function of derived list * classes to 
destroy all the elements on the list. @@ -58,7 +58,7 @@ void (*zapper) (ELIST2_LINK *)) { } /*********************************************************************** - * ELIST2::assign_to_sublist + * ELIST2::assign_to_sublist * * The list is set to a sublist of another list. "This" list must be empty * before this function is invoked. The two iterators passed must refer to @@ -82,9 +82,8 @@ void ELIST2::assign_to_sublist( //to this list last = start_it->extract_sublist (end_it); } - /*********************************************************************** - * ELIST2::length + * ELIST2::length * * Return count of elements on list **********************************************************************/ @@ -98,9 +97,8 @@ inT32 ELIST2::length() const { // count elements return count; } - /*********************************************************************** - * ELIST2::sort + * ELIST2::sort * * Sort elements on list * NB If you don't like the const declarations in the comparator, coerce yours: @@ -180,7 +178,7 @@ void ELIST2::add_sorted(int comparator(const void*, const void*), **********************************************************************/ /*********************************************************************** - * ELIST2_ITERATOR::forward + * ELIST2_ITERATOR::forward * * Move the iterator to the next element of the list. * REMEMBER: ALL LISTS ARE CIRCULAR. @@ -218,9 +216,8 @@ ELIST2_LINK *ELIST2_ITERATOR::forward() { return current; } - /*********************************************************************** - * ELIST2_ITERATOR::backward + * ELIST2_ITERATOR::backward * * Move the iterator to the previous element of the list. * REMEMBER: ALL LISTS ARE CIRCULAR. @@ -257,9 +254,8 @@ ELIST2_LINK *ELIST2_ITERATOR::backward() { return current; } - /*********************************************************************** - * ELIST2_ITERATOR::data_relative + * ELIST2_ITERATOR::data_relative * * Return the data pointer to the element "offset" elements from current. 
* (This function can't be INLINEd because it contains a loop) @@ -289,9 +285,8 @@ ELIST2_LINK *ELIST2_ITERATOR::data_relative( //get data + or - .. return ptr; } - /*********************************************************************** - * ELIST2_ITERATOR::exchange() + * ELIST2_ITERATOR::exchange() * * Given another iterator, whose current element is a different element on * the same list list OR an element of another list, exchange the two current @@ -399,9 +394,8 @@ void ELIST2_ITERATOR::exchange( //positions of 2 li other_it->current = old_current; } - /*********************************************************************** - * ELIST2_ITERATOR::extract_sublist() + * ELIST2_ITERATOR::extract_sublist() * * This is a private member, used only by ELIST2::assign_to_sublist. * Given another iterator for the same list, extract the links from THIS to @@ -445,7 +439,7 @@ ELIST2_LINK *ELIST2_ITERATOR::extract_sublist( //fr temp_it.mark_cycle_pt (); do { //walk sublist - if (temp_it.cycled_list ()) //can't find end pt + if (temp_it.cycled_list()) // can't find end pt BAD_SUBLIST.error ("ELIST2_ITERATOR.extract_sublist", ABORT, NULL); if (temp_it.at_last ()) { diff --git a/ccutil/elst2.h b/ccutil/elst2.h index 364abd86..bf078fbd 100644 --- a/ccutil/elst2.h +++ b/ccutil/elst2.h @@ -46,9 +46,9 @@ i) The duplication in source does not affect the run time code size - the **********************************************************************/ /********************************************************************** - * CLASS - ELIST2_LINK + * CLASS - ELIST2_LINK * - * Generic link class for doubly linked lists with embedded links + * Generic link class for doubly linked lists with embedded links * * Note: No destructor - elements are assumed to be destroyed EITHER after * they have been extracted from a list OR by the ELIST2 destructor which @@ -68,13 +68,13 @@ class DLLSYM ELIST2_LINK prev = next = NULL; } - ELIST2_LINK( //copy constructor - const ELIST2_LINK &) { //don't copy 
link + ELIST2_LINK( // copy constructor + const ELIST2_LINK &) { // don't copy link prev = next = NULL; } - void operator= ( //don't copy links - const ELIST2_LINK &) { + void operator=( // don't copy links + const ELIST2_LINK &) { prev = next = NULL; } }; @@ -142,9 +142,10 @@ class DLLSYM ELIST2 }; /*********************************************************************** - * CLASS - ELIST2_ITERATOR + * CLASS - ELIST2_ITERATOR * - * Generic iterator class for doubly linked lists with embedded links + * Generic iterator class for doubly linked lists with embedded + *links **********************************************************************/ class DLLSYM ELIST2_ITERATOR @@ -240,8 +241,8 @@ class DLLSYM ELIST2_ITERATOR BOOL8 cycled_list(); //Completed a cycle? - void add_to_end( //add at end & - ELIST2_LINK *new_link); //don't move + void add_to_end( // add at end & + ELIST2_LINK *new_link); // don't move void exchange( //positions of 2 links ELIST2_ITERATOR *other_it); //other iterator @@ -255,7 +256,7 @@ class DLLSYM ELIST2_ITERATOR }; /*********************************************************************** - * ELIST2_ITERATOR::set_to_list + * ELIST2_ITERATOR::set_to_list * * (Re-)initialise the iterator to point to the start of the list_to_iterate * over. 
@@ -279,9 +280,8 @@ inline void ELIST2_ITERATOR::set_to_list( //change list ex_current_was_cycle_pt = FALSE; } - /*********************************************************************** - * ELIST2_ITERATOR::ELIST2_ITERATOR + * ELIST2_ITERATOR::ELIST2_ITERATOR * * CONSTRUCTOR - set iterator to specified list; **********************************************************************/ @@ -290,9 +290,8 @@ inline ELIST2_ITERATOR::ELIST2_ITERATOR(ELIST2 *list_to_iterate) { set_to_list(list_to_iterate); } - /*********************************************************************** - * ELIST2_ITERATOR::add_after_then_move + * ELIST2_ITERATOR::add_after_then_move * * Add a new element to the list after the current element and move the * iterator to the new element. @@ -339,9 +338,8 @@ inline void ELIST2_ITERATOR::add_after_then_move( // element to add current = new_element; } - /*********************************************************************** - * ELIST2_ITERATOR::add_after_stay_put + * ELIST2_ITERATOR::add_after_stay_put * * Add a new element to the list after the current element but do not move * the iterator to the new element. @@ -391,9 +389,8 @@ inline void ELIST2_ITERATOR::add_after_stay_put( // element to add } } - /*********************************************************************** - * ELIST2_ITERATOR::add_before_then_move + * ELIST2_ITERATOR::add_before_then_move * * Add a new element to the list before the current element and move the * iterator to the new element. @@ -438,9 +435,8 @@ inline void ELIST2_ITERATOR::add_before_then_move( // element to add current = new_element; } - /*********************************************************************** - * ELIST2_ITERATOR::add_before_stay_put + * ELIST2_ITERATOR::add_before_stay_put * * Add a new element to the list before the current element but don't move the * iterator to the new element. 
@@ -486,11 +482,11 @@ inline void ELIST2_ITERATOR::add_before_stay_put( // element to add } } - /*********************************************************************** - * ELIST2_ITERATOR::add_list_after + * ELIST2_ITERATOR::add_list_after * - * Insert another list to this list after the current element but don't move the + * Insert another list to this list after the current element but don't move + *the * iterator. **********************************************************************/ @@ -537,9 +533,8 @@ inline void ELIST2_ITERATOR::add_list_after(ELIST2 *list_to_add) { } } - /*********************************************************************** - * ELIST2_ITERATOR::add_list_before + * ELIST2_ITERATOR::add_list_before * * Insert another list to this list before the current element. Move the * iterator to the start of the inserted elements @@ -586,9 +581,8 @@ inline void ELIST2_ITERATOR::add_list_before(ELIST2 *list_to_add) { } } - /*********************************************************************** - * ELIST2_ITERATOR::extract + * ELIST2_ITERATOR::extract * * Do extraction by removing current from the list, returning it to the * caller, but NOT updating the iterator. (So that any calling loop can do @@ -631,9 +625,8 @@ inline ELIST2_LINK *ELIST2_ITERATOR::extract() { return extracted_link; } - /*********************************************************************** - * ELIST2_ITERATOR::move_to_first() + * ELIST2_ITERATOR::move_to_first() * * Move current so that it is set to the start of the list. * Return data just in case anyone wants it. @@ -651,9 +644,8 @@ inline ELIST2_LINK *ELIST2_ITERATOR::move_to_first() { return current; } - /*********************************************************************** - * ELIST2_ITERATOR::move_to_last() + * ELIST2_ITERATOR::move_to_last() * * Move current so that it is set to the end of the list. * Return data just in case anyone wants it. 
@@ -671,9 +663,8 @@ inline ELIST2_LINK *ELIST2_ITERATOR::move_to_last() { return current; } - /*********************************************************************** - * ELIST2_ITERATOR::mark_cycle_pt() + * ELIST2_ITERATOR::mark_cycle_pt() * * Remember the current location so that we can tell whether we've returned * to this point later. @@ -696,9 +687,8 @@ inline void ELIST2_ITERATOR::mark_cycle_pt() { started_cycling = FALSE; } - /*********************************************************************** - * ELIST2_ITERATOR::at_first() + * ELIST2_ITERATOR::at_first() * * Are we at the start of the list? * @@ -716,9 +706,8 @@ inline BOOL8 ELIST2_ITERATOR::at_first() { !ex_current_was_last)); //first and last } - /*********************************************************************** - * ELIST2_ITERATOR::at_last() + * ELIST2_ITERATOR::at_last() * * Are we at the end of the list? * @@ -736,9 +725,8 @@ inline BOOL8 ELIST2_ITERATOR::at_last() { ex_current_was_last)); //first and last } - /*********************************************************************** - * ELIST2_ITERATOR::cycled_list() + * ELIST2_ITERATOR::cycled_list() * * Have we returned to the cycle_pt since it was set? * @@ -754,9 +742,8 @@ inline BOOL8 ELIST2_ITERATOR::cycled_list() { } - /*********************************************************************** - * ELIST2_ITERATOR::length() + * ELIST2_ITERATOR::length() * * Return the length of the list * @@ -771,9 +758,8 @@ inline inT32 ELIST2_ITERATOR::length() { return list->length (); } - /*********************************************************************** - * ELIST2_ITERATOR::sort() + * ELIST2_ITERATOR::sort() * * Sort the elements of the list, then reposition at the start. 
* @@ -792,9 +778,8 @@ const void *, const void *)) { move_to_first(); } - /*********************************************************************** - * ELIST2_ITERATOR::add_to_end + * ELIST2_ITERATOR::add_to_end * * Add a new element to the end of the list without moving the iterator. * This is provided because a single linked list cannot move to the last as @@ -854,7 +839,7 @@ will NOT work correctly for classes derived from this. The macro generates: - An element deletion function: CLASSNAME##_zapper - - An E_LIST2 subclass: CLASSNAME##_LIST + - An E_LIST2 subclass: CLASSNAME##_LIST - An E_LIST2_ITERATOR subclass: CLASSNAME##_IT @@ -873,132 +858,132 @@ ELIST2IZEH is a concatenation of 3 fragments ELIST2IZEH_A, ELIST2IZEH_B and ELIST2IZEH_C. ***********************************************************************/ -#define ELIST2IZEH_A( CLASSNAME ) \ - \ -extern DLLSYM void CLASSNAME##_zapper( /*delete a link*/ \ -ELIST2_LINK* link); /*link to delete*/ +#define ELIST2IZEH_A(CLASSNAME) \ + \ + extern DLLSYM void CLASSNAME##_zapper( /*delete a link*/ \ + ELIST2_LINK *link); /*link to delete*/ -#define ELIST2IZEH_B( CLASSNAME ) \ - \ -/*********************************************************************** \ -* CLASS - CLASSNAME##_LIST \ -* \ -* List class for class CLASSNAME \ -* \ -**********************************************************************/ \ - \ -class DLLSYM CLASSNAME##_LIST : public ELIST2 \ -{ \ -public: \ - CLASSNAME##_LIST():ELIST2() {} \ - /* constructor */ \ - \ - CLASSNAME##_LIST( /* don't construct */ \ - const CLASSNAME##_LIST&) /*by initial assign*/\ - { DONT_CONSTRUCT_LIST_BY_COPY.error( QUOTE_IT( CLASSNAME##_LIST ), \ - ABORT, NULL ); } \ - \ -void clear() /* delete elements */\ - { ELIST2::internal_clear( &CLASSNAME##_zapper ); } \ - \ - ~CLASSNAME##_LIST() /* destructor */ \ - { clear(); } \ -\ -/* Become a deep copy of src_list*/ \ -void deep_copy(const CLASSNAME##_LIST* src_list, \ - CLASSNAME* (*copier)(const CLASSNAME*)); \ -\ -void 
operator=( /* prevent assign */ \ - const CLASSNAME##_LIST&) \ - { DONT_ASSIGN_LISTS.error( QUOTE_IT( CLASSNAME##_LIST ), \ - ABORT, NULL ); } +#define ELIST2IZEH_B(CLASSNAME) \ + \ + /*********************************************************************** \ + * CLASS - \ + *CLASSNAME##_LIST \ + * \ + * List class for class \ + *CLASSNAME \ + * \ + **********************************************************************/ \ + \ + class DLLSYM CLASSNAME##_LIST : public ELIST2 { \ + public: \ + CLASSNAME##_LIST() : ELIST2() {} \ + /* constructor */ \ + \ + CLASSNAME##_LIST( /* don't construct */ \ + const CLASSNAME##_LIST &) /*by initial assign*/ \ + { \ + DONT_CONSTRUCT_LIST_BY_COPY.error(QUOTE_IT(CLASSNAME##_LIST), ABORT, \ + NULL); \ + } \ + \ + void clear() /* delete elements */ \ + { \ + ELIST2::internal_clear(&CLASSNAME##_zapper); \ + } \ + \ + ~CLASSNAME##_LIST() /* destructor */ \ + { \ + clear(); \ + } \ + \ + /* Become a deep copy of src_list*/ \ + void deep_copy(const CLASSNAME##_LIST *src_list, \ + CLASSNAME *(*copier)(const CLASSNAME *)); \ + \ + void operator=(/* prevent assign */ \ + const CLASSNAME##_LIST &) { \ + DONT_ASSIGN_LISTS.error(QUOTE_IT(CLASSNAME##_LIST), ABORT, NULL); \ + } -#define ELIST2IZEH_C( CLASSNAME ) \ -}; \ - \ - \ - \ -/*********************************************************************** \ -* CLASS - CLASSNAME##_IT \ -* \ -* Iterator class for class CLASSNAME##_LIST \ -* \ -* Note: We don't need to coerce pointers to member functions input \ -* parameters as these are automatically converted to the type of the base \ -* type. 
("A ptr to a class may be converted to a pointer to a public base \ -* class of that class") \ -**********************************************************************/ \ - \ -class DLLSYM CLASSNAME##_IT : public ELIST2_ITERATOR \ -{ \ -public: \ - CLASSNAME##_IT():ELIST2_ITERATOR(){} \ - \ - CLASSNAME##_IT( \ -CLASSNAME##_LIST* list):ELIST2_ITERATOR(list){} \ - \ - CLASSNAME* data() \ - { return (CLASSNAME*) ELIST2_ITERATOR::data(); } \ - \ - CLASSNAME* data_relative( \ - inT8 offset) \ - { return (CLASSNAME*) ELIST2_ITERATOR::data_relative( offset ); } \ - \ - CLASSNAME* forward() \ - { return (CLASSNAME*) ELIST2_ITERATOR::forward(); } \ - \ - CLASSNAME* backward() \ - { return (CLASSNAME*) ELIST2_ITERATOR::backward(); } \ - \ - CLASSNAME* extract() \ - { return (CLASSNAME*) ELIST2_ITERATOR::extract(); } \ - \ - CLASSNAME* move_to_first() \ - { return (CLASSNAME*) ELIST2_ITERATOR::move_to_first(); } \ - \ - CLASSNAME* move_to_last() \ - { return (CLASSNAME*) ELIST2_ITERATOR::move_to_last(); } \ -}; - -#define ELIST2IZEH( CLASSNAME ) \ - \ -ELIST2IZEH_A( CLASSNAME ) \ - \ -ELIST2IZEH_B( CLASSNAME ) \ - \ -ELIST2IZEH_C( CLASSNAME ) +#define ELIST2IZEH_C(CLASSNAME) \ + } \ + ; \ + \ + /*********************************************************************** \ + * CLASS - CLASSNAME##_IT \ + * \ + * Iterator class for class CLASSNAME##_LIST \ + * \ + * Note: We don't need to coerce pointers to member functions input \ + * parameters as these are automatically converted to the type of the base \ + * type. 
("A ptr to a class may be converted to a pointer to a public base \ + * class of that class") \ + **********************************************************************/ \ + \ + class DLLSYM CLASSNAME##_IT : public ELIST2_ITERATOR { \ + public: \ + CLASSNAME##_IT() : ELIST2_ITERATOR() {} \ + \ + CLASSNAME##_IT(CLASSNAME##_LIST *list) : ELIST2_ITERATOR(list) {} \ + \ + CLASSNAME *data() { return (CLASSNAME *)ELIST2_ITERATOR::data(); } \ + \ + CLASSNAME *data_relative(inT8 offset) { \ + return (CLASSNAME *)ELIST2_ITERATOR::data_relative(offset); \ + } \ + \ + CLASSNAME *forward() { return (CLASSNAME *)ELIST2_ITERATOR::forward(); } \ + \ + CLASSNAME *backward() { return (CLASSNAME *)ELIST2_ITERATOR::backward(); } \ + \ + CLASSNAME *extract() { return (CLASSNAME *)ELIST2_ITERATOR::extract(); } \ + \ + CLASSNAME *move_to_first() { \ + return (CLASSNAME *)ELIST2_ITERATOR::move_to_first(); \ + } \ + \ + CLASSNAME *move_to_last() { \ + return (CLASSNAME *)ELIST2_ITERATOR::move_to_last(); \ + } \ + }; +#define ELIST2IZEH(CLASSNAME) \ + \ + ELIST2IZEH_A(CLASSNAME) \ + \ + ELIST2IZEH_B(CLASSNAME) \ + \ + ELIST2IZEH_C(CLASSNAME) /*********************************************************************** ELIST2IZE( CLASSNAME ) MACRO ***********************************************************************/ -#define ELIST2IZE( CLASSNAME ) \ - \ -/*********************************************************************** \ -* CLASSNAME##_zapper \ -* \ -* A function which can delete a CLASSNAME element. This is passed to the \ -* generic clear list member function so that when a list is cleared the \ -* elements on the list are properly destroyed from the base class, even \ -* though we don't use a virtual destructor function. 
\ -**********************************************************************/ \ - \ -DLLSYM void CLASSNAME##_zapper( /*delete a link*/ \ -ELIST2_LINK* link) /*link to delete*/ \ -{ \ -delete (CLASSNAME *) link; \ -} \ -\ -/* Become a deep copy of src_list*/ \ -void CLASSNAME##_LIST::deep_copy(const CLASSNAME##_LIST* src_list, \ - CLASSNAME* (*copier)(const CLASSNAME*)) { \ -\ - CLASSNAME##_IT from_it(const_cast(src_list)); \ - CLASSNAME##_IT to_it(this); \ -\ - for (from_it.mark_cycle_pt(); !from_it.cycled_list(); from_it.forward()) \ - to_it.add_after_then_move((*copier)(from_it.data())); \ -} +#define ELIST2IZE(CLASSNAME) \ + \ + /*********************************************************************** \ + * CLASSNAME##_zapper \ + * \ + * A function which can delete a CLASSNAME element. This is passed to the \ + * generic clear list member function so that when a list is cleared the \ + * elements on the list are properly destroyed from the base class, even \ + * though we don't use a virtual destructor function. \ + **********************************************************************/ \ + \ + DLLSYM void CLASSNAME##_zapper( /*delete a link*/ \ + ELIST2_LINK *link) /*link to delete*/ \ + { \ + delete (CLASSNAME *)link; \ + } \ + \ + /* Become a deep copy of src_list*/ \ + void CLASSNAME##_LIST::deep_copy(const CLASSNAME##_LIST *src_list, \ + CLASSNAME *(*copier)(const CLASSNAME *)) { \ + CLASSNAME##_IT from_it(const_cast(src_list)); \ + CLASSNAME##_IT to_it(this); \ + \ + for (from_it.mark_cycle_pt(); !from_it.cycled_list(); from_it.forward()) \ + to_it.add_after_then_move((*copier)(from_it.data())); \ + } #endif diff --git a/ccutil/errcode.h b/ccutil/errcode.h index d6902400..2f31a7b9 100644 --- a/ccutil/errcode.h +++ b/ccutil/errcode.h @@ -87,11 +87,10 @@ const ERRCODE ASSERT_FAILED = "Assert failed"; __FILE__, __LINE__); \ } -#define ASSERT_HOST_MSG(x, ...) 
if (!(x)) \ - { \ - tprintf(__VA_ARGS__); \ - ASSERT_FAILED.error(#x, ABORT, "in file %s, line %d", \ - __FILE__, __LINE__); \ +#define ASSERT_HOST_MSG(x, ...) \ + if (!(x)) { \ + tprintf(__VA_ARGS__); \ + ASSERT_FAILED.error(#x, ABORT, "in file %s, line %d", __FILE__, __LINE__); \ } void signal_exit(int signal_code); diff --git a/ccutil/genericheap.h b/ccutil/genericheap.h index bb5f8ddc..b68439aa 100644 --- a/ccutil/genericheap.h +++ b/ccutil/genericheap.h @@ -19,12 +19,12 @@ // /////////////////////////////////////////////////////////////////////// -#include "errcode.h" -#include "genericvector.h" - #ifndef TESSERACT_CCUTIL_GENERICHEAP_H_ #define TESSERACT_CCUTIL_GENERICHEAP_H_ +#include "errcode.h" +#include "genericvector.h" + namespace tesseract { // GenericHeap requires 1 template argument: @@ -108,6 +108,8 @@ class GenericHeap { const Pair& PeekTop() const { return heap_[0]; } + // Get the value of the worst (largest, defined by operator< ) element. + const Pair& PeekWorst() const { return heap_[IndexOfWorst()]; } // Removes the top element of the heap. If entry is not NULL, the element // is copied into *entry, otherwise it is discarded. @@ -136,22 +138,12 @@ class GenericHeap { // not NULL, the element is copied into *entry, otherwise it is discarded. // Time = O(n). Returns false if the heap was already empty. bool PopWorst(Pair* entry) { - int heap_size = heap_.size(); - if (heap_size == 0) return false; // It cannot be empty! - - // Find the maximum element. Its index is guaranteed to be greater than - // the index of the parent of the last element, since by the heap invariant - // the parent must be less than or equal to the children. - int worst_index = heap_size - 1; - int end_parent = ParentNode(worst_index); - for (int i = worst_index - 1; i > end_parent; --i) { - if (heap_[worst_index] < heap_[i]) - worst_index = i; - } + int worst_index = IndexOfWorst(); + if (worst_index < 0) return false; // It cannot be empty! 
// Extract the worst element from the heap, leaving a hole at worst_index. if (entry != NULL) *entry = heap_[worst_index]; - --heap_size; + int heap_size = heap_.size() - 1; if (heap_size > 0) { // Sift the hole upwards to match the last element of the heap_ Pair hole_pair = heap_[heap_size]; @@ -162,6 +154,22 @@ class GenericHeap { return true; } + // Returns the index of the worst element. Time = O(n/2). + int IndexOfWorst() const { + int heap_size = heap_.size(); + if (heap_size == 0) return -1; // It cannot be empty! + + // Find the maximum element. Its index is guaranteed to be greater than + // the index of the parent of the last element, since by the heap invariant + // the parent must be less than or equal to the children. + int worst_index = heap_size - 1; + int end_parent = ParentNode(worst_index); + for (int i = worst_index - 1; i > end_parent; --i) { + if (heap_[worst_index] < heap_[i]) worst_index = i; + } + return worst_index; + } + // The pointed-to Pair has changed its key value, so the location of pair // is reshuffled to maintain the heap invariant. // Must be a valid pointer to an element of the heap_! diff --git a/ccutil/genericvector.h b/ccutil/genericvector.h index d867d892..eae15af2 100644 --- a/ccutil/genericvector.h +++ b/ccutil/genericvector.h @@ -162,7 +162,9 @@ class GenericVector { // Returns false on error or if the callback returns false. // DEPRECATED. Use [De]Serialize[Classes] instead. bool write(FILE* f, TessResultCallback2* cb) const; - bool read(FILE* f, TessResultCallback3* cb, bool swap); + bool read(tesseract::TFile* f, + TessResultCallback3* cb, + bool swap); // Writes a vector of simple types to the given file. Assumes that bitwise // read/write of T will work. Returns false in case of error. // TODO(rays) Change all callers to use TFile and remove deprecated methods. @@ -174,6 +176,8 @@ class GenericVector { // If swap is true, assumes a big/little-endian swap is needed. 
bool DeSerialize(bool swap, FILE* fp); bool DeSerialize(bool swap, tesseract::TFile* fp); + // Skips the deserialization of the vector. + static bool SkipDeSerialize(bool swap, tesseract::TFile* fp); // Writes a vector of classes to the given file. Assumes the existence of // bool T::Serialize(FILE* fp) const that returns false in case of error. // Returns false in case of error. @@ -186,6 +190,8 @@ class GenericVector { // If swap is true, assumes a big/little-endian swap is needed. bool DeSerializeClasses(bool swap, FILE* fp); bool DeSerializeClasses(bool swap, tesseract::TFile* fp); + // Calls SkipDeSerialize on the elements of the vector. + static bool SkipDeSerializeClasses(bool swap, tesseract::TFile* fp); // Allocates a new array of double the current_size, copies over the // information from data to the new location, deletes data and returns @@ -238,14 +244,13 @@ class GenericVector { int binary_search(const T& target) const { int bottom = 0; int top = size_used_; - do { + while (top - bottom > 1) { int middle = (bottom + top) / 2; if (data_[middle] > target) top = middle; else bottom = middle; } - while (top - bottom > 1); return bottom; } @@ -360,8 +365,7 @@ inline bool LoadDataFromFile(const STRING& filename, fseek(fp, 0, SEEK_END); size_t size = ftell(fp); fseek(fp, 0, SEEK_SET); - // Pad with a 0, just in case we treat the result as a string. - data->init_to_size((int)size + 1, 0); + data->init_to_size(static_cast(size), 0); bool result = fread(&(*data)[0], 1, size, fp) == size; fclose(fp); return result; @@ -377,6 +381,17 @@ inline bool SaveDataToFile(const GenericVector& data, fclose(fp); return result; } +// Reads a file as a vector of STRING. 
+inline bool LoadFileLinesToStrings(const STRING& filename, + GenericVector* lines) { + GenericVector data; + if (!LoadDataFromFile(filename.string(), &data)) { + return false; + } + STRING lines_str(&data[0], data.size()); + lines_str.split('\n', lines); + return true; +} template bool cmp_eq(T const & t1, T const & t2) { @@ -556,34 +571,54 @@ class PointerVector : public GenericVector { } bool DeSerialize(bool swap, TFile* fp) { inT32 reserved; - if (fp->FRead(&reserved, sizeof(reserved), 1) != 1) return false; - if (swap) Reverse32(&reserved); + if (!DeSerializeSize(swap, fp, &reserved)) return false; GenericVector::reserve(reserved); truncate(0); for (int i = 0; i < reserved; ++i) { - inT8 non_null; - if (fp->FRead(&non_null, sizeof(non_null), 1) != 1) return false; - T* item = NULL; - if (non_null) { - item = new T; - if (!item->DeSerialize(swap, fp)) { - delete item; - return false; - } - this->push_back(item); - } else { - // Null elements should keep their place in the vector. - this->push_back(NULL); + if (!DeSerializeElement(swap, fp)) return false; + } + return true; + } + // Enables deserialization of a selection of elements. Note that in order to + // retain the integrity of the stream, the caller must call some combination + // of DeSerializeElement and DeSerializeSkip of the exact number returned in + // *size, assuming a true return. + static bool DeSerializeSize(bool swap, TFile* fp, inT32* size) { + if (fp->FRead(size, sizeof(*size), 1) != 1) return false; + if (swap) Reverse32(size); + return true; + } + // Reads and appends to the vector the next element of the serialization. + bool DeSerializeElement(bool swap, TFile* fp) { + inT8 non_null; + if (fp->FRead(&non_null, sizeof(non_null), 1) != 1) return false; + T* item = NULL; + if (non_null) { + item = new T; + if (!item->DeSerialize(swap, fp)) { + delete item; + return false; } + this->push_back(item); + } else { + // Null elements should keep their place in the vector. 
+ this->push_back(NULL); + } + return true; + } + // Skips the next element of the serialization. + static bool DeSerializeSkip(bool swap, TFile* fp) { + inT8 non_null; + if (fp->FRead(&non_null, sizeof(non_null), 1) != 1) return false; + if (non_null) { + if (!T::SkipDeSerialize(swap, fp)) return false; } return true; } // Sorts the items pointed to by the members of this vector using // t::operator<(). - void sort() { - sort(&sort_ptr_cmp); - } + void sort() { this->GenericVector::sort(&sort_ptr_cmp); } }; } // namespace tesseract @@ -852,15 +887,14 @@ bool GenericVector::write( } template -bool GenericVector::read(FILE* f, - TessResultCallback3* cb, - bool swap) { +bool GenericVector::read( + tesseract::TFile* f, + TessResultCallback3* cb, bool swap) { inT32 reserved; - if (fread(&reserved, sizeof(reserved), 1, f) != 1) return false; - if (swap) Reverse32(&reserved); + if (f->FReadEndian(&reserved, sizeof(reserved), 1, swap) != 1) return false; reserve(reserved); - if (fread(&size_used_, sizeof(size_used_), 1, f) != 1) return false; - if (swap) Reverse32(&size_used_); + if (f->FReadEndian(&size_used_, sizeof(size_used_), 1, swap) != 1) + return false; if (cb != NULL) { for (int i = 0; i < size_used_; ++i) { if (!cb->Run(f, data_ + i, swap)) { @@ -870,11 +904,8 @@ bool GenericVector::read(FILE* f, } delete cb; } else { - if (fread(data_, sizeof(T), size_used_, f) != size_used_) return false; - if (swap) { - for (int i = 0; i < size_used_; ++i) - ReverseN(&data_[i], sizeof(T)); - } + if (f->FReadEndian(data_, sizeof(T), size_used_, swap) != size_used_) + return false; } return true; } @@ -926,6 +957,13 @@ bool GenericVector::DeSerialize(bool swap, tesseract::TFile* fp) { } return true; } +template +bool GenericVector::SkipDeSerialize(bool swap, tesseract::TFile* fp) { + inT32 reserved; + if (fp->FRead(&reserved, sizeof(reserved), 1) != 1) return false; + if (swap) Reverse32(&reserved); + return fp->FRead(NULL, sizeof(T), reserved) == reserved; +} // Writes a 
vector of classes to the given file. Assumes the existence of // bool T::Serialize(FILE* fp) const that returns false in case of error. @@ -976,6 +1014,16 @@ bool GenericVector::DeSerializeClasses(bool swap, tesseract::TFile* fp) { } return true; } +template +bool GenericVector::SkipDeSerializeClasses(bool swap, tesseract::TFile* fp) { + uinT32 reserved; + if (fp->FRead(&reserved, sizeof(reserved), 1) != 1) return false; + if (swap) Reverse32(&reserved); + for (int i = 0; i < reserved; ++i) { + if (!T::SkipDeSerialize(swap, fp)) return false; + } + return true; +} // This method clear the current object, then, does a shallow copy of // its argument, and finally invalidates its argument. diff --git a/ccutil/hashfn.h b/ccutil/hashfn.h deleted file mode 100644 index be211b07..00000000 --- a/ccutil/hashfn.h +++ /dev/null @@ -1,90 +0,0 @@ -/********************************************************************** - * File: hashfn.h (Formerly hash.h) - * Description: Portability hacks for hash_map, hash_set and unique_ptr. - * Author: Ray Smith - * Created: Wed Jan 08 14:08:25 PST 2014 - * - * (C) Copyright 2014, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -#ifndef HASHFN_H -#define HASHFN_H - -#ifdef USE_STD_NAMESPACE -#if (__cplusplus >= 201103L) || defined(_MSC_VER) // Visual Studio -#include -#include -#define hash_map std::unordered_map -#if (_MSC_VER >= 1500 && _MSC_VER < 1600) // Visual Studio 2008 -using namespace std::tr1; -#else // _MSC_VER -using std::unordered_map; -using std::unordered_set; -#include -#define SmartPtr std::unique_ptr -#define HAVE_UNIQUE_PTR -#endif // _MSC_VER -#elif (defined(__GNUC__) && (((__GNUC__ == 3) && (__GNUC_MINOR__ > 0)) || \ - __GNUC__ >= 4)) // gcc -// hash_set is deprecated in gcc -#include -#include -using __gnu_cxx::hash_map; -using __gnu_cxx::hash_set; -#define unordered_map hash_map -#define unordered_set hash_set -#else -#include -#include -#endif // gcc -#elif (__clang__) -#include -#include -#define hash_map std::unordered_map -#define unordered_set std::unordered_set -#else // USE_STD_NAMESPACE -#include -#include -#define unordered_map hash_map -#define unordered_set hash_set -#endif // USE_STD_NAMESPACE - -#ifndef HAVE_UNIQUE_PTR -// Trivial smart ptr. Expand to add features of std::unique_ptr as required. -template class SmartPtr { - public: - SmartPtr() : ptr_(NULL) {} - explicit SmartPtr(T* ptr) : ptr_(ptr) {} - ~SmartPtr() { - delete ptr_; - } - - T* get() const { - return ptr_; - } - void reset(T* ptr) { - if (ptr_ != NULL) delete ptr_; - ptr_ = ptr; - } - bool operator==(const T* ptr) const { - return ptr_ == ptr; - } - T* operator->() const { - return ptr_; - } - private: - T* ptr_; -}; -#endif // HAVE_UNIQUE_PTR - -#endif // HASHFN_H diff --git a/ccutil/helpers.h b/ccutil/helpers.h index 51dd3b0b..33ffd6c4 100644 --- a/ccutil/helpers.h +++ b/ccutil/helpers.h @@ -27,6 +27,8 @@ #include #include +#include +#include #include "host.h" @@ -43,6 +45,11 @@ class TRand { void set_seed(uinT64 seed) { seed_ = seed; } + // Sets the seed using a hash of a string. 
+ void set_seed(const std::string& str) { + std::hash hasher; + set_seed(static_cast(hasher(str))); + } // Returns an integer in the range 0 to MAX_INT32. inT32 IntRand() { @@ -73,7 +80,7 @@ class TRand { // Remove newline (if any) at the end of the string. inline void chomp_string(char *str) { - int last_index = (int)strlen(str) - 1; + int last_index = static_cast(strlen(str)) - 1; while (last_index >= 0 && (str[last_index] == '\n' || str[last_index] == '\r')) { str[last_index--] = '\0'; diff --git a/ccutil/host.h b/ccutil/host.h index 3ec3bcb7..1845128e 100644 --- a/ccutil/host.h +++ b/ccutil/host.h @@ -42,8 +42,8 @@ ** limitations under the License. */ -#ifndef __HOST__ -#define __HOST__ +#ifndef TESSERACT_CCUTIL_HOST_H_ +#define TESSERACT_CCUTIL_HOST_H_ /****************************************************************************** ** IMPORTANT!!! ** @@ -59,13 +59,17 @@ ** of the computer and/or operating system. ******************************************************************************/ +#include #include "platform.h" /* _WIN32 */ #ifdef _WIN32 #include -#include // winbase.h contains windows.h +#undef min +#undef max #endif +#include // int32_t, ... 
+ /********************************************************/ /* __MAC__ */ #ifdef __MAC__ @@ -95,19 +99,14 @@ //typedef HANDLE FD* PHANDLE; // definitions of portable data types (numbers and characters) -typedef SIGNED char inT8; -typedef unsigned char uinT8; -typedef short inT16; -typedef unsigned short uinT16; -typedef int inT32; -typedef unsigned int uinT32; -#if (_MSC_VER >= 1200) //%%% vkr for VC 6.0 -typedef INT64 inT64; -typedef UINT64 uinT64; -#else -typedef long long int inT64; -typedef unsigned long long int uinT64; -#endif //%%% vkr for VC 6.0 +typedef int8_t inT8; +typedef uint8_t uinT8; +typedef int16_t inT16; +typedef uint16_t uinT16; +typedef int32_t inT32; +typedef uint32_t uinT32; +typedef int64_t inT64; +typedef uint64_t uinT64; typedef float FLOAT32; typedef double FLOAT64; typedef unsigned char BOOL8; @@ -121,15 +120,16 @@ typedef unsigned char BOOL8; #define MAX_UINT8 0xff #define MAX_UINT16 0xffff #define MAX_UINT32 0xffffffff -#define MAX_FLOAT32 ((float)3.40282347e+38) +#define MAX_FLOAT32 std::numeric_limits::max() -#define MIN_INT8 0x80 -#define MIN_INT16 0x8000 -#define MIN_INT32 static_cast(0x80000000) +#define MIN_INT8 static_cast(0x80) +#define MIN_INT16 static_cast(0x8000) +#define MIN_INT32 static_cast(0x80000000) #define MIN_UINT8 0x00 #define MIN_UINT16 0x0000 #define MIN_UINT32 0x00000000 -#define MIN_FLOAT32 ((float)1.17549435e-38) +// Minimum positive value ie 1e-37ish. 
+#define MIN_FLOAT32 std::numeric_limits::min() // Defines #ifndef TRUE @@ -146,4 +146,4 @@ template bool NearlyEqual(T x, T y, T tolerance) { return diff <= tolerance && -diff <= tolerance; } -#endif +#endif // TESSERACT_CCUTIL_HOST_H_ diff --git a/ccutil/lsterr.h b/ccutil/lsterr.h index 42ed07e3..e97d713e 100644 --- a/ccutil/lsterr.h +++ b/ccutil/lsterr.h @@ -17,10 +17,10 @@ * **********************************************************************/ -#include "errcode.h" //must be last include +#ifndef TESSERACT_CCUTIL_LSTERR_H_ +#define TESSERACT_CCUTIL_LSTERR_H_ -#ifndef LSTERR_H -#define LSTERR_H +#include "errcode.h" //must be last include const ERRCODE DONT_CONSTRUCT_LIST_BY_COPY = "Can't create a list by assignment"; @@ -38,6 +38,7 @@ const ERRCODE NULL_PREV = "Previous element on the list is NULL"; const ERRCODE EMPTY_LIST = "List is empty"; const ERRCODE BAD_PARAMETER = "List parameter error"; const ERRCODE STILL_LINKED = -"Attempting to add an element with non NULL links, to a list"; -#endif -#endif + "Attempting to add an element with non NULL links, to a list"; + +#endif // !NDEBUG +#endif // TESSERACT_CCUTIL_LSTERR_H_ diff --git a/ccutil/mainblk.cpp b/ccutil/mainblk.cpp index aa73c550..2ef01c2f 100644 --- a/ccutil/mainblk.cpp +++ b/ccutil/mainblk.cpp @@ -55,7 +55,7 @@ void CCUtil::main_setup(const char *argv0, const char *basename) { char *tessdata_prefix = getenv("TESSDATA_PREFIX"); - if (argv0 != NULL) { + if (argv0 != NULL && *argv0 != '\0') { /* Use tessdata prefix from the command line. */ datadir = argv0; } else if (tessdata_prefix) { @@ -74,7 +74,7 @@ void CCUtil::main_setup(const char *argv0, const char *basename) { #endif /* _WIN32 */ #if defined(TESSDATA_PREFIX) } else { - /* Use tessdata prefix which was compiled in. */ +/* Use tessdata prefix which was compiled in. 
*/ #define _STR(a) #a #define _XSTR(a) _STR(a) datadir = _XSTR(TESSDATA_PREFIX); diff --git a/ccutil/ocrclass.h b/ccutil/ocrclass.h index 3175a6d2..cb83c6d6 100644 --- a/ccutil/ocrclass.h +++ b/ccutil/ocrclass.h @@ -1,7 +1,7 @@ /********************************************************************** * File: ocrclass.h * Description: Class definitions and constants for the OCR API. - * Author: Hewlett-Packard Co + * Author: Hewlett-Packard Co * * (C) Copyright 1996, Hewlett-Packard Co. ** Licensed under the Apache License, Version 2.0 (the "License"); @@ -29,7 +29,6 @@ #ifndef __GNUC__ #ifdef _WIN32 -#include #include "gettimeofday.h" #endif #else @@ -110,28 +109,35 @@ typedef struct { /*single character */ * user words found. If it returns true then operation is cancelled. **********************************************************************/ typedef bool (*CANCEL_FUNC)(void* cancel_this, int words); -typedef bool (*PROGRESS_FUNC)(int progress, - int left, int right, int top, int bottom); +typedef bool (*PROGRESS_FUNC)(int progress, int left, int right, int top, + int bottom); class ETEXT_DESC { // output header public: - inT16 count; /// chars in this buffer(0) - inT16 progress; /// percent complete increasing (0-100) + inT16 count; /// chars in this buffer(0) + inT16 progress; /// percent complete increasing (0-100) /** Progress monitor covers word recognition and it does not cover layout * analysis. * See Ray comment in https://github.com/tesseract-ocr/tesseract/pull/27 */ - inT8 more_to_come; /// true if not last - volatile inT8 ocr_alive; /// ocr sets to 1, HP 0 - inT8 err_code; /// for errcode use - CANCEL_FUNC cancel; /// returns true to cancel - PROGRESS_FUNC progress_callback; /// called whenever progress increases - void* cancel_this; /// this or other data for cancel - struct timeval end_time; /** time to stop. 
expected to be set only by call - * to set_deadline_msecs() */ - EANYCODE_CHAR text[1]; /// character data + inT8 more_to_come; /// true if not last + volatile inT8 ocr_alive; /// ocr sets to 1, HP 0 + inT8 err_code; /// for errcode use + CANCEL_FUNC cancel; /// returns true to cancel + PROGRESS_FUNC progress_callback; /// called whenever progress increases + void* cancel_this; /// this or other data for cancel + struct timeval end_time; /// Time to stop. Expected to be set only + /// by call to set_deadline_msecs(). + EANYCODE_CHAR text[1]; /// character data - ETEXT_DESC() : count(0), progress(0), more_to_come(0), ocr_alive(0), - err_code(0), cancel(NULL), cancel_this(NULL) { + ETEXT_DESC() + : count(0), + progress(0), + more_to_come(0), + ocr_alive(0), + err_code(0), + cancel(NULL), + progress_callback(NULL), + cancel_this(NULL) { end_time.tv_sec = 0; end_time.tv_usec = 0; } diff --git a/ccutil/params.cpp b/ccutil/params.cpp index ce2b4300..7fd1c5bb 100644 --- a/ccutil/params.cpp +++ b/ccutil/params.cpp @@ -31,9 +31,8 @@ #define EQUAL '=' tesseract::ParamsVectors *GlobalParams() { - static tesseract::ParamsVectors *global_params = - new tesseract::ParamsVectors(); - return global_params; + static tesseract::ParamsVectors global_params = tesseract::ParamsVectors(); + return &global_params; } namespace tesseract { @@ -42,8 +41,6 @@ bool ParamUtils::ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params) { inT16 nameoffset; // offset for real name - FILE *fp; // file pointer - // iterators if (*file == PLUS) { nameoffset = 1; @@ -53,27 +50,23 @@ bool ParamUtils::ReadParamsFile(const char *file, nameoffset = 0; } - fp = fopen(file + nameoffset, "rb"); - if (fp == NULL) { + TFile fp; + if (!fp.Open(file + nameoffset, nullptr)) { tprintf("read_params_file: Can't open %s\n", file + nameoffset); return true; } - const bool anyerr = ReadParamsFromFp(fp, -1, constraint, member_params); - fclose(fp); - return anyerr; + return 
ReadParamsFromFp(constraint, &fp, member_params); } -bool ParamUtils::ReadParamsFromFp(FILE *fp, inT64 end_offset, - SetParamConstraint constraint, +bool ParamUtils::ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params) { char line[MAX_PATH]; // input line bool anyerr = false; // true if any error bool foundit; // found parameter char *valptr; // value field - while ((end_offset < 0 || ftell(fp) < end_offset) && - fgets(line, MAX_PATH, fp)) { - if (line[0] != '\n' && line[0] != '#') { + while (fp->FGets(line, MAX_PATH) != nullptr) { + if (line[0] != '\r' && line[0] != '\n' && line[0] != '#') { chomp_string(line); // remove newline for (valptr = line; *valptr && *valptr != ' ' && *valptr != '\t'; valptr++); diff --git a/ccutil/params.h b/ccutil/params.h index d49ce3ff..b49015f3 100644 --- a/ccutil/params.h +++ b/ccutil/params.h @@ -60,9 +60,8 @@ class ParamUtils { SetParamConstraint constraint, ParamsVectors *member_params); - // Read parameters from the given file pointer (stop at end_offset). - static bool ReadParamsFromFp(FILE *fp, inT64 end_offset, - SetParamConstraint constraint, + // Read parameters from the given file pointer. + static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params); // Set a parameters to have the given value. 
diff --git a/ccutil/platform.h b/ccutil/platform.h index 219f9e31..0662f9bb 100644 --- a/ccutil/platform.h +++ b/ccutil/platform.h @@ -17,13 +17,19 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CCUTIL_PLATFORM_H__ -#define TESSERACT_CCUTIL_PLATFORM_H__ +#ifndef TESSERACT_CCUTIL_PLATFORM_H_ +#define TESSERACT_CCUTIL_PLATFORM_H_ #include #define DLLSYM #ifdef _WIN32 +#ifndef NOMINMAX +#define NOMINMAX +#endif /* NOMINMAX */ +#ifndef WIN32_LEAN_AND_MEAN +#define WIN32_LEAN_AND_MEAN +#endif #ifdef __GNUC__ #define ultoa _ultoa #endif /* __GNUC__ */ @@ -77,12 +83,4 @@ #endif #endif -#if defined(_WIN32) || defined(__CYGWIN__) - #define _TESS_FILE_BASENAME_ \ - (strrchr(__FILE__, '\\') ? strrchr(__FILE__, '\\') + 1 : __FILE__) -#else // Unices - #define _TESS_FILE_BASENAME_ \ - (strrchr(__FILE__, '/') ? strrchr(__FILE__, '/') + 1 : __FILE__) -#endif - -#endif // TESSERACT_CCUTIL_PLATFORM_H__ +#endif // TESSERACT_CCUTIL_PLATFORM_H_ diff --git a/ccutil/scanutils.cpp b/ccutil/scanutils.cpp index cba7d549..254eaf28 100644 --- a/ccutil/scanutils.cpp +++ b/ccutil/scanutils.cpp @@ -281,7 +281,6 @@ static int tvfscanf(FILE* stream, const char *format, va_list ap) { } state = ST_NORMAL; char *sarg = NULL; // %s %c or %[ string argument enum Bail bail = BAIL_NONE; - int sign; int converted = 0; // Successful conversions unsigned long matchmap[((1 << CHAR_BIT)+(CHAR_BIT * sizeof(long) - 1)) / (CHAR_BIT * sizeof(long))]; @@ -363,29 +362,29 @@ static int tvfscanf(FILE* stream, const char *format, va_list ap) { case 'P': // Upper case pointer case 'p': // Pointer rank = RANK_PTR; - base = 0; sign = 0; - goto scan_int; + base = 0; + goto scan_int; case 'i': // Base-independent integer - base = 0; sign = 1; - goto scan_int; + base = 0; + goto scan_int; case 'd': // Decimal integer - base = 10; sign = 1; - goto scan_int; + base = 10; + goto scan_int; case 'o': // Octal integer - base = 8; sign = 0; - goto scan_int; + base = 8; + goto 
scan_int; case 'u': // Unsigned decimal integer - base = 10; sign = 0; - goto scan_int; + base = 10; + goto scan_int; case 'x': // Hexadecimal integer case 'X': - base = 16; sign = 0; - goto scan_int; + base = 16; + goto scan_int; case 'n': // Number of characters consumed val = ftell(stream) - start_off; diff --git a/ccutil/serialis.cpp b/ccutil/serialis.cpp index ff3b278a..80dd9b97 100644 --- a/ccutil/serialis.cpp +++ b/ccutil/serialis.cpp @@ -88,6 +88,17 @@ char* TFile::FGets(char* buffer, int buffer_size) { return size > 0 ? buffer : NULL; } +int TFile::FReadEndian(void* buffer, int size, int count, bool swap) { + int num_read = FRead(buffer, size, count); + if (swap) { + char* char_buffer = reinterpret_cast(buffer); + for (int i = 0; i < num_read; ++i, char_buffer += size) { + ReverseN(char_buffer, size); + } + } + return num_read; +} + int TFile::FRead(void* buffer, int size, int count) { ASSERT_HOST(!is_writing_); int required_size = size * count; @@ -95,7 +106,7 @@ int TFile::FRead(void* buffer, int size, int count) { char* char_buffer = reinterpret_cast(buffer); if (data_->size() - offset_ < required_size) required_size = data_->size() - offset_; - if (required_size > 0) + if (required_size > 0 && char_buffer != NULL) memcpy(char_buffer, &(*data_)[offset_], required_size); offset_ += required_size; return required_size / size; diff --git a/ccutil/serialis.h b/ccutil/serialis.h index 8dfac1d7..f5f98a2c 100644 --- a/ccutil/serialis.h +++ b/ccutil/serialis.h @@ -67,6 +67,10 @@ class TFile { // the line is longer. Does nothing if buffer_size <= 0. // To use fscanf use FGets and sscanf. char* FGets(char* buffer, int buffer_size); + // Replicates fread, followed by a swap of the bytes if needed, returning the + // number of items read. If swap is true then the count items will each have + // size bytes reversed. + int FReadEndian(void* buffer, int size, int count, bool swap); // Replicates fread, returning the number of items read. 
int FRead(void* buffer, int size, int count); // Resets the TFile as if it has been Opened, but nothing read. diff --git a/ccutil/strngs.cpp b/ccutil/strngs.cpp index ff3bbac2..5a9cfd0d 100644 --- a/ccutil/strngs.cpp +++ b/ccutil/strngs.cpp @@ -1,8 +1,8 @@ /********************************************************************** * File: strngs.c (Formerly strings.c) * Description: STRING class functions. - * Author: Ray Smith - * Created: Fri Feb 15 09:13:30 GMT 1991 + * Author: Ray Smith + * Created: Fri Feb 15 09:13:30 GMT 1991 * * (C) Copyright 1991, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); @@ -181,6 +181,14 @@ bool STRING::DeSerialize(bool swap, TFile* fp) { return true; } +// As DeSerialize, but only seeks past the data - hence a static method. +bool STRING::SkipDeSerialize(bool swap, tesseract::TFile* fp) { + inT32 len; + if (fp->FRead(&len, sizeof(len), 1) != 1) return false; + if (swap) ReverseN(&len, sizeof(len)); + return fp->FRead(NULL, 1, len) == len; +} + BOOL8 STRING::contains(const char c) const { return (c != '\0') && (strchr (GetCStr(), c) != NULL); } diff --git a/ccutil/strngs.h b/ccutil/strngs.h index 9308cc67..2e65463e 100644 --- a/ccutil/strngs.h +++ b/ccutil/strngs.h @@ -60,6 +60,8 @@ class TESS_API STRING // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. bool DeSerialize(bool swap, tesseract::TFile* fp); + // As DeSerialize, but only seeks past the data - hence a static method. 
+ static bool SkipDeSerialize(bool swap, tesseract::TFile* fp); BOOL8 contains(const char c) const; inT32 length() const; @@ -145,13 +147,11 @@ class TESS_API STRING } // returns the string data part of storage - inline char* GetCStr() { - return ((char *)data_) + sizeof(STRING_HEADER); - }; + inline char* GetCStr() { return ((char*)data_) + sizeof(STRING_HEADER); } inline const char* GetCStr() const { return ((const char *)data_) + sizeof(STRING_HEADER); - }; + } inline bool InvariantOk() const { #if STRING_IS_PROTECTED return (GetHeader()->used_ == 0) ? diff --git a/ccutil/tesscallback.h b/ccutil/tesscallback.h index 1f20c6b4..5f9ea260 100644 --- a/ccutil/tesscallback.h +++ b/ccutil/tesscallback.h @@ -16,8 +16,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef _TESS_CALLBACK_SPECIALIZATIONS_H -#define _TESS_CALLBACK_SPECIALIZATIONS_H +#ifndef TESS_CALLBACK_SPECIALIZATIONS_H_ +#define TESS_CALLBACK_SPECIALIZATIONS_H_ #include "host.h" // For NULL. @@ -9718,4 +9718,4 @@ NewPermanentTessCallback(R (*function)(P1,P2,P3,P4,P5,P6,A1,A2,A3,A4,A5), typena return new _TessFunctionResultCallback_6_5(function, p1, p2, p3, p4, p5, p6); } -#endif /* _TESS_CALLBACK_SPECIALIZATIONS_H */ +#endif // TESS_CALLBACK_SPECIALIZATIONS_H_ diff --git a/ccutil/tessdatamanager.cpp b/ccutil/tessdatamanager.cpp index 23d029bb..9fea1ef9 100644 --- a/ccutil/tessdatamanager.cpp +++ b/ccutil/tessdatamanager.cpp @@ -33,212 +33,192 @@ namespace tesseract { -bool TessdataManager::Init(const char *data_file_name, int debug_level) { - int i; - debug_level_ = debug_level; +// Lazily loads from the the given filename. Won't actually read the file +// until it needs it. 
+void TessdataManager::LoadFileLater(const char *data_file_name) { + Clear(); data_file_name_ = data_file_name; - data_file_ = fopen(data_file_name, "rb"); - if (data_file_ == NULL) { - tprintf("Error opening data file %s\n", data_file_name); - tprintf("Please make sure the TESSDATA_PREFIX environment variable is set " - "to the parent directory of your \"tessdata\" directory.\n"); +} + +bool TessdataManager::Init(const char *data_file_name) { + GenericVector data; + bool result = true; + if (reader_ == nullptr) { + if (!LoadDataFromFile(data_file_name, &data)) return false; + } else { + if (!(*reader_)(data_file_name, &data)) return false; + } + return LoadMemBuffer(data_file_name, &data[0], data.size()); +} + +// Loads from the given memory buffer as if a file. +bool TessdataManager::LoadMemBuffer(const char *name, const char *data, + int size) { + data_file_name_ = name; + TFile fp; + fp.Open(data, size); + inT32 num_entries = TESSDATA_NUM_ENTRIES; + if (fp.FRead(&num_entries, sizeof(num_entries), 1) != 1) return false; + swap_ = num_entries > kMaxNumTessdataEntries || num_entries < 0; + if (swap_) ReverseN(&num_entries, sizeof(num_entries)); + GenericVector offset_table; + offset_table.init_to_size(num_entries, -1); + if (fp.FReadEndian(&offset_table[0], sizeof(offset_table[0]), num_entries, + swap_) != num_entries) return false; - } - fread(&actual_tessdata_num_entries_, sizeof(inT32), 1, data_file_); - swap_ = (actual_tessdata_num_entries_ > kMaxNumTessdataEntries); - if (swap_) { - ReverseN(&actual_tessdata_num_entries_, - sizeof(actual_tessdata_num_entries_)); - } - if (actual_tessdata_num_entries_ > TESSDATA_NUM_ENTRIES) { - // For forward compatibility, truncate to the number we can handle. 
- actual_tessdata_num_entries_ = TESSDATA_NUM_ENTRIES; - } - fread(offset_table_, sizeof(inT64), - actual_tessdata_num_entries_, data_file_); - if (swap_) { - for (i = 0 ; i < actual_tessdata_num_entries_; ++i) { - ReverseN(&offset_table_[i], sizeof(offset_table_[i])); - } - } - if (debug_level_) { - tprintf("TessdataManager loaded %d types of tesseract data files.\n", - actual_tessdata_num_entries_); - for (i = 0; i < actual_tessdata_num_entries_; ++i) { - tprintf("Offset for type %d is %lld\n", i, offset_table_[i]); + for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) { + if (offset_table[i] >= 0) { + inT64 entry_size = size - offset_table[i]; + int j = i + 1; + while (j < num_entries && offset_table[j] == -1) ++j; + if (j < num_entries) entry_size = offset_table[j] - offset_table[i]; + entries_[i].init_to_size(entry_size, 0); + if (fp.FRead(&entries_[i][0], 1, entry_size) != entry_size) return false; } } + is_loaded_ = true; return true; } -void TessdataManager::CopyFile(FILE *input_file, FILE *output_file, - bool newline_end, inT64 num_bytes_to_copy) { - if (num_bytes_to_copy == 0) return; - int buffer_size = 1024; - if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) { - buffer_size = num_bytes_to_copy; - } - inT64 num_bytes_copied = 0; - char *chunk = new char[buffer_size]; - int bytes_read; - char last_char = 0x0; - while ((bytes_read = fread(chunk, sizeof(char), - buffer_size, input_file))) { - fwrite(chunk, sizeof(char), bytes_read, output_file); - last_char = chunk[bytes_read-1]; - if (num_bytes_to_copy > 0) { - num_bytes_copied += bytes_read; - if (num_bytes_copied == num_bytes_to_copy) break; - if (num_bytes_copied + buffer_size > num_bytes_to_copy) { - buffer_size = num_bytes_to_copy - num_bytes_copied; - } - } - } - if (newline_end) ASSERT_HOST(last_char == '\n'); - delete[] chunk; +// Overwrites a single entry of the given type. 
+void TessdataManager::OverwriteEntry(TessdataType type, const char *data, + int size) { + is_loaded_ = true; + entries_[type].init_to_size(size, 0); + memcpy(&entries_[type][0], data, size); } -bool TessdataManager::WriteMetadata(inT64 *offset_table, - const char * language_data_path_prefix, - FILE *output_file) { - inT32 num_entries = TESSDATA_NUM_ENTRIES; - bool result = true; - if (fseek(output_file, 0, SEEK_SET) != 0 || - fwrite(&num_entries, sizeof(inT32), 1, output_file) != 1 || - fwrite(offset_table, sizeof(inT64), TESSDATA_NUM_ENTRIES, - output_file) != TESSDATA_NUM_ENTRIES) { - fclose(output_file); - result = false; - tprintf("WriteMetadata failed in TessdataManager!\n"); - } else if (fclose(output_file)) { - result = false; - tprintf("WriteMetadata failed to close file!\n"); - } else { - tprintf("TessdataManager combined tesseract data files.\n"); - for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { - tprintf("Offset for type %2d (%s%-22s) is %lld\n", i, - language_data_path_prefix, kTessdataFileSuffixes[i], - offset_table[i]); +// Saves to the given filename. +bool TessdataManager::SaveFile(const STRING &filename, + FileWriter writer) const { + ASSERT_HOST(is_loaded_); + GenericVector data; + Serialize(&data); + if (writer == nullptr) + return SaveDataToFile(data, filename); + else + return (*writer)(data, filename); +} + +// Serializes to the given vector. +void TessdataManager::Serialize(GenericVector *data) const { + ASSERT_HOST(is_loaded_); + // Compute the offset_table and total size. 
+ inT64 offset_table[TESSDATA_NUM_ENTRIES]; + inT64 offset = sizeof(inT32) + sizeof(offset_table); + for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + if (entries_[i].empty()) { + offset_table[i] = -1; + } else { + offset_table[i] = offset; + offset += entries_[i].size(); } } - return result; + data->init_to_size(offset, 0); + inT32 num_entries = TESSDATA_NUM_ENTRIES; + TFile fp; + fp.OpenWrite(data); + fp.FWrite(&num_entries, sizeof(num_entries), 1); + fp.FWrite(offset_table, sizeof(offset_table), 1); + for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + if (!entries_[i].empty()) { + fp.FWrite(&entries_[i][0], entries_[i].size(), 1); + } + } +} + +// Resets to the initial state, keeping the reader. +void TessdataManager::Clear() { + for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + entries_[i].clear(); + } + is_loaded_ = false; +} + +// Prints a directory of contents. +void TessdataManager::Directory() const { + int offset = TESSDATA_NUM_ENTRIES * sizeof(inT64); + for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + if (!entries_[i].empty()) { + tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i], + entries_[i].size(), offset); + offset += entries_[i].size(); + } + } +} + +// Opens the given TFile pointer to the given component type. +// Returns false in case of failure. 
+bool TessdataManager::GetComponent(TessdataType type, TFile *fp) { + if (!is_loaded_ && !Init(data_file_name_.string())) return false; + if (entries_[type].empty()) return false; + fp->Open(&entries_[type][0], entries_[type].size()); + return true; } bool TessdataManager::CombineDataFiles( const char *language_data_path_prefix, const char *output_filename) { - int i; - inT64 offset_table[TESSDATA_NUM_ENTRIES]; - for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1; - FILE *output_file = fopen(output_filename, "wb"); - if (output_file == NULL) { - tprintf("Error opening %s for writing\n", output_filename); - return false; - } - // Leave some space for recording the offset_table. - if (fseek(output_file, - sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) { - tprintf("Error seeking %s\n", output_filename); - return false; - } - - TessdataType type = TESSDATA_NUM_ENTRIES; - bool text_file = false; - FILE *file_ptr[TESSDATA_NUM_ENTRIES]; - // Load individual tessdata components from files. - for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { - ASSERT_HOST(TessdataTypeFromFileSuffix( - kTessdataFileSuffixes[i], &type, &text_file)); + for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { + TessdataType type; + ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type)); STRING filename = language_data_path_prefix; filename += kTessdataFileSuffixes[i]; - file_ptr[i] = fopen(filename.string(), "rb"); - if (file_ptr[i] != NULL) { - offset_table[type] = ftell(output_file); - CopyFile(file_ptr[i], output_file, text_file, -1); - fclose(file_ptr[i]); + FILE *fp = fopen(filename.string(), "rb"); + if (fp != nullptr) { + fclose(fp); + if (!LoadDataFromFile(filename, &entries_[type])) { + tprintf("Load of file %s failed!\n", filename.string()); + return false; + } } } + is_loaded_ = true; // Make sure that the required components are present. 
- if (file_ptr[TESSDATA_UNICHARSET] == NULL) { - tprintf("Error opening %sunicharset file\n", language_data_path_prefix); - fclose(output_file); + if (!IsBaseAvailable() && !IsLSTMAvailable()) { + tprintf( + "Error: traineddata file must contain at least (a unicharset file" + "and inttemp) OR an lstm file.\n"); return false; } - if (file_ptr[TESSDATA_INTTEMP] != NULL && - (file_ptr[TESSDATA_PFFMTABLE] == NULL || - file_ptr[TESSDATA_NORMPROTO] == NULL)) { - tprintf("Error opening %spffmtable and/or %snormproto files" - " while %sinttemp file was present\n", language_data_path_prefix, - language_data_path_prefix, language_data_path_prefix); - fclose(output_file); - return false; - } - - return WriteMetadata(offset_table, language_data_path_prefix, output_file); + // Write updated data to the output traineddata file. + return SaveFile(output_filename, nullptr); } bool TessdataManager::OverwriteComponents( const char *new_traineddata_filename, char **component_filenames, int num_new_components) { - int i; - inT64 offset_table[TESSDATA_NUM_ENTRIES]; - TessdataType type = TESSDATA_NUM_ENTRIES; - bool text_file = false; - FILE *file_ptr[TESSDATA_NUM_ENTRIES]; - for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { - offset_table[i] = -1; - file_ptr[i] = NULL; - } - FILE *output_file = fopen(new_traineddata_filename, "wb"); - if (output_file == NULL) { - tprintf("Error opening %s for writing\n", new_traineddata_filename); - return false; - } - - // Leave some space for recording the offset_table. - if (fseek(output_file, - sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET)) { - fclose(output_file); - tprintf("Error seeking %s\n", new_traineddata_filename); - return false; - } - // Open the files with the new components. - for (i = 0; i < num_new_components; ++i) { - if (TessdataTypeFromFileName(component_filenames[i], &type, &text_file)) - file_ptr[type] = fopen(component_filenames[i], "rb"); - } - - // Write updated data to the output traineddata file. 
- for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { - if (file_ptr[i] != NULL) { - // Get the data from the opened component file. - offset_table[i] = ftell(output_file); - CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1); - fclose(file_ptr[i]); - } else { - // Get this data component from the loaded data file. - if (SeekToStart(static_cast(i))) { - offset_table[i] = ftell(output_file); - CopyFile(data_file_, output_file, kTessdataFileIsText[i], - GetEndOffset(static_cast(i)) - - ftell(data_file_) + 1); + for (int i = 0; i < num_new_components; ++i) { + TessdataType type; + if (TessdataTypeFromFileName(component_filenames[i], &type)) { + if (!LoadDataFromFile(component_filenames[i], &entries_[type])) { + tprintf("Failed to read component file:%s\n", component_filenames[i]); + return false; } } } - const char *language_data_path_prefix = strchr(new_traineddata_filename, '.'); - return WriteMetadata(offset_table, language_data_path_prefix, output_file); + + // Write updated data to the output traineddata file. 
+ return SaveFile(new_traineddata_filename, nullptr); } -bool TessdataManager::TessdataTypeFromFileSuffix( - const char *suffix, TessdataType *type, bool *text_file) { +bool TessdataManager::ExtractToFile(const char *filename) { + TessdataType type = TESSDATA_NUM_ENTRIES; + ASSERT_HOST( + tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type)); + if (entries_[type].empty()) return false; + return SaveDataToFile(entries_[type], filename); +} + +bool TessdataManager::TessdataTypeFromFileSuffix(const char *suffix, + TessdataType *type) { for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) { *type = static_cast(i); - *text_file = kTessdataFileIsText[i]; return true; } } @@ -247,33 +227,12 @@ bool TessdataManager::TessdataTypeFromFileSuffix( return false; } -bool TessdataManager::TessdataTypeFromFileName( - const char *filename, TessdataType *type, bool *text_file) { +bool TessdataManager::TessdataTypeFromFileName(const char *filename, + TessdataType *type) { // Get the file suffix (extension) const char *suffix = strrchr(filename, '.'); - if (suffix == NULL || *(++suffix) == '\0') return false; - return TessdataTypeFromFileSuffix(suffix, type, text_file); -} - -bool TessdataManager::ExtractToFile(const char *filename) { - TessdataType type = TESSDATA_NUM_ENTRIES; - bool text_file = false; - ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName( - filename, &type, &text_file)); - if (!SeekToStart(type)) return false; - - FILE *output_file = fopen(filename, "wb"); - if (output_file == NULL) { - tprintf("Error opening %s\n", filename); - exit(1); - } - inT64 begin_offset = ftell(GetDataFilePtr()); - inT64 end_offset = GetEndOffset(type); - tesseract::TessdataManager::CopyFile( - GetDataFilePtr(), output_file, text_file, - end_offset - begin_offset + 1); - fclose(output_file); - return true; + if (suffix == nullptr || *(++suffix) == '\0') return false; + return TessdataTypeFromFileSuffix(suffix, 
type); } } // namespace tesseract diff --git a/ccutil/tessdatamanager.h b/ccutil/tessdatamanager.h index fd2685a1..db9c5583 100644 --- a/ccutil/tessdatamanager.h +++ b/ccutil/tessdatamanager.h @@ -47,6 +47,10 @@ static const char kShapeTableFileSuffix[] = "shapetable"; static const char kBigramDawgFileSuffix[] = "bigram-dawg"; static const char kUnambigDawgFileSuffix[] = "unambig-dawg"; static const char kParamsModelFileSuffix[] = "params-model"; +static const char kLSTMModelFileSuffix[] = "lstm"; +static const char kLSTMPuncDawgFileSuffix[] = "lstm-punc-dawg"; +static const char kLSTMSystemDawgFileSuffix[] = "lstm-word-dawg"; +static const char kLSTMNumberDawgFileSuffix[] = "lstm-number-dawg"; namespace tesseract { @@ -62,12 +66,16 @@ enum TessdataType { TESSDATA_NUMBER_DAWG, // 8 TESSDATA_FREQ_DAWG, // 9 TESSDATA_FIXED_LENGTH_DAWGS, // 10 // deprecated - TESSDATA_CUBE_UNICHARSET, // 11 - TESSDATA_CUBE_SYSTEM_DAWG, // 12 + TESSDATA_CUBE_UNICHARSET, // 11 // deprecated + TESSDATA_CUBE_SYSTEM_DAWG, // 12 // deprecated TESSDATA_SHAPE_TABLE, // 13 TESSDATA_BIGRAM_DAWG, // 14 TESSDATA_UNAMBIG_DAWG, // 15 TESSDATA_PARAMS_MODEL, // 16 + TESSDATA_LSTM, // 17 + TESSDATA_LSTM_PUNC_DAWG, // 18 + TESSDATA_LSTM_SYSTEM_DAWG, // 19 + TESSDATA_LSTM_NUMBER_DAWG, // 20 TESSDATA_NUM_ENTRIES }; @@ -76,48 +84,28 @@ enum TessdataType { * kTessdataFileSuffixes[i] indicates the file suffix for * tessdata of type i (from TessdataType enum). 
*/ -static const char * const kTessdataFileSuffixes[] = { - kLangConfigFileSuffix, // 0 - kUnicharsetFileSuffix, // 1 - kAmbigsFileSuffix, // 2 - kBuiltInTemplatesFileSuffix, // 3 - kBuiltInCutoffsFileSuffix, // 4 - kNormProtoFileSuffix, // 5 - kPuncDawgFileSuffix, // 6 - kSystemDawgFileSuffix, // 7 - kNumberDawgFileSuffix, // 8 - kFreqDawgFileSuffix, // 9 - kFixedLengthDawgsFileSuffix, // 10 // deprecated - kCubeUnicharsetFileSuffix, // 11 - kCubeSystemDawgFileSuffix, // 12 - kShapeTableFileSuffix, // 13 - kBigramDawgFileSuffix, // 14 - kUnambigDawgFileSuffix, // 15 - kParamsModelFileSuffix, // 16 -}; - -/** - * If kTessdataFileIsText[i] is true - the tessdata component - * of type i (from TessdataType enum) is text, and is binary otherwise. - */ -static const bool kTessdataFileIsText[] = { - true, // 0 - true, // 1 - true, // 2 - false, // 3 - true, // 4 - true, // 5 - false, // 6 - false, // 7 - false, // 8 - false, // 9 - false, // 10 // deprecated - true, // 11 - false, // 12 - false, // 13 - false, // 14 - false, // 15 - true, // 16 +static const char *const kTessdataFileSuffixes[] = { + kLangConfigFileSuffix, // 0 + kUnicharsetFileSuffix, // 1 + kAmbigsFileSuffix, // 2 + kBuiltInTemplatesFileSuffix, // 3 + kBuiltInCutoffsFileSuffix, // 4 + kNormProtoFileSuffix, // 5 + kPuncDawgFileSuffix, // 6 + kSystemDawgFileSuffix, // 7 + kNumberDawgFileSuffix, // 8 + kFreqDawgFileSuffix, // 9 + kFixedLengthDawgsFileSuffix, // 10 // deprecated + kCubeUnicharsetFileSuffix, // 11 // deprecated + kCubeSystemDawgFileSuffix, // 12 // deprecated + kShapeTableFileSuffix, // 13 + kBigramDawgFileSuffix, // 14 + kUnambigDawgFileSuffix, // 15 + kParamsModelFileSuffix, // 16 + kLSTMModelFileSuffix, // 17 + kLSTMPuncDawgFileSuffix, // 18 + kLSTMSystemDawgFileSuffix, // 19 + kLSTMNumberDawgFileSuffix, // 20 }; /** @@ -132,87 +120,61 @@ static const int kMaxNumTessdataEntries = 1000; class TessdataManager { public: - TessdataManager() { - data_file_ = NULL; - 
actual_tessdata_num_entries_ = 0; - for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) { - offset_table_[i] = -1; - } - } + TessdataManager() : reader_(nullptr), is_loaded_(false), swap_(false) {} + explicit TessdataManager(FileReader reader) + : reader_(reader), is_loaded_(false), swap_(false) {} ~TessdataManager() {} - int DebugLevel() { return debug_level_; } + bool swap() const { return swap_; } + bool is_loaded() const { return is_loaded_; } + + // Lazily loads from the the given filename. Won't actually read the file + // until it needs it. + void LoadFileLater(const char *data_file_name); /** - * Opens the given data file and reads the offset table. + * Opens and reads the given data file right now. * @return true on success. */ - bool Init(const char *data_file_name, int debug_level); + bool Init(const char *data_file_name); + // Loads from the given memory buffer as if a file, remembering name as some + // arbitrary source id for caching. + bool LoadMemBuffer(const char *name, const char *data, int size); + // Overwrites a single entry of the given type. + void OverwriteEntry(TessdataType type, const char *data, int size); + + // Saves to the given filename. + bool SaveFile(const STRING &filename, FileWriter writer) const; + // Serializes to the given vector. + void Serialize(GenericVector *data) const; + // Resets to the initial state, keeping the reader. + void Clear(); + + // Prints a directory of contents. + void Directory() const; + + // Opens the given TFile pointer to the given component type. + // Returns false in case of failure. + bool GetComponent(TessdataType type, TFile *fp); + + // Returns true if the base Tesseract components are present. + bool IsBaseAvailable() const { + return !entries_[TESSDATA_UNICHARSET].empty() && + !entries_[TESSDATA_INTTEMP].empty(); + } + + // Returns true if the LSTM components are present. + bool IsLSTMAvailable() const { return !entries_[TESSDATA_LSTM].empty(); } // Return the name of the underlying data file. 
const STRING &GetDataFileName() const { return data_file_name_; } - /** Returns data file pointer. */ - inline FILE *GetDataFilePtr() const { return data_file_; } - - /** - * Returns false if there is no data of the given type. - * Otherwise does a seek on the data_file_ to position the pointer - * at the start of the data of the given type. - */ - inline bool SeekToStart(TessdataType tessdata_type) { - if (debug_level_) { - tprintf("TessdataManager: seek to offset %lld - start of tessdata" - "type %d (%s))\n", offset_table_[tessdata_type], - tessdata_type, kTessdataFileSuffixes[tessdata_type]); - } - if (offset_table_[tessdata_type] < 0) { - return false; - } else { - ASSERT_HOST(fseek(data_file_, - static_cast(offset_table_[tessdata_type]), - SEEK_SET) == 0); - return true; - } - } - /** Returns the end offset for the given tesseract data file type. */ - inline inT64 GetEndOffset(TessdataType tessdata_type) const { - int index = tessdata_type + 1; - while (index < actual_tessdata_num_entries_ && offset_table_[index] == -1) { - ++index; // skip tessdata types not present in the combined file - } - if (debug_level_) { - tprintf("TessdataManager: end offset for type %d is %lld\n", - tessdata_type, - (index == actual_tessdata_num_entries_) ? -1 - : offset_table_[index]); - } - return (index == actual_tessdata_num_entries_) ? -1 : offset_table_[index] - 1; - } - /** Closes data_file_ (if it was opened by Init()). */ - inline void End() { - if (data_file_ != NULL) { - fclose(data_file_); - data_file_ = NULL; - } - } - bool swap() const { - return swap_; - } - - /** Writes the number of entries and the given offset table to output_file. - * Returns false on error. - */ - static bool WriteMetadata(inT64 *offset_table, - const char *language_data_path_prefix, - FILE *output_file); - /** * Reads all the standard tesseract config and data files for a language * at the given path and bundles them up into one binary data file. 
* Returns true if the combined traineddata file was successfully written. */ - static bool CombineDataFiles(const char *language_data_path_prefix, - const char *output_filename); + bool CombineDataFiles(const char *language_data_path_prefix, + const char *output_filename); /** * Gets the individual components from the data_file_ with which the class was @@ -235,65 +197,35 @@ class TessdataManager { */ bool ExtractToFile(const char *filename); - /** - * Copies data from the given input file to the output_file provided. - * If num_bytes_to_copy is >= 0, only num_bytes_to_copy is copied from - * the input file, otherwise all the data in the input file is copied. - */ - static void CopyFile(FILE *input_file, FILE *output_file, - bool newline_end, inT64 num_bytes_to_copy); - /** * Fills type with TessdataType of the tessdata component represented by the * given file name. E.g. tessdata/eng.unicharset -> TESSDATA_UNICHARSET. - * Sets *text_file to true if the component is in text format (e.g. - * unicharset, unichar ambigs, config, etc). * @return true if the tessdata component type could be determined * from the given file name. */ static bool TessdataTypeFromFileSuffix(const char *suffix, - TessdataType *type, - bool *text_file); + TessdataType *type); /** * Tries to determine tessdata component file suffix from filename, * returns true on success. */ static bool TessdataTypeFromFileName(const char *filename, - TessdataType *type, - bool *text_file); + TessdataType *type); private: - - /** - * Opens the file whose name is a concatenation of language_data_path_prefix - * and file_suffix. Returns a file pointer to the opened file. - */ - static FILE *GetFilePtr(const char *language_data_path_prefix, - const char *file_suffix, bool text_file); - - /** - * Each offset_table_[i] contains a file offset in the combined data file - * where the data of TessdataFileType i is stored. 
- */ - inT64 offset_table_[TESSDATA_NUM_ENTRIES]; - /** - * Actual number of entries in the tessdata table. This value can only be - * same or smaller than TESSDATA_NUM_ENTRIES, but can never be larger, - * since then it would be impossible to interpret the type of tessdata at - * indices same and higher than TESSDATA_NUM_ENTRIES. - * This parameter is used to allow for backward compatibility - * when new tessdata types are introduced. - */ - inT32 actual_tessdata_num_entries_; - STRING data_file_name_; // name of the data file. - FILE *data_file_; ///< pointer to the data file. - int debug_level_; + // Name of file it came from. + STRING data_file_name_; + // Function to load the file when we need it. + FileReader reader_; + // True if the file has been loaded. + bool is_loaded_; // True if the bytes need swapping. bool swap_; + // Contents of each element of the traineddata file. + GenericVector entries_[TESSDATA_NUM_ENTRIES]; }; - } // namespace tesseract #endif // TESSERACT_CCUTIL_TESSDATAMANAGER_H_ diff --git a/ccutil/unichar.h b/ccutil/unichar.h index b2a1e013..85dde6f2 100644 --- a/ccutil/unichar.h +++ b/ccutil/unichar.h @@ -17,8 +17,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CCUTIL_UNICHAR_H__ -#define TESSERACT_CCUTIL_UNICHAR_H__ +#ifndef TESSERACT_CCUTIL_UNICHAR_H_ +#define TESSERACT_CCUTIL_UNICHAR_H_ #include #include @@ -47,7 +47,7 @@ enum StrongScriptDirection { // The UNICHAR class holds a single classification result. This may be // a single Unicode character (stored as between 1 and 4 utf8 bytes) or -// multple Unicode characters representing the NFKC expansion of a ligature +// multiple Unicode characters representing the NFKC expansion of a ligature // such as fi, ffl etc. These are also stored as utf8. 
class UNICHAR { public: @@ -162,4 +162,4 @@ class UNICHAR { char chars[UNICHAR_LEN]; }; -#endif // TESSERACT_CCUTIL_UNICHAR_H__ +#endif // TESSERACT_CCUTIL_UNICHAR_H_ diff --git a/ccutil/unicharcompress.cpp b/ccutil/unicharcompress.cpp new file mode 100644 index 00000000..084e6c43 --- /dev/null +++ b/ccutil/unicharcompress.cpp @@ -0,0 +1,439 @@ +/////////////////////////////////////////////////////////////////////// +// File: unicharcompress.cpp +// Description: Unicode re-encoding using a sequence of smaller numbers in +// place of a single large code for CJK, similarly for Indic, +// and dissection of ligatures for other scripts. +// Author: Ray Smith +// Created: Wed Mar 04 14:45:01 PST 2015 +// +// (C) Copyright 2015, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#include "unicharcompress.h" +#include "tprintf.h" + +namespace tesseract { + +// String used to represent the null_id in direct_set. +const char* kNullChar = ""; + +// Local struct used only for processing the radical-stroke table. 
+struct RadicalStroke { + RadicalStroke() : num_strokes(0) {} + RadicalStroke(const STRING& r, int s) : radical(r), num_strokes(s) {} + + bool operator==(const RadicalStroke& other) const { + return radical == other.radical && num_strokes == other.num_strokes; + } + + // The radical is encoded as a string because its format is of an int with + // an optional ' mark to indicate a simplified shape. To treat these as + // distinct, we use a string and a UNICHARSET to do the integer mapping. + STRING radical; + // The number of strokes we treat as dense and just take the face value from + // the table. + int num_strokes; +}; + +// Hash functor for RadicalStroke. +struct RadicalStrokedHash { + size_t operator()(const RadicalStroke& rs) const { + size_t result = rs.num_strokes; + for (int i = 0; i < rs.radical.length(); ++i) { + result ^= rs.radical[i] << (6 * i + 8); + } + return result; + } +}; + +// A hash map to convert unicodes to radical,stroke pair. +typedef std::unordered_map RSMap; +// A hash map to count occurrences of each radical,stroke pair. +typedef std::unordered_map RSCounts; + +// Helper function builds the RSMap from the radical-stroke file, which has +// already been read into a STRING. Returns false on error. +// The radical_stroke_table is non-const because it gets split and the caller +// is unlikely to want to use it again. 
+static bool DecodeRadicalStrokeTable(STRING* radical_stroke_table, + RSMap* radical_map) { + GenericVector lines; + radical_stroke_table->split('\n', &lines); + for (int i = 0; i < lines.size(); ++i) { + if (lines[i].length() == 0 || lines[i][0] == '#') continue; + int unicode, radical, strokes; + STRING str_radical; + if (sscanf(lines[i].string(), "%x\t%d.%d", &unicode, &radical, &strokes) == + 3) { + str_radical.add_str_int("", radical); + } else if (sscanf(lines[i].string(), "%x\t%d'.%d", &unicode, &radical, + &strokes) == 3) { + str_radical.add_str_int("'", radical); + } else { + tprintf("Invalid format in radical stroke table at line %d: %s\n", i, + lines[i].string()); + return false; + } + (*radical_map)[unicode] = RadicalStroke(str_radical, strokes); + } + return true; +} + +UnicharCompress::UnicharCompress() : code_range_(0) {} +UnicharCompress::UnicharCompress(const UnicharCompress& src) { *this = src; } +UnicharCompress::~UnicharCompress() { Cleanup(); } +UnicharCompress& UnicharCompress::operator=(const UnicharCompress& src) { + Cleanup(); + encoder_ = src.encoder_; + code_range_ = src.code_range_; + SetupDecoder(); + return *this; +} + +// Computes the encoding for the given unicharset. It is a requirement that +// the file training/langdata/radical-stroke.txt have been read into the +// input string radical_stroke_table. +// Returns false if the encoding cannot be constructed. +bool UnicharCompress::ComputeEncoding(const UNICHARSET& unicharset, int null_id, + STRING* radical_stroke_table) { + RSMap radical_map; + if (!DecodeRadicalStrokeTable(radical_stroke_table, &radical_map)) + return false; + encoder_.clear(); + UNICHARSET direct_set; + UNICHARSET radicals; + // To avoid unused codes, clear the special codes from the unicharsets. + direct_set.clear(); + radicals.clear(); + // Always keep space as 0; + direct_set.unichar_insert(" "); + // Null char is next if we have one. 
+ if (null_id >= 0) { + direct_set.unichar_insert(kNullChar); + } + RSCounts radical_counts; + // In the initial map, codes [0, unicharset.size()) are + // reserved for non-han/hangul sequences of 1 or more unicodes. + int hangul_offset = unicharset.size(); + // Hangul takes the next range [hangul_offset, hangul_offset + kTotalJamos). + const int kTotalJamos = kLCount + kVCount + kTCount; + // Han takes the codes beyond hangul_offset + kTotalJamos. Since it is hard + // to measure the number of radicals and strokes, initially we use the same + // code range for all 3 Han code positions, and fix them after. + int han_offset = hangul_offset + kTotalJamos; + int max_num_strokes = -1; + for (int u = 0; u <= unicharset.size(); ++u) { + bool self_normalized = false; + // We special-case allow null_id to be equal to unicharset.size() in case + // there is no space in unicharset for it. + if (u == unicharset.size()) { + if (u == null_id) { + self_normalized = true; + } else { + break; // Finished. + } + } else { + self_normalized = strcmp(unicharset.id_to_unichar(u), + unicharset.get_normed_unichar(u)) == 0; + } + RecodedCharID code; + // Convert to unicodes. + GenericVector unicodes; + if (u < unicharset.size() && + UNICHAR::UTF8ToUnicode(unicharset.get_normed_unichar(u), &unicodes) && + unicodes.size() == 1) { + // Check single unicodes for Hangul/Han and encode if so. + int unicode = unicodes[0]; + int leading, vowel, trailing; + auto it = radical_map.find(unicode); + if (it != radical_map.end()) { + // This is Han. Convert to radical, stroke, index. 
+ if (!radicals.contains_unichar(it->second.radical.string())) { + radicals.unichar_insert(it->second.radical.string()); + } + int radical = radicals.unichar_to_id(it->second.radical.string()); + int num_strokes = it->second.num_strokes; + int num_samples = radical_counts[it->second]++; + if (num_strokes > max_num_strokes) max_num_strokes = num_strokes; + code.Set3(radical + han_offset, num_strokes + han_offset, + num_samples + han_offset); + } else if (DecomposeHangul(unicode, &leading, &vowel, &trailing)) { + // This is Hangul. Since we know the exact size of each part at compile + // time, it gets the bottom set of codes. + code.Set3(leading + hangul_offset, vowel + kLCount + hangul_offset, + trailing + kLCount + kVCount + hangul_offset); + } + } + // If the code is still empty, it wasn't Han or Hangul. + if (code.length() == 0) { + // Special cases. + if (u == UNICHAR_SPACE) { + code.Set(0, 0); // Space. + } else if (u == null_id || (unicharset.has_special_codes() && + u < SPECIAL_UNICHAR_CODES_COUNT)) { + code.Set(0, direct_set.unichar_to_id(kNullChar)); + } else { + // Add the direct_set unichar-ids of the unicodes in sequence to the + // code. + for (int i = 0; i < unicodes.size(); ++i) { + int position = code.length(); + if (position >= RecodedCharID::kMaxCodeLen) { + tprintf("Unichar %d=%s->%s is too long to encode!!\n", u, + unicharset.id_to_unichar(u), + unicharset.get_normed_unichar(u)); + return false; + } + int uni = unicodes[i]; + UNICHAR unichar(uni); + char* utf8 = unichar.utf8_str(); + if (!direct_set.contains_unichar(utf8)) + direct_set.unichar_insert(utf8); + code.Set(position, direct_set.unichar_to_id(utf8)); + delete[] utf8; + if (direct_set.size() > unicharset.size()) { + // Code space got bigger! + tprintf("Code space expanded from original unicharset!!\n"); + return false; + } + } + } + } + code.set_self_normalized(self_normalized); + encoder_.push_back(code); + } + // Now renumber Han to make all codes unique. 
We already added han_offset to + // all Han. Now separate out the radical, stroke, and count codes for Han. + // In the uniqued Han encoding, the 1st code uses the next radical_map.size() + // values, the 2nd code uses the next max_num_strokes+1 values, and the 3rd + // code uses the rest for the max number of duplicated radical/stroke combos. + int num_radicals = radicals.size(); + for (int u = 0; u < unicharset.size(); ++u) { + RecodedCharID* code = &encoder_[u]; + if ((*code)(0) >= han_offset) { + code->Set(1, (*code)(1) + num_radicals); + code->Set(2, (*code)(2) + num_radicals + max_num_strokes + 1); + } + } + DefragmentCodeValues(null_id >= 0 ? 1 : -1); + SetupDecoder(); + return true; +} + +// Sets up an encoder that doesn't change the unichars at all, so it just +// passes them through unchanged. +void UnicharCompress::SetupPassThrough(const UNICHARSET& unicharset) { + GenericVector codes; + for (int u = 0; u < unicharset.size(); ++u) { + RecodedCharID code; + code.Set(0, u); + codes.push_back(code); + } + SetupDirect(codes); +} + +// Sets up an encoder directly using the given encoding vector, which maps +// unichar_ids to the given codes. +void UnicharCompress::SetupDirect(const GenericVector& codes) { + encoder_ = codes; + ComputeCodeRange(); + SetupDecoder(); +} + +// Renumbers codes to eliminate unused values. +void UnicharCompress::DefragmentCodeValues(int encoded_null) { + // There may not be any Hangul, but even if there is, it is possible that not + // all codes are used. Likewise with the Han encoding, it is possible that not + // all numbers of strokes are used. + ComputeCodeRange(); + GenericVector offsets; + offsets.init_to_size(code_range_, 0); + // Find which codes are used + for (int c = 0; c < encoder_.size(); ++c) { + const RecodedCharID& code = encoder_[c]; + for (int i = 0; i < code.length(); ++i) { + offsets[code(i)] = 1; + } + } + // Compute offsets based on code use. 
+ int offset = 0; + for (int i = 0; i < offsets.size(); ++i) { + // If not used, decrement everything above here. + // We are moving encoded_null to the end, so it is not "used". + if (offsets[i] == 0 || i == encoded_null) { + --offset; + } else { + offsets[i] = offset; + } + } + if (encoded_null >= 0) { + // The encoded_null is moving to the end, for the benefit of TensorFlow, + // which is offsets.size() + offsets.back(). + offsets[encoded_null] = offsets.size() + offsets.back() - encoded_null; + } + // Now apply the offsets. + for (int c = 0; c < encoder_.size(); ++c) { + RecodedCharID* code = &encoder_[c]; + for (int i = 0; i < code->length(); ++i) { + int value = (*code)(i); + code->Set(i, value + offsets[value]); + } + } + ComputeCodeRange(); +} + +// Encodes a single unichar_id. Returns the length of the code, or zero if +// invalid input, and the encoding itself +int UnicharCompress::EncodeUnichar(int unichar_id, RecodedCharID* code) const { + if (unichar_id < 0 || unichar_id >= encoder_.size()) return 0; + *code = encoder_[unichar_id]; + return code->length(); +} + +// Decodes code, returning the original unichar-id, or +// INVALID_UNICHAR_ID if the input is invalid. +int UnicharCompress::DecodeUnichar(const RecodedCharID& code) const { + int len = code.length(); + if (len <= 0 || len > RecodedCharID::kMaxCodeLen) return INVALID_UNICHAR_ID; + auto it = decoder_.find(code); + if (it == decoder_.end()) return INVALID_UNICHAR_ID; + return it->second; +} + +// Writes to the given file. Returns false in case of error. +bool UnicharCompress::Serialize(TFile* fp) const { + return encoder_.SerializeClasses(fp); +} + +// Reads from the given file. Returns false in case of error. +// If swap is true, assumes a big/little-endian swap is needed. 
+bool UnicharCompress::DeSerialize(bool swap, TFile* fp) { + if (!encoder_.DeSerializeClasses(swap, fp)) return false; + ComputeCodeRange(); + SetupDecoder(); + return true; +} + +// Returns a STRING containing a text file that describes the encoding thus: +// [,]* +// In words, a comma-separated list of one or more indices, followed by a tab +// and the UTF-8 string that the code represents per line. Most simple scripts +// will encode a single index to a UTF8-string, but Chinese, Japanese, Korean +// and the Indic scripts will contain a many-to-many mapping. +// See the class comment above for details. +STRING UnicharCompress::GetEncodingAsString( + const UNICHARSET& unicharset) const { + STRING encoding; + for (int c = 0; c < encoder_.size(); ++c) { + const RecodedCharID& code = encoder_[c]; + if (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && code == encoder_[c - 1]) { + // Don't show the duplicate entry. + continue; + } + encoding.add_str_int("", code(0)); + for (int i = 1; i < code.length(); ++i) { + encoding.add_str_int(",", code(i)); + } + encoding += "\t"; + if (c >= unicharset.size() || (0 < c && c < SPECIAL_UNICHAR_CODES_COUNT && + unicharset.has_special_codes())) { + encoding += kNullChar; + } else { + encoding += unicharset.id_to_unichar(c); + } + encoding += "\n"; + } + return encoding; +} + +// Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing. +// Note that the returned values are 0-based indices, NOT unicode Jamo. +// Returns false if the input is not in the Hangul unicode range. +/* static */ +bool UnicharCompress::DecomposeHangul(int unicode, int* leading, int* vowel, + int* trailing) { + if (unicode < kFirstHangul) return false; + int offset = unicode - kFirstHangul; + if (offset >= kNumHangul) return false; + const int kNCount = kVCount * kTCount; + *leading = offset / kNCount; + *vowel = (offset % kNCount) / kTCount; + *trailing = offset % kTCount; + return true; +} + +// Computes the value of code_range_ from the encoder_. 
+void UnicharCompress::ComputeCodeRange() { + code_range_ = -1; + for (int c = 0; c < encoder_.size(); ++c) { + const RecodedCharID& code = encoder_[c]; + for (int i = 0; i < code.length(); ++i) { + if (code(i) > code_range_) code_range_ = code(i); + } + } + ++code_range_; +} + +// Initializes the decoding hash_map from the encoding array. +void UnicharCompress::SetupDecoder() { + Cleanup(); + is_valid_start_.init_to_size(code_range_, false); + for (int c = 0; c < encoder_.size(); ++c) { + const RecodedCharID& code = encoder_[c]; + if (code.self_normalized() || decoder_.find(code) == decoder_.end()) + decoder_[code] = c; + is_valid_start_[code(0)] = true; + RecodedCharID prefix = code; + int len = code.length() - 1; + prefix.Truncate(len); + auto final_it = final_codes_.find(prefix); + if (final_it == final_codes_.end()) { + GenericVectorEqEq* code_list = new GenericVectorEqEq; + code_list->push_back(code(len)); + final_codes_[prefix] = code_list; + while (--len >= 0) { + prefix.Truncate(len); + auto next_it = next_codes_.find(prefix); + if (next_it == next_codes_.end()) { + GenericVectorEqEq* code_list = new GenericVectorEqEq; + code_list->push_back(code(len)); + next_codes_[prefix] = code_list; + } else { + // We still have to search the list as we may get here via multiple + // lengths of code. + if (!next_it->second->contains(code(len))) + next_it->second->push_back(code(len)); + break; // This prefix has been processed. + } + } + } else { + if (!final_it->second->contains(code(len))) + final_it->second->push_back(code(len)); + } + } +} + +// Frees allocated memory. +void UnicharCompress::Cleanup() { + decoder_.clear(); + is_valid_start_.clear(); + for (auto it = next_codes_.begin(); it != next_codes_.end(); ++it) { + delete it->second; + } + for (auto it = final_codes_.begin(); it != final_codes_.end(); ++it) { + delete it->second; + } + next_codes_.clear(); + final_codes_.clear(); +} + +} // namespace tesseract. 
diff --git a/ccutil/unicharcompress.h b/ccutil/unicharcompress.h new file mode 100644 index 00000000..12fcd867 --- /dev/null +++ b/ccutil/unicharcompress.h @@ -0,0 +1,260 @@ +/////////////////////////////////////////////////////////////////////// +// File: unicharcompress.h +// Description: Unicode re-encoding using a sequence of smaller numbers in +// place of a single large code for CJK, similarly for Indic, +// and dissection of ligatures for other scripts. +// Author: Ray Smith +// Created: Wed Mar 04 14:45:01 PST 2015 +// +// (C) Copyright 2015, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_CCUTIL_UNICHARCOMPRESS_H_ +#define TESSERACT_CCUTIL_UNICHARCOMPRESS_H_ + +#include + +#include "serialis.h" +#include "strngs.h" +#include "unicharset.h" + +namespace tesseract { + +// Trivial class to hold the code for a recoded unichar-id. +class RecodedCharID { + public: + // The maximum length of a code. + static const int kMaxCodeLen = 9; + + RecodedCharID() : self_normalized_(0), length_(0) { + memset(code_, 0, sizeof(code_)); + } + void Truncate(int length) { length_ = length; } + // Sets the code value at the given index in the code. + void Set(int index, int value) { + code_[index] = value; + if (length_ <= index) length_ = index + 1; + } + // Shorthand for setting codes of length 3, as all Hangul and Han codes are + // length 3. 
+ void Set3(int code0, int code1, int code2) { + length_ = 3; + code_[0] = code0; + code_[1] = code1; + code_[2] = code2; + } + // Accessors + bool self_normalized() const { return self_normalized_ != 0; } + void set_self_normalized(bool value) { self_normalized_ = value; } + int length() const { return length_; } + int operator()(int index) const { return code_[index]; } + + // Writes to the given file. Returns false in case of error. + bool Serialize(TFile* fp) const { + if (fp->FWrite(&self_normalized_, sizeof(self_normalized_), 1) != 1) + return false; + if (fp->FWrite(&length_, sizeof(length_), 1) != 1) return false; + if (fp->FWrite(code_, sizeof(code_[0]), length_) != length_) return false; + return true; + } + // Reads from the given file. Returns false in case of error. + // If swap is true, assumes a big/little-endian swap is needed. + bool DeSerialize(bool swap, TFile* fp) { + if (fp->FRead(&self_normalized_, sizeof(self_normalized_), 1) != 1) + return false; + if (fp->FRead(&length_, sizeof(length_), 1) != 1) return false; + if (swap) ReverseN(&length_, sizeof(length_)); + if (fp->FRead(code_, sizeof(code_[0]), length_) != length_) return false; + if (swap) { + for (int i = 0; i < length_; ++i) { + ReverseN(&code_[i], sizeof(code_[i])); + } + } + return true; + } + bool operator==(const RecodedCharID& other) const { + if (length_ != other.length_) return false; + for (int i = 0; i < length_; ++i) { + if (code_[i] != other.code_[i]) return false; + } + return true; + } + // Hash functor for RecodedCharID. + struct RecodedCharIDHash { + size_t operator()(const RecodedCharID& code) const { + size_t result = 0; + for (int i = 0; i < code.length_; ++i) { + result ^= code(i) << (7 * i); + } + return result; + } + }; + + private: + // True if this code is self-normalizing, ie is the master entry for indices + // that map to the same code. Has boolean value, but inT8 for serialization. 
+ inT8 self_normalized_; + // The number of elements in use in code_; + inT32 length_; + // The re-encoded form of the unichar-id to which this RecodedCharID relates. + inT32 code_[kMaxCodeLen]; +}; + +// Class holds a "compression" of a unicharset to simplify the learning problem +// for a neural-network-based classifier. +// Objectives: +// 1 (CJK): Ids of a unicharset with a large number of classes are expressed as +// a sequence of 3 codes with much fewer values. +// This is achieved using the Jamo coding for Hangul and the Unicode +// Radical-Stroke-index for Han. +// 2 (Indic): Instead of thousands of codes with one for each grapheme, re-code +// as the unicode sequence (but coded in a more compact space). +// 3 (the rest): Eliminate multi-path problems with ligatures and fold confusing +// and not significantly distinct shapes (quotes) togther, ie +// represent the fi ligature as the f-i pair, and fold u+2019 and +// friends all onto ascii single ' +// 4 The null character and mapping to target activations: +// To save horizontal coding space, the compressed codes are generally mapped +// to target network activations without intervening null characters, BUT +// in the case of ligatures, such as ff, null characters have to be included +// so existence of repeated codes is detected at codebook-building time, and +// null characters are embedded directly into the codes, so the rest of the +// system doesn't need to worry about the problem (much). There is still an +// effect on the range of ways in which the target activations can be +// generated. +// +// The computed code values are compact (no unused values), and, for CJK, +// unique (each code position uses a disjoint set of values from each other code +// position). For non-CJK, the same code value CAN be used in multiple +// positions, eg the ff ligature is converted to , where +// is the same code as is used for the single f. 
+// NOTE that an intended consequence of using the normalized text from the +// unicharset is that the fancy quotes all map to a single code, so round-trip +// conversion doesn't work for all unichar-ids. +class UnicharCompress { + public: + UnicharCompress(); + UnicharCompress(const UnicharCompress& src); + ~UnicharCompress(); + UnicharCompress& operator=(const UnicharCompress& src); + + // The 1st Hangul unicode. + static const int kFirstHangul = 0xac00; + // The number of Hangul unicodes. + static const int kNumHangul = 11172; + // The number of Jamos for each of the 3 parts of a Hangul character, being + // the Leading consonant, Vowel and Trailing consonant. + static const int kLCount = 19; + static const int kVCount = 21; + static const int kTCount = 28; + + // Computes the encoding for the given unicharset. It is a requirement that + // the file training/langdata/radical-stroke.txt have been read into the + // input string radical_stroke_table. + // Returns false if the encoding cannot be constructed. + bool ComputeEncoding(const UNICHARSET& unicharset, int null_id, + STRING* radical_stroke_table); + // Sets up an encoder that doesn't change the unichars at all, so it just + // passes them through unchanged. + void SetupPassThrough(const UNICHARSET& unicharset); + // Sets up an encoder directly using the given encoding vector, which maps + // unichar_ids to the given codes. + void SetupDirect(const GenericVector& codes); + + // Returns the number of different values that can be used in a code, ie + // 1 + the maximum value that will ever be used by an RecodedCharID code in + // any position in its array. + int code_range() const { return code_range_; } + + // Encodes a single unichar_id. Returns the length of the code, (or zero if + // invalid input), and the encoding itself in code. + int EncodeUnichar(int unichar_id, RecodedCharID* code) const; + // Decodes code, returning the original unichar-id, or + // INVALID_UNICHAR_ID if the input is invalid. 
Note that this is not a perfect + // inverse of EncodeUnichar, since the unichar-id of U+2019 (curly single + // quote), for example, will have the same encoding as the unichar-id of + // U+0027 (ascii '). The foldings are obtained from the input unicharset, + // which in turn obtains them from NormalizeUTF8String in normstrngs.cpp, + // and include NFKC normalization plus others like quote and dash folding. + int DecodeUnichar(const RecodedCharID& code) const; + // Returns true if the given code is a valid start or single code. + bool IsValidFirstCode(int code) const { return is_valid_start_[code]; } + // Returns a list of valid non-final next codes for a given prefix code, + // which may be empty. + const GenericVector* GetNextCodes(const RecodedCharID& code) const { + auto it = next_codes_.find(code); + return it == next_codes_.end() ? NULL : it->second; + } + // Returns a list of valid final codes for a given prefix code, which may + // be empty. + const GenericVector* GetFinalCodes(const RecodedCharID& code) const { + auto it = final_codes_.find(code); + return it == final_codes_.end() ? NULL : it->second; + } + + // Writes to the given file. Returns false in case of error. + bool Serialize(TFile* fp) const; + // Reads from the given file. Returns false in case of error. + // If swap is true, assumes a big/little-endian swap is needed. + bool DeSerialize(bool swap, TFile* fp); + + // Returns a STRING containing a text file that describes the encoding thus: + // [,]* + // In words, a comma-separated list of one or more indices, followed by a tab + // and the UTF-8 string that the code represents per line. Most simple scripts + // will encode a single index to a UTF8-string, but Chinese, Japanese, Korean + // and the Indic scripts will contain a many-to-many mapping. + // See the class comment above for details. + STRING GetEncodingAsString(const UNICHARSET& unicharset) const; + + // Helper decomposes a Hangul unicode to 3 parts, leading, vowel, trailing. 
+ // Note that the returned values are 0-based indices, NOT unicode Jamo. + // Returns false if the input is not in the Hangul unicode range. + static bool DecomposeHangul(int unicode, int* leading, int* vowel, + int* trailing); + + private: + // Renumbers codes to eliminate unused values. + void DefragmentCodeValues(int encoded_null); + // Computes the value of code_range_ from the encoder_. + void ComputeCodeRange(); + // Initializes the decoding hash_map from the encoder_ array. + void SetupDecoder(); + // Frees allocated memory. + void Cleanup(); + + // The encoder that maps a unichar-id to a sequence of small codes. + // encoder_ is the only part that is serialized. The rest is computed on load. + GenericVector encoder_; + // Decoder converts the output of encoder back to a unichar-id. + std::unordered_map + decoder_; + // True if the index is a valid single or start code. + GenericVector is_valid_start_; + // Maps a prefix code to a list of valid next codes. + // The map owns the vectors. + std::unordered_map*, + RecodedCharID::RecodedCharIDHash> + next_codes_; + // Maps a prefix code to a list of valid final codes. + // The map owns the vectors. + std::unordered_map*, + RecodedCharID::RecodedCharIDHash> + final_codes_; + // Max of any value in encoder_ + 1. + int code_range_; +}; + +} // namespace tesseract. 
+ +#endif // TESSERACT_CCUTIL_UNICHARCOMPRESS_H_ diff --git a/ccutil/unicharmap.h b/ccutil/unicharmap.h index ad901585..ecc4065e 100644 --- a/ccutil/unicharmap.h +++ b/ccutil/unicharmap.h @@ -17,8 +17,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CCUTIL_UNICHARMAP_H__ -#define TESSERACT_CCUTIL_UNICHARMAP_H__ +#ifndef TESSERACT_CCUTIL_UNICHARMAP_H_ +#define TESSERACT_CCUTIL_UNICHARMAP_H_ #include "unichar.h" @@ -79,4 +79,4 @@ class UNICHARMAP { UNICHARMAP_NODE* nodes; }; -#endif // TESSERACT_CCUTIL_UNICHARMAP_H__ +#endif // TESSERACT_CCUTIL_UNICHARMAP_H_ diff --git a/ccutil/unicharset.cpp b/ccutil/unicharset.cpp index f7e48421..380c7410 100644 --- a/ccutil/unicharset.cpp +++ b/ccutil/unicharset.cpp @@ -906,6 +906,8 @@ void UNICHARSET::post_load_setup() { han_sid_ = get_script_id_from_name("Han"); hiragana_sid_ = get_script_id_from_name("Hiragana"); katakana_sid_ = get_script_id_from_name("Katakana"); + thai_sid_ = get_script_id_from_name("Thai"); + hangul_sid_ = get_script_id_from_name("Hangul"); // Compute default script. Use the highest-counting alpha script, that is // not the common script, as that still contains some "alphas". diff --git a/ccutil/unicharset.h b/ccutil/unicharset.h index 684655af..716147ee 100644 --- a/ccutil/unicharset.h +++ b/ccutil/unicharset.h @@ -17,8 +17,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CCUTIL_UNICHARSET_H__ -#define TESSERACT_CCUTIL_UNICHARSET_H__ +#ifndef TESSERACT_CCUTIL_UNICHARSET_H_ +#define TESSERACT_CCUTIL_UNICHARSET_H_ #include "errcode.h" #include "genericvector.h" @@ -141,7 +141,7 @@ class UNICHARSET { // Custom list of characters and their ligature forms (UTF8) // These map to unicode values in the private use area (PUC) and are supported // by only few font families (eg. Wyld, Adobe Caslon Pro). 
- static const char* kCustomLigatures[][2]; + static TESS_API const char* kCustomLigatures[][2]; // List of strings for the SpecialUnicharCodes. Keep in sync with the enum. static const char* kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]; @@ -181,8 +181,7 @@ class UNICHARSET { // Return the UNICHAR_ID of a given unichar representation within the // UNICHARSET. Only the first length characters from unichar_repr are used. - UNICHAR_ID unichar_to_id(const char* const unichar_repr, - int length) const; + UNICHAR_ID unichar_to_id(const char* const unichar_repr, int length) const; // Return the minimum number of bytes that matches a legal UNICHAR_ID, // while leaving the rest of the string encodable. Returns 0 if the @@ -291,6 +290,8 @@ class UNICHARSET { han_sid_ = 0; hiragana_sid_ = 0; katakana_sid_ = 0; + thai_sid_ = 0; + hangul_sid_ = 0; } // Return the size of the set (the number of different UNICHAR it holds). @@ -605,6 +606,16 @@ class UNICHARSET { return unichars[unichar_id].properties.AnyRangeEmpty(); } + // Returns true if the script of the given id is space delimited. + // Returns false for Han and Thai scripts. + bool IsSpaceDelimited(UNICHAR_ID unichar_id) const { + if (INVALID_UNICHAR_ID == unichar_id) return true; + int script_id = get_script(unichar_id); + return script_id != han_sid_ && script_id != thai_sid_ && + script_id != hangul_sid_ && script_id != hiragana_sid_ && + script_id != katakana_sid_; + } + // Return the script name of the given unichar. // The returned pointer will always be the same for the same script, it's // managed by unicharset and thus MUST NOT be deleted @@ -774,7 +785,7 @@ class UNICHARSET { // Returns normalized version of unichar with the given unichar_id. 
const char *get_normed_unichar(UNICHAR_ID unichar_id) const { - if (unichar_id == UNICHAR_SPACE && has_special_codes()) return " "; + if (unichar_id == UNICHAR_SPACE) return " "; return unichars[unichar_id].properties.normed.string(); } // Returns a vector of UNICHAR_IDs that represent the ids of the normalized @@ -836,6 +847,8 @@ class UNICHARSET { int han_sid() const { return han_sid_; } int hiragana_sid() const { return hiragana_sid_; } int katakana_sid() const { return katakana_sid_; } + int thai_sid() const { return thai_sid_; } + int hangul_sid() const { return hangul_sid_; } int default_sid() const { return default_sid_; } // Returns true if the unicharset has the concept of upper/lower case. @@ -978,8 +991,10 @@ class UNICHARSET { int han_sid_; int hiragana_sid_; int katakana_sid_; + int thai_sid_; + int hangul_sid_; // The most frequently occurring script in the charset. int default_sid_; }; -#endif // TESSERACT_CCUTIL_UNICHARSET_H__ +#endif // TESSERACT_CCUTIL_UNICHARSET_H_ diff --git a/ccutil/unicity_table.h b/ccutil/unicity_table.h index d664d461..f89e1ab3 100644 --- a/ccutil/unicity_table.h +++ b/ccutil/unicity_table.h @@ -87,7 +87,9 @@ class UnicityTable { /// Returns false on read/write error. bool write(FILE* f, TessResultCallback2* cb) const; /// swap is used to switch the endianness. 
- bool read(FILE* f, TessResultCallback3* cb, bool swap); + bool read(tesseract::TFile* f, + TessResultCallback3* cb, + bool swap); private: GenericVector table_; @@ -194,7 +196,8 @@ bool UnicityTable::write( template bool UnicityTable::read( - FILE* f, TessResultCallback3* cb, bool swap) { + tesseract::TFile* f, + TessResultCallback3* cb, bool swap) { return table_.read(f, cb, swap); } diff --git a/ccutil/unicodes.h b/ccutil/unicodes.h index f6d2bd51..7bab9b00 100644 --- a/ccutil/unicodes.h +++ b/ccutil/unicodes.h @@ -17,8 +17,8 @@ * **********************************************************************/ -#ifndef TESSERACT_CCUTIL_UNICODES_H__ -#define TESSERACT_CCUTIL_UNICODES_H__ +#ifndef TESSERACT_CCUTIL_UNICODES_H_ +#define TESSERACT_CCUTIL_UNICODES_H_ namespace tesseract { @@ -36,4 +36,4 @@ extern const char *kApostropheLikeUTF8[]; } // namespace -#endif // TESSERACT_CCUTIL_UNICODES_H__ +#endif // TESSERACT_CCUTIL_UNICODES_H_ diff --git a/ccutil/universalambigs.h b/ccutil/universalambigs.h index bcc633e8..f3f2fa1a 100644 --- a/ccutil/universalambigs.h +++ b/ccutil/universalambigs.h @@ -18,9 +18,14 @@ // /////////////////////////////////////////////////////////////////////// +#ifndef TESSERACT_CCUTIL_UNIVERSALAMBIGS_H_ +#define TESSERACT_CCUTIL_UNIVERSALAMBIGS_H_ + namespace tesseract { extern const char kUniversalAmbigsFile[]; extern const int ksizeofUniversalAmbigsFile; } // namespace tesseract + +#endif // TESSERACT_CCUTIL_UNIVERSALAMBIGS_H_ diff --git a/classify/adaptive.cpp b/classify/adaptive.cpp index 019befb4..9d00a51c 100644 --- a/classify/adaptive.cpp +++ b/classify/adaptive.cpp @@ -30,6 +30,8 @@ #endif #include +using tesseract::TFile; + /*---------------------------------------------------------------------------- Public Code ----------------------------------------------------------------------------*/ @@ -310,7 +312,7 @@ void Classify::PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates) { * @note Exceptions: none * @note History: Tue Mar 
19 14:11:01 1991, DSJ, Created. */ -ADAPT_CLASS ReadAdaptedClass(FILE *File) { +ADAPT_CLASS ReadAdaptedClass(TFile *fp) { int NumTempProtos; int NumConfigs; int i; @@ -319,34 +321,34 @@ ADAPT_CLASS ReadAdaptedClass(FILE *File) { /* first read high level adapted class structure */ Class = (ADAPT_CLASS) Emalloc (sizeof (ADAPT_CLASS_STRUCT)); - fread ((char *) Class, sizeof (ADAPT_CLASS_STRUCT), 1, File); + fp->FRead(Class, sizeof(ADAPT_CLASS_STRUCT), 1); /* then read in the definitions of the permanent protos and configs */ Class->PermProtos = NewBitVector (MAX_NUM_PROTOS); Class->PermConfigs = NewBitVector (MAX_NUM_CONFIGS); - fread ((char *) Class->PermProtos, sizeof (uinT32), - WordsInVectorOfSize (MAX_NUM_PROTOS), File); - fread ((char *) Class->PermConfigs, sizeof (uinT32), - WordsInVectorOfSize (MAX_NUM_CONFIGS), File); + fp->FRead(Class->PermProtos, sizeof(uinT32), + WordsInVectorOfSize(MAX_NUM_PROTOS)); + fp->FRead(Class->PermConfigs, sizeof(uinT32), + WordsInVectorOfSize(MAX_NUM_CONFIGS)); /* then read in the list of temporary protos */ - fread ((char *) &NumTempProtos, sizeof (int), 1, File); + fp->FRead(&NumTempProtos, sizeof(int), 1); Class->TempProtos = NIL_LIST; for (i = 0; i < NumTempProtos; i++) { TempProto = (TEMP_PROTO) alloc_struct (sizeof (TEMP_PROTO_STRUCT), "TEMP_PROTO_STRUCT"); - fread ((char *) TempProto, sizeof (TEMP_PROTO_STRUCT), 1, File); + fp->FRead(TempProto, sizeof(TEMP_PROTO_STRUCT), 1); Class->TempProtos = push_last (Class->TempProtos, TempProto); } /* then read in the adapted configs */ - fread ((char *) &NumConfigs, sizeof (int), 1, File); + fp->FRead(&NumConfigs, sizeof(int), 1); for (i = 0; i < NumConfigs; i++) if (test_bit (Class->PermConfigs, i)) - Class->Config[i].Perm = ReadPermConfig (File); + Class->Config[i].Perm = ReadPermConfig(fp); else - Class->Config[i].Temp = ReadTempConfig (File); + Class->Config[i].Temp = ReadTempConfig(fp); return (Class); @@ -366,20 +368,20 @@ namespace tesseract { * @note Exceptions: none * @note 
History: Mon Mar 18 15:18:10 1991, DSJ, Created. */ -ADAPT_TEMPLATES Classify::ReadAdaptedTemplates(FILE *File) { +ADAPT_TEMPLATES Classify::ReadAdaptedTemplates(TFile *fp) { int i; ADAPT_TEMPLATES Templates; /* first read the high level adaptive template struct */ Templates = (ADAPT_TEMPLATES) Emalloc (sizeof (ADAPT_TEMPLATES_STRUCT)); - fread ((char *) Templates, sizeof (ADAPT_TEMPLATES_STRUCT), 1, File); + fp->FRead(Templates, sizeof(ADAPT_TEMPLATES_STRUCT), 1); /* then read in the basic integer templates */ - Templates->Templates = ReadIntTemplates (File); + Templates->Templates = ReadIntTemplates(false, fp); /* then read in the adaptive info for each class */ for (i = 0; i < (Templates->Templates)->NumClasses; i++) { - Templates->Class[i] = ReadAdaptedClass (File); + Templates->Class[i] = ReadAdaptedClass(fp); } return (Templates); @@ -399,15 +401,15 @@ ADAPT_TEMPLATES Classify::ReadAdaptedTemplates(FILE *File) { * @note Exceptions: none * @note History: Tue Mar 19 14:25:26 1991, DSJ, Created. */ -PERM_CONFIG ReadPermConfig(FILE *File) { +PERM_CONFIG ReadPermConfig(TFile *fp) { PERM_CONFIG Config = (PERM_CONFIG) alloc_struct(sizeof(PERM_CONFIG_STRUCT), "PERM_CONFIG_STRUCT"); uinT8 NumAmbigs; - fread ((char *) &NumAmbigs, sizeof(uinT8), 1, File); + fp->FRead(&NumAmbigs, sizeof(uinT8), 1); Config->Ambigs = new UNICHAR_ID[NumAmbigs + 1]; - fread(Config->Ambigs, sizeof(UNICHAR_ID), NumAmbigs, File); + fp->FRead(Config->Ambigs, sizeof(UNICHAR_ID), NumAmbigs); Config->Ambigs[NumAmbigs] = -1; - fread(&(Config->FontinfoId), sizeof(int), 1, File); + fp->FRead(&(Config->FontinfoId), sizeof(int), 1); return (Config); @@ -426,17 +428,16 @@ PERM_CONFIG ReadPermConfig(FILE *File) { * @note Exceptions: none * @note History: Tue Mar 19 14:29:59 1991, DSJ, Created. 
*/ -TEMP_CONFIG ReadTempConfig(FILE *File) { +TEMP_CONFIG ReadTempConfig(TFile *fp) { TEMP_CONFIG Config; Config = (TEMP_CONFIG) alloc_struct (sizeof (TEMP_CONFIG_STRUCT), "TEMP_CONFIG_STRUCT"); - fread ((char *) Config, sizeof (TEMP_CONFIG_STRUCT), 1, File); + fp->FRead(Config, sizeof(TEMP_CONFIG_STRUCT), 1); Config->Protos = NewBitVector (Config->ProtoVectorSize * BITSINLONG); - fread ((char *) Config->Protos, sizeof (uinT32), - Config->ProtoVectorSize, File); + fp->FRead(Config->Protos, sizeof(uinT32), Config->ProtoVectorSize); return (Config); diff --git a/classify/adaptive.h b/classify/adaptive.h index c67670f6..0f5f3627 100644 --- a/classify/adaptive.h +++ b/classify/adaptive.h @@ -126,11 +126,11 @@ TEMP_CONFIG NewTempConfig(int MaxProtoId, int FontinfoId); TEMP_PROTO NewTempProto(); -ADAPT_CLASS ReadAdaptedClass(FILE *File); +ADAPT_CLASS ReadAdaptedClass(tesseract::TFile *File); -PERM_CONFIG ReadPermConfig(FILE *File); +PERM_CONFIG ReadPermConfig(tesseract::TFile *File); -TEMP_CONFIG ReadTempConfig(FILE *File); +TEMP_CONFIG ReadTempConfig(tesseract::TFile *File); void WriteAdaptedClass(FILE *File, ADAPT_CLASS Class, int NumConfigs); diff --git a/classify/adaptmatch.cpp b/classify/adaptmatch.cpp index b89f1cb7..36a8ac14 100644 --- a/classify/adaptmatch.cpp +++ b/classify/adaptmatch.cpp @@ -514,7 +514,7 @@ void Classify::EndAdaptiveClassifier() { * Parameters: * load_pre_trained_templates Indicates whether the pre-trained * templates (inttemp, normproto and pffmtable components) - * should be lodaded. Should only be set to true if the + * should be loaded. Should only be set to true if the * necessary classifier components are present in the * [lang].traineddata file. * Globals: @@ -524,7 +524,7 @@ void Classify::EndAdaptiveClassifier() { * enables use of pre-adapted templates * @note History: Mon Mar 11 12:49:34 1991, DSJ, Created. 
*/ -void Classify::InitAdaptiveClassifier(bool load_pre_trained_templates) { +void Classify::InitAdaptiveClassifier(TessdataManager* mgr) { if (!classify_enable_adaptive_matcher) return; if (AllProtosOn != NULL) @@ -532,37 +532,25 @@ void Classify::InitAdaptiveClassifier(bool load_pre_trained_templates) { // If there is no language_data_path_prefix, the classifier will be // adaptive only. - if (language_data_path_prefix.length() > 0 && - load_pre_trained_templates) { - ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_INTTEMP)); - PreTrainedTemplates = - ReadIntTemplates(tessdata_manager.GetDataFilePtr()); - if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded inttemp\n"); + if (language_data_path_prefix.length() > 0 && mgr != nullptr) { + TFile fp; + ASSERT_HOST(mgr->GetComponent(TESSDATA_INTTEMP, &fp)); + PreTrainedTemplates = ReadIntTemplates(mgr->swap(), &fp); - if (tessdata_manager.SeekToStart(TESSDATA_SHAPE_TABLE)) { + if (mgr->GetComponent(TESSDATA_SHAPE_TABLE, &fp)) { shape_table_ = new ShapeTable(unicharset); - if (!shape_table_->DeSerialize(tessdata_manager.swap(), - tessdata_manager.GetDataFilePtr())) { + if (!shape_table_->DeSerialize(mgr->swap(), &fp)) { tprintf("Error loading shape table!\n"); delete shape_table_; shape_table_ = NULL; - } else if (tessdata_manager.DebugLevel() > 0) { - tprintf("Successfully loaded shape table!\n"); } } - ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_PFFMTABLE)); - ReadNewCutoffs(tessdata_manager.GetDataFilePtr(), - tessdata_manager.swap(), - tessdata_manager.GetEndOffset(TESSDATA_PFFMTABLE), - CharNormCutoffs); - if (tessdata_manager.DebugLevel() > 0) tprintf("Loaded pffmtable\n"); + ASSERT_HOST(mgr->GetComponent(TESSDATA_PFFMTABLE, &fp)); + ReadNewCutoffs(&fp, mgr->swap(), CharNormCutoffs); - ASSERT_HOST(tessdata_manager.SeekToStart(TESSDATA_NORMPROTO)); - NormProtos = - ReadNormProtos(tessdata_manager.GetDataFilePtr(), - tessdata_manager.GetEndOffset(TESSDATA_NORMPROTO)); - if (tessdata_manager.DebugLevel() > 
0) tprintf("Loaded normproto\n"); + ASSERT_HOST(mgr->GetComponent(TESSDATA_NORMPROTO, &fp)); + NormProtos = ReadNormProtos(&fp); static_classifier_ = new TessClassifier(false, this); } @@ -582,21 +570,19 @@ void Classify::InitAdaptiveClassifier(bool load_pre_trained_templates) { } if (classify_use_pre_adapted_templates) { - FILE *File; + TFile fp; STRING Filename; Filename = imagefile; Filename += ADAPT_TEMPLATE_SUFFIX; - File = fopen(Filename.string(), "rb"); - if (File == NULL) { + if (!fp.Open(Filename.string(), nullptr)) { AdaptedTemplates = NewAdaptedTemplates(true); } else { cprintf("\nReading pre-adapted templates from %s ...\n", Filename.string()); fflush(stdout); - AdaptedTemplates = ReadAdaptedTemplates(File); + AdaptedTemplates = ReadAdaptedTemplates(&fp); cprintf("\n"); - fclose(File); PrintAdaptedTemplates(stdout, AdaptedTemplates); for (int i = 0; i < AdaptedTemplates->Templates->NumClasses; i++) { @@ -819,7 +805,7 @@ int Classify::GetAdaptiveFeatures(TBLOB *Blob, Features = ExtractPicoFeatures(Blob); NumFeatures = Features->NumFeatures; - if (NumFeatures > UNLIKELY_NUM_FEAT) { + if (NumFeatures == 0 || NumFeatures > UNLIKELY_NUM_FEAT) { FreeFeatureSet(Features); return 0; } @@ -907,8 +893,9 @@ void Classify::AdaptToChar(TBLOB* Blob, CLASS_ID ClassId, int FontinfoId, IClass = ClassForClassId(adaptive_templates->Templates, ClassId); NumFeatures = GetAdaptiveFeatures(Blob, IntFeatures, &FloatFeatures); - if (NumFeatures <= 0) - return; + if (NumFeatures <= 0) { + return; // Features already freed by GetAdaptiveFeatures. + } // Only match configs with the matching font. BIT_VECTOR MatchingFontConfigs = NewBitVector(MAX_NUM_PROTOS); @@ -1001,11 +988,11 @@ void Classify::DisplayAdaptedChar(TBLOB* blob, INT_CLASS_STRUCT* int_class) { 6 | 0x19, matcher_debug_separate_windows); UpdateMatchDisplay(); } + + delete sample; #endif } - - /** * This routine adds the result of a classification into * Results. 
If the new rating is much worse than the current diff --git a/classify/classify.cpp b/classify/classify.cpp index 436efd1f..7c11c51f 100644 --- a/classify/classify.cpp +++ b/classify/classify.cpp @@ -151,8 +151,8 @@ Classify::Classify() INT_MEMBER(classify_integer_matcher_multiplier, 10, "Integer Matcher Multiplier 0-255: ", this->params()), EnableLearning(true), - INT_MEMBER(il1_adaption_test, 0, "Don't adapt to i/I at beginning of word", - this->params()), + INT_MEMBER(il1_adaption_test, 0, + "Don't adapt to i/I at beginning of word", this->params()), BOOL_MEMBER(classify_bln_numeric_mode, 0, "Assume the input is numbers [0-9].", this->params()), double_MEMBER(speckle_large_max_size, 0.30, "Max large speckle size", diff --git a/classify/classify.h b/classify/classify.h index 0de84415..c04cb93c 100644 --- a/classify/classify.h +++ b/classify/classify.h @@ -16,8 +16,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CLASSIFY_CLASSIFY_H__ -#define TESSERACT_CLASSIFY_CLASSIFY_H__ +#ifndef TESSERACT_CLASSIFY_CLASSIFY_H_ +#define TESSERACT_CLASSIFY_CLASSIFY_H_ #include "adaptive.h" #include "ccstruct.h" @@ -103,16 +103,15 @@ class Classify : public CCStruct { const uinT8* normalization_factors, const uinT16* expected_num_features, GenericVector* results); - void ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset, - CLASS_CUTOFF_ARRAY Cutoffs); + void ReadNewCutoffs(TFile* fp, bool swap, CLASS_CUTOFF_ARRAY Cutoffs); void PrintAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates); void WriteAdaptedTemplates(FILE *File, ADAPT_TEMPLATES Templates); - ADAPT_TEMPLATES ReadAdaptedTemplates(FILE *File); + ADAPT_TEMPLATES ReadAdaptedTemplates(TFile* File); /* normmatch.cpp ************************************************************/ FLOAT32 ComputeNormMatch(CLASS_ID ClassId, const FEATURE_STRUCT& feature, BOOL8 DebugMatch); void FreeNormProtos(); - NORM_PROTOS *ReadNormProtos(FILE *File, inT64 end_offset); + 
NORM_PROTOS* ReadNormProtos(TFile* fp); /* protos.cpp ***************************************************************/ void ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class); INT_TEMPLATES CreateIntTemplates(CLASSES FloatProtos, @@ -138,7 +137,7 @@ class Classify : public CCStruct { void LearnPieces(const char* fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char* correct_text, WERD_RES* word); - void InitAdaptiveClassifier(bool load_pre_trained_templates); + void InitAdaptiveClassifier(TessdataManager* mgr); void InitAdaptedClass(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, @@ -335,7 +334,7 @@ class Classify : public CCStruct { uinT8* char_norm_array); void ComputeIntFeatures(FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures); /* intproto.cpp *************************************************************/ - INT_TEMPLATES ReadIntTemplates(FILE *File); + INT_TEMPLATES ReadIntTemplates(bool swap, TFile* fp); void WriteIntTemplates(FILE *File, INT_TEMPLATES Templates, const UNICHARSET& target_unicharset); CLASS_ID GetClassToDebug(const char *Prompt, bool* adaptive_on, @@ -539,4 +538,4 @@ class Classify : public CCStruct { }; } // namespace tesseract -#endif // TESSERACT_CLASSIFY_CLASSIFY_H__ +#endif // TESSERACT_CLASSIFY_CLASSIFY_H_ diff --git a/classify/cluster.cpp b/classify/cluster.cpp index b723bfa8..1f823495 100644 --- a/classify/cluster.cpp +++ b/classify/cluster.cpp @@ -1,10 +1,10 @@ /****************************************************************************** - ** Filename: cluster.c - ** Purpose: Routines for clustering points in N-D space - ** Author: Dan Johnson - ** History: 5/29/89, DSJ, Created. + ** Filename: cluster.c + ** Purpose: Routines for clustering points in N-D space + ** Author: Dan Johnson + ** History: 5/29/89, DSJ, Created. ** - ** (c) Copyright Hewlett-Packard Company, 1988. + ** (c) Copyright Hewlett-Packard Company, 1988. 
** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at @@ -390,11 +390,11 @@ double InvertMatrix(const float* input, int size, float* inv); * This routine creates a new clusterer data structure, * initializes it, and returns a pointer to it. * - * @param SampleSize number of dimensions in feature space - * @param ParamDesc description of each dimension - * @return pointer to the new clusterer data structure - * @note Exceptions: None - * @note History: 5/29/89, DSJ, Created. + * @param SampleSize number of dimensions in feature space + * @param ParamDesc description of each dimension + * @return pointer to the new clusterer data structure + * @note Exceptions: None + * @note History: 5/29/89, DSJ, Created. */ CLUSTERER * MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]) { @@ -437,7 +437,6 @@ MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]) { return Clusterer; } // MakeClusterer - /** * This routine creates a new sample data structure to hold * the specified feature. This sample is added to the clusterer @@ -445,14 +444,14 @@ MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]) { * clustered later), and a pointer to the sample is returned to * the caller. * - * @param Clusterer clusterer data structure to add sample to - * @param Feature feature to be added to clusterer - * @param CharID unique ident. of char that sample came from + * @param Clusterer clusterer data structure to add sample to + * @param Feature feature to be added to clusterer + * @param CharID unique ident. 
of char that sample came from * - * @return Pointer to the new sample data structure - * @note Exceptions: ALREADYCLUSTERED MakeSample can't be called after + * @return Pointer to the new sample data structure + * @note Exceptions: ALREADYCLUSTERED MakeSample can't be called after * ClusterSamples has been called - * @note History: 5/29/89, DSJ, Created. + * @note History: 5/29/89, DSJ, Created. */ SAMPLE* MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature, inT32 CharID) { @@ -490,7 +489,6 @@ SAMPLE* MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature, return (Sample); } // MakeSample - /** * This routine first checks to see if the samples in this * clusterer have already been clustered before; if so, it does @@ -505,12 +503,12 @@ SAMPLE* MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature, * list of prototypes that best represent the samples given * the constraints specified in Config. * - * @param Clusterer data struct containing samples to be clustered - * @param Config parameters which control clustering process + * @param Clusterer data struct containing samples to be clustered + * @param Config parameters which control clustering process * * @return Pointer to a list of prototypes - * @note Exceptions: None - * @note History: 5/29/89, DSJ, Created. + * @note Exceptions: None + * @note History: 5/29/89, DSJ, Created. */ LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) { //only create cluster tree if samples have never been clustered before @@ -523,10 +521,16 @@ LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) { //compute prototypes starting at the root node in the tree ComputePrototypes(Clusterer, Config); - return (Clusterer->ProtoList); + // We don't need the cluster pointers in the protos any more, so null them + // out, which makes it safe to delete the clusterer. 
+ LIST proto_list = Clusterer->ProtoList; + iterate(proto_list) { + PROTOTYPE *proto = reinterpret_cast(first_node(proto_list)); + proto->Cluster = NULL; + } + return Clusterer->ProtoList; } // ClusterSamples - /** * This routine frees all of the memory allocated to the * specified data structure. It will not, however, free @@ -535,10 +539,10 @@ LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) { * to NULL to indicate that the cluster data structures no * longer exist. Any sample lists that have been obtained * via calls to GetSamples are no longer valid. - * @param Clusterer pointer to data structure to be freed + * @param Clusterer pointer to data structure to be freed * @return None - * @note Exceptions: None - * @note History: 6/6/89, DSJ, Created. + * @note Exceptions: None + * @note History: 6/6/89, DSJ, Created. */ void FreeClusterer(CLUSTERER *Clusterer) { if (Clusterer != NULL) { @@ -558,21 +562,19 @@ void FreeClusterer(CLUSTERER *Clusterer) { } } // FreeClusterer - /** * This routine frees all of the memory allocated to the * specified list of prototypes. The clusters which are * pointed to by the prototypes are not freed. - * @param ProtoList pointer to list of prototypes to be freed + * @param ProtoList pointer to list of prototypes to be freed * @return None - * @note Exceptions: None - * @note History: 6/6/89, DSJ, Created. + * @note Exceptions: None + * @note History: 6/6/89, DSJ, Created. */ void FreeProtoList(LIST *ProtoList) { destroy_nodes(*ProtoList, FreePrototype); } // FreeProtoList - /** * This routine deallocates the memory consumed by the specified * prototype and modifies the corresponding cluster so that it @@ -606,7 +608,6 @@ void FreePrototype(void *arg) { //PROTOTYPE *Prototype) memfree(Prototype); } // FreePrototype - /** * This routine is used to find all of the samples which * belong to a cluster. 
It starts by removing the top @@ -617,10 +618,10 @@ void FreePrototype(void *arg) { //PROTOTYPE *Prototype) * If all samples have been found, NULL is returned. * InitSampleSearch() must be called * before NextSample() to initialize the search. - * @param SearchState ptr to list containing clusters to be searched - * @return Pointer to the next leaf cluster (sample) or NULL. - * @note Exceptions: None - * @note History: 6/16/89, DSJ, Created. + * @param SearchState ptr to list containing clusters to be searched + * @return Pointer to the next leaf cluster (sample) or NULL. + * @note Exceptions: None + * @note History: 6/16/89, DSJ, Created. */ CLUSTER *NextSample(LIST *SearchState) { CLUSTER *Cluster; @@ -637,29 +638,27 @@ CLUSTER *NextSample(LIST *SearchState) { } } // NextSample - /** * This routine returns the mean of the specified * prototype in the indicated dimension. - * @param Proto prototype to return mean of - * @param Dimension dimension whose mean is to be returned - * @return Mean of Prototype in Dimension + * @param Proto prototype to return mean of + * @param Dimension dimension whose mean is to be returned + * @return Mean of Prototype in Dimension * @note Exceptions: none - * @note History: 7/6/89, DSJ, Created. + * @note History: 7/6/89, DSJ, Created. */ FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension) { return (Proto->Mean[Dimension]); } // Mean - /** * This routine returns the standard deviation of the * prototype in the indicated dimension. - * @param Proto prototype to return standard deviation of - * @param Dimension dimension whose stddev is to be returned - * @return Standard deviation of Prototype in Dimension + * @param Proto prototype to return standard deviation of + * @param Dimension dimension whose stddev is to be returned + * @return Standard deviation of Prototype in Dimension * @note Exceptions: none - * @note History: 7/6/89, DSJ, Created. + * @note History: 7/6/89, DSJ, Created. 
*/ FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension) { switch (Proto->Style) { @@ -697,10 +696,10 @@ FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension) { * tree are the individual samples themselves; they have no * sub-clusters. The root node of the tree conceptually contains * all of the samples. - * @param Clusterer data structure holdings samples to be clustered - * @return None (the Clusterer data structure is changed) - * @note Exceptions: None - * @note History: 5/29/89, DSJ, Created. + * @param Clusterer data structure holdings samples to be clustered + * @return None (the Clusterer data structure is changed) + * @note Exceptions: None + * @note History: 5/29/89, DSJ, Created. */ void CreateClusterTree(CLUSTERER *Clusterer) { ClusteringContext context; @@ -760,7 +759,6 @@ void CreateClusterTree(CLUSTERER *Clusterer) { memfree(context.candidates); } // CreateClusterTree - /** * This routine is designed to be used in concert with the * KDWalk routine. It will create a potential cluster for @@ -786,7 +784,6 @@ void MakePotentialClusters(ClusteringContext *context, } } // MakePotentialClusters - /** * This routine searches the specified kd-tree for the nearest * neighbor of the specified cluster. It actually uses the @@ -795,12 +792,12 @@ void MakePotentialClusters(ClusteringContext *context, * neighbor is returned, if it can be found, otherwise NULL is * returned. The distance between the 2 nodes is placed * in the specified variable. 
- * @param Tree kd-tree to search in for nearest neighbor - * @param Cluster cluster whose nearest neighbor is to be found - * @param Distance ptr to variable to report distance found - * @return Pointer to the nearest neighbor of Cluster, or NULL + * @param Tree kd-tree to search in for nearest neighbor + * @param Cluster cluster whose nearest neighbor is to be found + * @param Distance ptr to variable to report distance found + * @return Pointer to the nearest neighbor of Cluster, or NULL * @note Exceptions: none - * @note History: 5/29/89, DSJ, Created. + * @note History: 5/29/89, DSJ, Created. * 7/13/89, DSJ, Removed visibility of kd-tree node data struct */ CLUSTER * @@ -830,17 +827,16 @@ FindNearestNeighbor(KDTREE * Tree, CLUSTER * Cluster, FLOAT32 * Distance) return BestNeighbor; } // FindNearestNeighbor - /** * This routine creates a new permanent cluster from the * clusters specified in TempCluster. The 2 clusters in * TempCluster are marked as "clustered" and deleted from * the kd-tree. The new cluster is then added to the kd-tree. - * @param Clusterer current clustering environment - * @param TempCluster potential cluster to make permanent + * @param Clusterer current clustering environment + * @param TempCluster potential cluster to make permanent * @return Pointer to the new permanent cluster - * @note Exceptions: none - * @note History: 5/29/89, DSJ, Created. + * @note Exceptions: none + * @note History: 5/29/89, DSJ, Created. * 7/13/89, DSJ, Removed visibility of kd-tree node data struct */ CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) { @@ -872,21 +868,20 @@ CLUSTER *MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster) { return Cluster; } // MakeNewCluster - /** * This routine merges two clusters into one larger cluster. * To do this it computes the number of samples in the new * cluster and the mean of the new cluster. The ParamDesc * information is used to ensure that circular dimensions * are handled correctly. 
- * @param N # of dimensions (size of arrays) - * @param ParamDesc array of dimension descriptions - * @param n1, n2 number of samples in each old cluster - * @param m array to hold mean of new cluster - * @param m1, m2 arrays containing means of old clusters - * @return The number of samples in the new cluster. - * @note Exceptions: None - * @note History: 5/31/89, DSJ, Created. + * @param N # of dimensions (size of arrays) + * @param ParamDesc array of dimension descriptions + * @param n1, n2 number of samples in each old cluster + * @param m array to hold mean of new cluster + * @param m1, m2 arrays containing means of old clusters + * @return The number of samples in the new cluster. + * @note Exceptions: None + * @note History: 5/31/89, DSJ, Created. */ inT32 MergeClusters(inT16 N, PARAM_DESC ParamDesc[], @@ -921,17 +916,16 @@ inT32 MergeClusters(inT16 N, return n; } // MergeClusters - /** * This routine decides which clusters in the cluster tree * should be represented by prototypes, forms a list of these * prototypes, and places the list in the Clusterer data * structure. - * @param Clusterer data structure holding cluster tree - * @param Config parameters used to control prototype generation - * @return None - * @note Exceptions: None - * @note History: 5/30/89, DSJ, Created. + * @param Clusterer data structure holding cluster tree + * @param Config parameters used to control prototype generation + * @return None + * @note Exceptions: None + * @note History: 5/30/89, DSJ, Created. */ void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) { LIST ClusterStack = NIL_LIST; @@ -961,8 +955,7 @@ void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) { } } // ComputePrototypes - -/** +/** * This routine attempts to create a prototype from the * specified cluster that conforms to the distribution * specified in Config. 
If there are too few samples in the @@ -972,12 +965,12 @@ void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config) { * is generated and NULL is returned. If a prototype can be * found that matches the desired distribution then a pointer * to it is returned, otherwise NULL is returned. - * @param Clusterer data structure holding cluster tree - * @param Config parameters used to control prototype generation - * @param Cluster cluster to be made into a prototype - * @return Pointer to new prototype or NULL - * @note Exceptions: None - * @note History: 6/19/89, DSJ, Created. + * @param Clusterer data structure holding cluster tree + * @param Config parameters used to control prototype generation + * @param Cluster cluster to be made into a prototype + * @return Pointer to new prototype or NULL + * @note Exceptions: None + * @note History: 6/19/89, DSJ, Created. */ PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, @@ -1050,7 +1043,6 @@ PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, return Proto; } // MakePrototype - /** * This routine checks for clusters which are degenerate and * therefore cannot be analyzed in a statistically valid way. @@ -1063,14 +1055,14 @@ PROTOTYPE *MakePrototype(CLUSTERER *Clusterer, * * If the cluster is not degenerate, NULL is returned. * - * @param N number of dimensions - * @param Cluster cluster being analyzed - * @param Statistics statistical info about cluster - * @param Style type of prototype to be generated - * @param MinSamples minimum number of samples in a cluster - * @return Pointer to degenerate prototype or NULL. - * @note Exceptions: None - * @note History: 6/20/89, DSJ, Created. + * @param N number of dimensions + * @param Cluster cluster being analyzed + * @param Statistics statistical info about cluster + * @param Style type of prototype to be generated + * @param MinSamples minimum number of samples in a cluster + * @return Pointer to degenerate prototype or NULL. 
+ * @note Exceptions: None + * @note History: 6/20/89, DSJ, Created. * 7/12/89, DSJ, Changed name and added check for 0 stddev. * 8/8/89, DSJ, Removed check for 0 stddev (handled elsewhere). */ @@ -1110,10 +1102,10 @@ PROTOTYPE *MakeDegenerateProto( //this was MinSample * be split. If not, then a new prototype is formed and * returned to the caller. If there is, then NULL is returned * to the caller. - * @param Clusterer data struct containing samples being clustered + * @param Clusterer data struct containing samples being clustered * @param Config provides the magic number of samples that make a good cluster - * @param Cluster cluster to be made into an elliptical prototype - * @param Statistics statistical info about cluster + * @param Cluster cluster to be made into an elliptical prototype + * @param Statistics statistical info about cluster * @return Pointer to new elliptical prototype or NULL. */ PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, @@ -1215,13 +1207,13 @@ PROTOTYPE *TestEllipticalProto(CLUSTERER *Clusterer, * be approximated by a spherical normal distribution. If it * can be, then a new prototype is formed and returned to the * caller. If it can't be, then NULL is returned to the caller. - * @param Clusterer data struct containing samples being clustered - * @param Cluster cluster to be made into a spherical prototype - * @param Statistics statistical info about cluster - * @param Buckets histogram struct used to analyze distribution - * @return Pointer to new spherical prototype or NULL. - * @note Exceptions: None - * @note History: 6/1/89, DSJ, Created. + * @param Clusterer data struct containing samples being clustered + * @param Cluster cluster to be made into a spherical prototype + * @param Statistics statistical info about cluster + * @param Buckets histogram struct used to analyze distribution + * @return Pointer to new spherical prototype or NULL. + * @note Exceptions: None + * @note History: 6/1/89, DSJ, Created. 
*/ PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, @@ -1247,19 +1239,18 @@ PROTOTYPE *MakeSphericalProto(CLUSTERER *Clusterer, return (Proto); } // MakeSphericalProto - /** * This routine tests the specified cluster to see if it can * be approximated by an elliptical normal distribution. If it * can be, then a new prototype is formed and returned to the * caller. If it can't be, then NULL is returned to the caller. - * @param Clusterer data struct containing samples being clustered - * @param Cluster cluster to be made into an elliptical prototype - * @param Statistics statistical info about cluster - * @param Buckets histogram struct used to analyze distribution - * @return Pointer to new elliptical prototype or NULL. - * @note Exceptions: None - * @note History: 6/12/89, DSJ, Created. + * @param Clusterer data struct containing samples being clustered + * @param Cluster cluster to be made into an elliptical prototype + * @param Statistics statistical info about cluster + * @param Buckets histogram struct used to analyze distribution + * @return Pointer to new elliptical prototype or NULL. + * @note Exceptions: None + * @note History: 6/12/89, DSJ, Created. */ PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, @@ -1286,7 +1277,6 @@ PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, return (Proto); } // MakeEllipticalProto - /** * This routine tests each dimension of the specified cluster to * see what distribution would best approximate that dimension. @@ -1295,14 +1285,14 @@ PROTOTYPE *MakeEllipticalProto(CLUSTERER *Clusterer, * be represented by one of these distributions, * then a new prototype is formed and returned to the * caller. If it can't be, then NULL is returned to the caller. 
- * @param Clusterer data struct containing samples being clustered - * @param Cluster cluster to be made into a prototype - * @param Statistics statistical info about cluster - * @param NormalBuckets histogram struct used to analyze distribution - * @param Confidence confidence level for alternate distributions - * @return Pointer to new mixed prototype or NULL. - * @note Exceptions: None - * @note History: 6/12/89, DSJ, Created. + * @param Clusterer data struct containing samples being clustered + * @param Cluster cluster to be made into a prototype + * @param Statistics statistical info about cluster + * @param NormalBuckets histogram struct used to analyze distribution + * @param Confidence confidence level for alternate distributions + * @return Pointer to new mixed prototype or NULL. + * @note Exceptions: None + * @note History: 6/12/89, DSJ, Created. */ PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, @@ -1355,16 +1345,15 @@ PROTOTYPE *MakeMixedProto(CLUSTERER *Clusterer, return (Proto); } // MakeMixedProto - /** * This routine alters the ith dimension of the specified * mixed prototype to be D_random. - * @param i index of dimension to be changed - * @param Proto prototype whose dimension is to be altered - * @param ParamDesc description of specified dimension - * @return None - * @note Exceptions: None - * @note History: 6/20/89, DSJ, Created. + * @param i index of dimension to be changed + * @param Proto prototype whose dimension is to be altered + * @param ParamDesc description of specified dimension + * @return None + * @note Exceptions: None + * @note History: 6/20/89, DSJ, Created. 
*/ void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) { Proto->Distrib[i] = D_random; @@ -1380,16 +1369,15 @@ void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc) { // note that the proto Weight is irrelevant for D_random protos } // MakeDimRandom - /** * This routine alters the ith dimension of the specified * mixed prototype to be uniform. - * @param i index of dimension to be changed - * @param Proto prototype whose dimension is to be altered - * @param Statistics statistical info about prototype - * @return None - * @note Exceptions: None - * @note History: 6/20/89, DSJ, Created. + * @param i index of dimension to be changed + * @param Proto prototype whose dimension is to be altered + * @param Statistics statistical info about prototype + * @return None + * @note Exceptions: None + * @note History: 6/20/89, DSJ, Created. */ void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics) { Proto->Distrib[i] = uniform; @@ -1410,7 +1398,6 @@ void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics) { // note that the proto Weight is irrelevant for uniform protos } // MakeDimUniform - /** * This routine searches the cluster tree for all leaf nodes * which are samples in the specified cluster. It computes @@ -1420,12 +1407,12 @@ void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics) { * return this information to the caller. An incremental * algorithm for computing statistics is not used because * it will not work with circular dimensions. - * @param N number of dimensions - * @param ParamDesc array of dimension descriptions - * @param Cluster cluster whose stats are to be computed - * @return Pointer to new data structure containing statistics - * @note Exceptions: None - * @note History: 6/2/89, DSJ, Created. 
+ * @param N number of dimensions + * @param ParamDesc array of dimension descriptions + * @param Cluster cluster whose stats are to be computed + * @return Pointer to new data structure containing statistics + * @note Exceptions: None + * @note History: 6/2/89, DSJ, Created. */ STATISTICS * ComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER * Cluster) { @@ -1502,19 +1489,18 @@ ComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER * Cluster) { return (Statistics); } // ComputeStatistics - /** * This routine creates a spherical prototype data structure to * approximate the samples in the specified cluster. * Spherical prototypes have a single variance which is * common across all dimensions. All dimensions are normally * distributed and independent. - * @param N number of dimensions - * @param Cluster cluster to be made into a spherical prototype - * @param Statistics statistical info about samples in cluster - * @return Pointer to a new spherical prototype data structure - * @note Exceptions: None - * @note History: 6/19/89, DSJ, Created. + * @param N number of dimensions + * @param Cluster cluster to be made into a spherical prototype + * @param Statistics statistical info about samples in cluster + * @return Pointer to a new spherical prototype data structure + * @note Exceptions: None + * @note History: 6/19/89, DSJ, Created. */ PROTOTYPE *NewSphericalProto(uinT16 N, CLUSTER *Cluster, @@ -1537,18 +1523,17 @@ PROTOTYPE *NewSphericalProto(uinT16 N, return (Proto); } // NewSphericalProto - /** * This routine creates an elliptical prototype data structure to * approximate the samples in the specified cluster. * Elliptical prototypes have a variance for each dimension. * All dimensions are normally distributed and independent. 
- * @param N number of dimensions - * @param Cluster cluster to be made into an elliptical prototype - * @param Statistics statistical info about samples in cluster - * @return Pointer to a new elliptical prototype data structure - * @note Exceptions: None - * @note History: 6/19/89, DSJ, Created. + * @param N number of dimensions + * @param Cluster cluster to be made into an elliptical prototype + * @param Statistics statistical info about samples in cluster + * @return Pointer to a new elliptical prototype data structure + * @note Exceptions: None + * @note History: 6/19/89, DSJ, Created. */ PROTOTYPE *NewEllipticalProto(inT16 N, CLUSTER *Cluster, @@ -1579,7 +1564,6 @@ PROTOTYPE *NewEllipticalProto(inT16 N, return (Proto); } // NewEllipticalProto - /** * This routine creates a mixed prototype data structure to * approximate the samples in the specified cluster. @@ -1588,12 +1572,12 @@ PROTOTYPE *NewEllipticalProto(inT16 N, * structure is initially filled in as though it were an * elliptical prototype. The actual distributions of the * dimensions can be altered by other routines. - * @param N number of dimensions - * @param Cluster cluster to be made into a mixed prototype - * @param Statistics statistical info about samples in cluster - * @return Pointer to a new mixed prototype data structure - * @note Exceptions: None - * @note History: 6/19/89, DSJ, Created. + * @param N number of dimensions + * @param Cluster cluster to be made into a mixed prototype + * @param Statistics statistical info about samples in cluster + * @return Pointer to a new mixed prototype data structure + * @note Exceptions: None + * @note History: 6/19/89, DSJ, Created. */ PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics) { PROTOTYPE *Proto; @@ -1609,16 +1593,15 @@ PROTOTYPE *NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics) { return (Proto); } // NewMixedProto - /** * This routine allocates memory to hold a simple prototype * data structure, i.e. 
one without independent distributions * and variances for each dimension. - * @param N number of dimensions - * @param Cluster cluster to be made into a prototype - * @return Pointer to new simple prototype - * @note Exceptions: None - * @note History: 6/19/89, DSJ, Created. + * @param N number of dimensions + * @param Cluster cluster to be made into a prototype + * @return Pointer to new simple prototype + * @note Exceptions: None + * @note History: 6/19/89, DSJ, Created. */ PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster) { PROTOTYPE *Proto; @@ -1640,7 +1623,6 @@ PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster) { return (Proto); } // NewSimpleProto - /** * This routine returns TRUE if the specified covariance * matrix indicates that all N dimensions are independent of @@ -1653,13 +1635,13 @@ PROTOTYPE *NewSimpleProto(inT16 N, CLUSTER *Cluster) { * coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj]) * The covariance matrix is assumed to be symmetric (which * should always be true). - * @param ParamDesc descriptions of each feature space dimension - * @param N number of dimensions - * @param CoVariance ptr to a covariance matrix - * @param Independence max off-diagonal correlation coefficient - * @return TRUE if dimensions are independent, FALSE otherwise - * @note Exceptions: None - * @note History: 6/4/89, DSJ, Created. + * @param ParamDesc descriptions of each feature space dimension + * @param N number of dimensions + * @param CoVariance ptr to a covariance matrix + * @param Independence max off-diagonal correlation coefficient + * @return TRUE if dimensions are independent, FALSE otherwise + * @note Exceptions: None + * @note History: 6/4/89, DSJ, Created. 
*/ BOOL8 Independent (PARAM_DESC ParamDesc[], @@ -1692,7 +1674,6 @@ inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence) { return (TRUE); } // Independent - /** * This routine returns a histogram data structure which can * be used by other routines to place samples into histogram @@ -1703,12 +1684,12 @@ inT16 N, FLOAT32 * CoVariance, FLOAT32 Independence) { * created so that it minimizes the computation time needed * to create a new bucket. * @param clusterer which keeps a bucket_cache for us. - * @param Distribution type of probability distribution to test for - * @param SampleCount number of samples that are available - * @param Confidence probability of a Type I error - * @return Bucket data structure + * @param Distribution type of probability distribution to test for + * @param SampleCount number of samples that are available + * @param Confidence probability of a Type I error + * @return Bucket data structure * @note Exceptions: none - * @note History: Thu Aug 3 12:58:10 1989, DSJ, Created. + * @note History: Thu Aug 3 12:58:10 1989, DSJ, Created. */ BUCKETS *GetBuckets(CLUSTERER* clusterer, DISTRIBUTION Distribution, @@ -1739,7 +1720,6 @@ BUCKETS *GetBuckets(CLUSTERER* clusterer, return Buckets; } // GetBuckets - /** * This routine creates a histogram data structure which can * be used by other routines to place samples into histogram @@ -1751,12 +1731,12 @@ BUCKETS *GetBuckets(CLUSTERER* clusterer, * order to make this possible, a mapping table is * computed which maps "normalized" samples into the * appropriate bucket. 
- * @param Distribution type of probability distribution to test for - * @param SampleCount number of samples that are available - * @param Confidence probability of a Type I error + * @param Distribution type of probability distribution to test for + * @param SampleCount number of samples that are available + * @param Confidence probability of a Type I error * @return Pointer to new histogram data structure - * @note Exceptions: None - * @note History: 6/4/89, DSJ, Created. + * @note Exceptions: None + * @note History: 6/4/89, DSJ, Created. */ BUCKETS *MakeBuckets(DISTRIBUTION Distribution, uinT32 SampleCount, @@ -1840,7 +1820,6 @@ BUCKETS *MakeBuckets(DISTRIBUTION Distribution, return Buckets; } // MakeBuckets - /** * This routine computes the optimum number of histogram * buckets that should be used in a chi-squared goodness of @@ -1851,7 +1830,7 @@ BUCKETS *MakeBuckets(DISTRIBUTION Distribution, * values. The table is intended for a 0.05 level of * significance (alpha). This routine assumes that it is * equally valid for other alpha's, which may not be true. - * @param SampleCount number of samples to be tested + * @param SampleCount number of samples to be tested * @return Optimum number of histogram buckets * @note Exceptions: None * @note History: 6/5/89, DSJ, Created. @@ -1874,7 +1853,6 @@ uinT16 OptimumNumberOfBuckets(uinT32 SampleCount) { return kBucketsTable[Last]; } // OptimumNumberOfBuckets - /** * This routine computes the chi-squared value which will * leave a cumulative probability of Alpha in the right tail @@ -1887,8 +1865,8 @@ uinT16 OptimumNumberOfBuckets(uinT32 SampleCount) { * chi-squared value. Therefore, once a particular chi-squared * value is computed, it is stored in the list and never * needs to be computed again. 
- * @param DegreesOfFreedom determines shape of distribution - * @param Alpha probability of right tail + * @param DegreesOfFreedom determines shape of distribution + * @param Alpha probability of right tail * @return Desired chi-squared value * @note Exceptions: none * @note History: 6/5/89, DSJ, Created. @@ -1932,19 +1910,19 @@ ComputeChiSquared (uinT16 DegreesOfFreedom, FLOAT64 Alpha) } // ComputeChiSquared - /** * This routine computes the probability density function * of a discrete normal distribution defined by the global * variables kNormalMean, kNormalVariance, and kNormalMagnitude. * Normal magnitude could, of course, be computed in terms of * the normal variance but it is precomputed for efficiency. - * @param x number to compute the normal probability density for + * @param x number to compute the normal probability density for * @note Globals: - * kNormalMean mean of a discrete normal distribution - * kNormalVariance variance of a discrete normal distribution - * kNormalMagnitude magnitude of a discrete normal distribution - * @return The value of the normal distribution at x. + * kNormalMean mean of a discrete normal distribution + * kNormalVariance variance of a discrete normal distribution + * kNormalMagnitude magnitude of a discrete normal + * distribution + * @return The value of the normal distribution at x. * @note Exceptions: None * @note History: 6/4/89, DSJ, Created. */ @@ -1955,12 +1933,11 @@ FLOAT64 NormalDensity(inT32 x) { return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance); } // NormalDensity - /** * This routine computes the probability density function * of a uniform distribution at the specified point. The * range of the distribution is from 0 to BUCKETTABLESIZE. - * @param x number to compute the uniform probability density for + * @param x number to compute the uniform probability density for * @return The value of the uniform distribution at x. * @note Exceptions: None * @note History: 6/5/89, DSJ, Created. 
@@ -1974,13 +1951,12 @@ FLOAT64 UniformDensity(inT32 x) { return (FLOAT64) 0.0; } // UniformDensity - /** * This routine computes a trapezoidal approximation to the * integral of a function over a small delta in x. - * @param f1 value of function at x1 - * @param f2 value of function at x2 - * @param Dx x2 - x1 (should always be positive) + * @param f1 value of function at x1 + * @param f2 value of function at x2 + * @param Dx x2 - x1 (should always be positive) * @return Approximation of the integral of the function from x1 to x2. * @note Exceptions: None * @note History: 6/5/89, DSJ, Created. @@ -1989,7 +1965,6 @@ FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx) { return (f1 + f2) * Dx / 2.0; } // Integral - /** * This routine counts the number of cluster samples which * fall within the various histogram buckets in Buckets. Only @@ -2002,12 +1977,12 @@ FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx) { * range and the StdDev is 1/2 the range. A dimension with * zero standard deviation cannot be statistically analyzed. * In this case, a pseudo-analysis is used. - * @param Buckets histogram buckets to count samples - * @param Cluster cluster whose samples are being analyzed - * @param Dim dimension of samples which is being analyzed - * @param ParamDesc description of the dimension - * @param Mean "mean" of the distribution - * @param StdDev "standard deviation" of the distribution + * @param Buckets histogram buckets to count samples + * @param Cluster cluster whose samples are being analyzed + * @param Dim dimension of samples which is being analyzed + * @param ParamDesc description of the dimension + * @param Mean "mean" of the distribution + * @param StdDev "standard deviation" of the distribution * @return None (the Buckets data structure is filled in) * @note Exceptions: None * @note History: 6/5/89, DSJ, Created. 
@@ -2071,16 +2046,15 @@ void FillBuckets(BUCKETS *Buckets, } } // FillBuckets - /** * This routine determines which bucket x falls into in the * discrete normal distribution defined by kNormalMean * and kNormalStdDev. x values which exceed the range of * the discrete distribution are clipped. - * @param ParamDesc used to identify circular dimensions - * @param x value to be normalized - * @param Mean mean of normal distribution - * @param StdDev standard deviation of normal distribution + * @param ParamDesc used to identify circular dimensions + * @param x value to be normalized + * @param Mean mean of normal distribution + * @param StdDev standard deviation of normal distribution * @return Bucket number into which x falls * @note Exceptions: None * @note History: 6/5/89, DSJ, Created. @@ -2107,16 +2081,15 @@ uinT16 NormalBucket(PARAM_DESC *ParamDesc, return (uinT16) floor((FLOAT64) X); } // NormalBucket - /** * This routine determines which bucket x falls into in the * discrete uniform distribution defined by * BUCKETTABLESIZE. x values which exceed the range of * the discrete distribution are clipped. - * @param ParamDesc used to identify circular dimensions - * @param x value to be normalized - * @param Mean center of range of uniform distribution - * @param StdDev 1/2 the range of the uniform distribution + * @param ParamDesc used to identify circular dimensions + * @param x value to be normalized + * @param Mean center of range of uniform distribution + * @param StdDev 1/2 the range of the uniform distribution * @return Bucket number into which x falls * @note Exceptions: None * @note History: 6/5/89, DSJ, Created. @@ -2143,7 +2116,6 @@ uinT16 UniformBucket(PARAM_DESC *ParamDesc, return (uinT16) floor((FLOAT64) X); } // UniformBucket - /** * This routine performs a chi-square goodness of fit test * on the histogram data in the Buckets data structure. 
TRUE @@ -2151,7 +2123,7 @@ uinT16 UniformBucket(PARAM_DESC *ParamDesc, * distribution which was specified when the Buckets * structure was originally created. Otherwise FALSE is * returned. - * @param Buckets histogram data to perform chi-square test on + * @param Buckets histogram data to perform chi-square test on * @return TRUE if samples match distribution, FALSE otherwise * @note Exceptions: None * @note History: 6/5/89, DSJ, Created. @@ -2176,11 +2148,10 @@ BOOL8 DistributionOK(BUCKETS *Buckets) { return TRUE; } // DistributionOK - /** * This routine frees the memory used by the statistics * data structure. - * @param Statistics pointer to data structure to be freed + * @param Statistics pointer to data structure to be freed * @return None * @note Exceptions: None * @note History: 6/5/89, DSJ, Created. @@ -2192,7 +2163,6 @@ void FreeStatistics(STATISTICS *Statistics) { memfree(Statistics); } // FreeStatistics - /** * This routine properly frees the memory used by a BUCKETS. * @@ -2204,13 +2174,12 @@ void FreeBuckets(BUCKETS *buckets) { Efree(buckets); } // FreeBuckets - /** * This routine frees the memory consumed by the specified * cluster and all of its subclusters. This is done by * recursive calls to FreeCluster(). * - * @param Cluster pointer to cluster to be freed + * @param Cluster pointer to cluster to be freed * * @return None * @@ -2225,7 +2194,6 @@ void FreeCluster(CLUSTER *Cluster) { } } // FreeCluster - /** * This routine computes the degrees of freedom that should * be used in a chi-squared test with the specified number of @@ -2234,8 +2202,8 @@ void FreeCluster(CLUSTER *Cluster) { * computed more easily. This will cause the value of * chi-squared to be higher than the optimum value, resulting * in the chi-square test being more lenient than optimum. 
- * @param Distribution distribution being tested for - * @param HistogramBuckets number of buckets in chi-square test + * @param Distribution distribution being tested for + * @param HistogramBuckets number of buckets in chi-square test * @return The number of degrees of freedom for a chi-square test * @note Exceptions: none * @note History: Thu Aug 3 14:04:18 1989, DSJ, Created. @@ -2252,7 +2220,6 @@ uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets) { } // DegreesOfFreedom - /** * This routine is used to search a list of histogram data * structures to find one with the specified number of @@ -2272,7 +2239,6 @@ int NumBucketsMatch(void *arg1, // BUCKETS *Histogram, } // NumBucketsMatch - /** * This routine is used to search a list for a list node * whose contents match Key. It is called by the list @@ -2287,13 +2253,12 @@ int ListEntryMatch(void *arg1, //ListNode } // ListEntryMatch - /** * This routine multiplies each ExpectedCount histogram entry * by NewSampleCount/OldSampleCount so that the histogram * is now adjusted to the new sample count. - * @param Buckets histogram data structure to adjust - * @param NewSampleCount new sample count to adjust to + * @param Buckets histogram data structure to adjust + * @param NewSampleCount new sample count to adjust to * @return none * @note Exceptions: none * @note History: Thu Aug 3 14:31:14 1989, DSJ, Created. @@ -2313,11 +2278,10 @@ void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount) { } // AdjustBuckets - /** * This routine sets the bucket counts in the specified histogram * to zero. - * @param Buckets histogram data structure to init + * @param Buckets histogram data structure to init * @return none * @note Exceptions: none * @note History: Thu Aug 3 14:31:14 1989, DSJ, Created. 
@@ -2331,7 +2295,6 @@ void InitBuckets(BUCKETS *Buckets) { } // InitBuckets - /** * This routine is used to search a list of structures which * hold pre-computed chi-squared values for a chi-squared @@ -2355,14 +2318,13 @@ int AlphaMatch(void *arg1, //CHISTRUCT *ChiStruct } // AlphaMatch - /** * This routine allocates a new data structure which is used * to hold a chi-squared value along with its associated * number of degrees of freedom and alpha value. * - * @param DegreesOfFreedom degrees of freedom for new chi value - * @param Alpha confidence level for new chi value + * @param DegreesOfFreedom degrees of freedom for new chi value + * @param Alpha confidence level for new chi value * @return none * @note Exceptions: none * @note History: Fri Aug 4 11:04:59 1989, DSJ, Created. @@ -2377,7 +2339,6 @@ CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha) { } // NewChiStruct - /** * This routine attempts to find an x value at which Function * goes to zero (i.e. a root of the function ). It will only @@ -2385,10 +2346,10 @@ CHISTRUCT *NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha) { * are no extrema between the solution and the InitialGuess. * The algorithms used are extremely primitive. * - * @param Function function whose zero is to be found - * @param FunctionParams arbitrary data to pass to function - * @param InitialGuess point to start solution search at - * @param Accuracy maximum allowed error + * @param Function function whose zero is to be found + * @param FunctionParams arbitrary data to pass to function + * @param InitialGuess point to start solution search at + * @param Accuracy maximum allowed error * @return Solution of function ( x for which f(x) = 0 ). * @note Exceptions: none * @note History: Fri Aug 4 11:08:59 1989, DSJ, Created. 
@@ -2440,7 +2401,6 @@ void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy) } // Solve - /** * This routine computes the area under a chi density curve * from 0 to x, minus the desired area under the curve. The @@ -2455,8 +2415,8 @@ void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy) * integrating the chi density curve in parts to obtain * a series that can be used to compute the area under the * curve. - * @param ChiParams contains degrees of freedom and alpha - * @param x value of chi-squared to evaluate + * @param ChiParams contains degrees of freedom and alpha + * @param x value of chi-squared to evaluate * @return Error between actual and desired area under the chi curve. * @note Exceptions: none * @note History: Fri Aug 4 12:48:41 1989, DSJ, Created. @@ -2480,7 +2440,6 @@ FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x) { } // ChiArea - /** * This routine looks at all samples in the specified cluster. * It computes a running estimate of the percentage of the @@ -2498,10 +2457,10 @@ FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x) { * contained in the same cluster, then the cluster should be * split. * - * @param Clusterer data structure holding cluster tree - * @param Cluster cluster containing samples to be tested - * @param MaxIllegal max percentage of samples allowed to have - * more than 1 feature in the cluster + * @param Clusterer data structure holding cluster tree + * @param Cluster cluster containing samples to be tested + * @param MaxIllegal max percentage of samples allowed to have + * more than 1 feature in the cluster * @return TRUE if the cluster should be split, FALSE otherwise. * @note Exceptions: none * @note History: Wed Aug 30 11:13:05 1989, DSJ, Created. @@ -2562,7 +2521,7 @@ CLUSTER * Cluster, FLOAT32 MaxIllegal) } // MultipleCharSamples /** - * Compute the inverse of a matrix using LU decomposition with partial pivoting. + * Compute the inverse of a matrix using LU decomposition with partial pivoting. 
* The return value is the sum of norms of the off-diagonal terms of the * product of a and inv. (A measure of the error.) */ diff --git a/classify/clusttool.cpp b/classify/clusttool.cpp index d86c3a24..d81ec5a3 100644 --- a/classify/clusttool.cpp +++ b/classify/clusttool.cpp @@ -1,10 +1,10 @@ /****************************************************************************** - ** Filename: clustertool.c - ** Purpose: Misc. tools for use with the clustering routines - ** Author: Dan Johnson - ** History: 6/6/89, DSJ, Created. + ** Filename: clustertool.c + ** Purpose: Misc. tools for use with the clustering routines + ** Author: Dan Johnson + ** History: 6/6/89, DSJ, Created. ** - ** (c) Copyright Hewlett-Packard Company, 1988. + ** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at @@ -25,10 +25,14 @@ #include #include +using tesseract::TFile; + //---------------Global Data Definitions and Declarations-------------------- -#define TOKENSIZE 80 //< max size of tokens read from an input file -#define MAXSAMPLESIZE 65535 //< max num of dimensions in feature space -//#define MAXBLOCKSIZE 65535 //< max num of samples in a character (block size) +#define TOKENSIZE 80 //< max size of tokens read from an input file +#define QUOTED_TOKENSIZE "79" +#define MAXSAMPLESIZE 65535 //< max num of dimensions in feature space +//#define MAXBLOCKSIZE 65535 //< max num of samples in a character (block +// size) /** * This routine reads a single integer from the specified @@ -37,14 +41,17 @@ * @param File open text file to read sample size from * @return Sample size * @note Globals: None - * @note Exceptions: ILLEGALSAMPLESIZE illegal format or range + * @note Exceptions: ILLEGALSAMPLESIZE illegal format or range * @note History: 6/6/89, DSJ, Created. 
*/ -uinT16 ReadSampleSize(FILE *File) { - int SampleSize; +uinT16 ReadSampleSize(TFile *fp) { + int SampleSize = 0; - if ((tfscanf(File, "%d", &SampleSize) != 1) || - (SampleSize < 0) || (SampleSize > MAXSAMPLESIZE)) + const int kMaxLineSize = 100; + char line[kMaxLineSize]; + if (fp->FGets(line, kMaxLineSize) == nullptr || + sscanf(line, "%d", &SampleSize) != 1 || (SampleSize < 0) || + (SampleSize > MAXSAMPLESIZE)) DoError (ILLEGALSAMPLESIZE, "Illegal sample size"); return (SampleSize); } @@ -63,30 +70,28 @@ uinT16 ReadSampleSize(FILE *File) { * @note Globals: None * @note History: 6/6/89, DSJ, Created. */ -PARAM_DESC *ReadParamDesc(FILE *File, uinT16 N) { - int i; +PARAM_DESC *ReadParamDesc(TFile *fp, uinT16 N) { PARAM_DESC *ParamDesc; - char Token[TOKENSIZE]; + char linear_token[TOKENSIZE], essential_token[TOKENSIZE]; ParamDesc = (PARAM_DESC *) Emalloc (N * sizeof (PARAM_DESC)); - for (i = 0; i < N; i++) { - if (tfscanf(File, "%s", Token) != 1) - DoError (ILLEGALCIRCULARSPEC, - "Illegal circular/linear specification"); - if (Token[0] == 'c') + for (int i = 0; i < N; i++) { + const int kMaxLineSize = TOKENSIZE * 4; + char line[kMaxLineSize]; + if (fp->FGets(line, kMaxLineSize) == nullptr || + sscanf(line, "%" QUOTED_TOKENSIZE "s %" QUOTED_TOKENSIZE "s %f %f", + linear_token, essential_token, &ParamDesc[i].Min, + &ParamDesc[i].Max) != 4) + DoError(ILLEGALCIRCULARSPEC, "Illegal Parameter specification"); + if (linear_token[0] == 'c') ParamDesc[i].Circular = TRUE; else ParamDesc[i].Circular = FALSE; - if (tfscanf(File, "%s", Token) != 1) - DoError (ILLEGALESSENTIALSPEC, - "Illegal essential/non-essential spec"); - if (Token[0] == 'e') + if (linear_token[0] == 'e') ParamDesc[i].NonEssential = FALSE; else ParamDesc[i].NonEssential = TRUE; - if (tfscanf(File, "%f%f", &(ParamDesc[i].Min), &(ParamDesc[i].Max)) != 2) - DoError (ILLEGALMINMAXSPEC, "Illegal min or max specification"); ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min; ParamDesc[i].HalfRange = 
ParamDesc[i].Range / 2; ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2; @@ -110,123 +115,68 @@ PARAM_DESC *ReadParamDesc(FILE *File, uinT16 N) { * @note Globals: None * @note History: 6/6/89, DSJ, Created. */ -PROTOTYPE *ReadPrototype(FILE *File, uinT16 N) { - char Token[TOKENSIZE]; - int Status; +PROTOTYPE *ReadPrototype(TFile *fp, uinT16 N) { + char sig_token[TOKENSIZE], shape_token[TOKENSIZE]; PROTOTYPE *Proto; int SampleCount; int i; - if ((Status = tfscanf(File, "%s", Token)) == 1) { - Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE)); - Proto->Cluster = NULL; - if (Token[0] == 's') - Proto->Significant = TRUE; - else - Proto->Significant = FALSE; - - Proto->Style = ReadProtoStyle (File); - - if ((tfscanf(File, "%d", &SampleCount) != 1) || (SampleCount < 0)) - DoError (ILLEGALSAMPLECOUNT, "Illegal sample count"); - Proto->NumSamples = SampleCount; - - Proto->Mean = ReadNFloats (File, N, NULL); - if (Proto->Mean == NULL) - DoError (ILLEGALMEANSPEC, "Illegal prototype mean"); - - switch (Proto->Style) { - case spherical: - if (ReadNFloats (File, 1, &(Proto->Variance.Spherical)) == NULL) - DoError (ILLEGALVARIANCESPEC, "Illegal prototype variance"); - Proto->Magnitude.Spherical = - 1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical)); - Proto->TotalMagnitude = - pow (Proto->Magnitude.Spherical, (float) N); - Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); - Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical; - Proto->Distrib = NULL; - break; - case elliptical: - Proto->Variance.Elliptical = ReadNFloats (File, N, NULL); - if (Proto->Variance.Elliptical == NULL) - DoError (ILLEGALVARIANCESPEC, "Illegal prototype variance"); - Proto->Magnitude.Elliptical = - (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); - Proto->Weight.Elliptical = - (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); - Proto->TotalMagnitude = 1.0; - for (i = 0; i < N; i++) { - Proto->Magnitude.Elliptical[i] = - 1.0 / - sqrt ((double) (2.0 * PI * 
Proto->Variance.Elliptical[i])); - Proto->Weight.Elliptical[i] = - 1.0 / Proto->Variance.Elliptical[i]; - Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; - } - Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); - Proto->Distrib = NULL; - break; - case mixed: - Proto->Distrib = - (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION)); - for (i = 0; i < N; i++) { - if (tfscanf(File, "%s", Token) != 1) - DoError (ILLEGALDISTRIBUTION, - "Illegal prototype distribution"); - switch (Token[0]) { - case 'n': - Proto->Distrib[i] = normal; - break; - case 'u': - Proto->Distrib[i] = uniform; - break; - case 'r': - Proto->Distrib[i] = D_random; - break; - default: - DoError (ILLEGALDISTRIBUTION, - "Illegal prototype distribution"); - } - } - Proto->Variance.Elliptical = ReadNFloats (File, N, NULL); - if (Proto->Variance.Elliptical == NULL) - DoError (ILLEGALVARIANCESPEC, "Illegal prototype variance"); - Proto->Magnitude.Elliptical = - (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); - Proto->Weight.Elliptical = - (FLOAT32 *) Emalloc (N * sizeof (FLOAT32)); - Proto->TotalMagnitude = 1.0; - for (i = 0; i < N; i++) { - switch (Proto->Distrib[i]) { - case normal: - Proto->Magnitude.Elliptical[i] = 1.0 / - sqrt ((double) - (2.0 * PI * Proto->Variance.Elliptical[i])); - Proto->Weight.Elliptical[i] = - 1.0 / Proto->Variance.Elliptical[i]; - break; - case uniform: - case D_random: - Proto->Magnitude.Elliptical[i] = 1.0 / - (2.0 * Proto->Variance.Elliptical[i]); - break; - case DISTRIBUTION_COUNT: - ASSERT_HOST(!"Distribution count not allowed!"); - } - Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; - } - Proto->LogMagnitude = log ((double) Proto->TotalMagnitude); - break; - } - return (Proto); + const int kMaxLineSize = TOKENSIZE * 4; + char line[kMaxLineSize]; + if (fp->FGets(line, kMaxLineSize) == nullptr || + sscanf(line, "%" QUOTED_TOKENSIZE "s %" QUOTED_TOKENSIZE "s %d", + sig_token, shape_token, &SampleCount) != 3) { + tprintf("Invalid prototype: %s\n", 
line); + return nullptr; } - else if (Status == EOF) - return (NULL); - else { - DoError (ILLEGALSIGNIFICANCESPEC, "Illegal significance specification"); - return (NULL); + Proto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE)); + Proto->Cluster = NULL; + if (sig_token[0] == 's') + Proto->Significant = TRUE; + else + Proto->Significant = FALSE; + + Proto->Style = ReadProtoStyle(shape_token); + + if (SampleCount < 0) DoError(ILLEGALSAMPLECOUNT, "Illegal sample count"); + Proto->NumSamples = SampleCount; + + Proto->Mean = ReadNFloats(fp, N, NULL); + if (Proto->Mean == NULL) DoError(ILLEGALMEANSPEC, "Illegal prototype mean"); + + switch (Proto->Style) { + case spherical: + if (ReadNFloats(fp, 1, &(Proto->Variance.Spherical)) == NULL) + DoError(ILLEGALVARIANCESPEC, "Illegal prototype variance"); + Proto->Magnitude.Spherical = + 1.0 / sqrt((double)(2.0 * PI * Proto->Variance.Spherical)); + Proto->TotalMagnitude = pow(Proto->Magnitude.Spherical, (float)N); + Proto->LogMagnitude = log((double)Proto->TotalMagnitude); + Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical; + Proto->Distrib = NULL; + break; + case elliptical: + Proto->Variance.Elliptical = ReadNFloats(fp, N, NULL); + if (Proto->Variance.Elliptical == NULL) + DoError(ILLEGALVARIANCESPEC, "Illegal prototype variance"); + Proto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32)); + Proto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32)); + Proto->TotalMagnitude = 1.0; + for (i = 0; i < N; i++) { + Proto->Magnitude.Elliptical[i] = + 1.0 / sqrt((double)(2.0 * PI * Proto->Variance.Elliptical[i])); + Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i]; + Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i]; + } + Proto->LogMagnitude = log((double)Proto->TotalMagnitude); + Proto->Distrib = NULL; + break; + default: + Efree(Proto); + tprintf("Invalid prototype style\n"); + return nullptr; } + return Proto; } /** @@ -238,30 +188,19 @@ PROTOTYPE *ReadPrototype(FILE *File, uinT16 N) 
{ * @note Exceptions: ILLEGALSTYLESPEC illegal prototype style specification * @note History: 6/8/89, DSJ, Created. */ -PROTOSTYLE ReadProtoStyle(FILE *File) { - char Token[TOKENSIZE]; - PROTOSTYLE Style; - - if (tfscanf(File, "%s", Token) != 1) - DoError (ILLEGALSTYLESPEC, "Illegal prototype style specification"); - switch (Token[0]) { +PROTOSTYLE ReadProtoStyle(const char *shape) { + switch (shape[0]) { case 's': - Style = spherical; - break; + return spherical; case 'e': - Style = elliptical; - break; - case 'm': - Style = mixed; - break; + return elliptical; case 'a': - Style = automatic; - break; + return automatic; default: - Style = elliptical; - DoError (ILLEGALSTYLESPEC, "Illegal prototype style specification"); + break; } - return (Style); + tprintf("Invalid prototype style specification:%s\n", shape); + return elliptical; } /** @@ -278,28 +217,30 @@ PROTOSTYLE ReadProtoStyle(FILE *File) { * @note Exceptions: ILLEGALFLOAT * @note History: 6/6/89, DSJ, Created. */ -FLOAT32* ReadNFloats(FILE * File, uinT16 N, FLOAT32 Buffer[]) { +FLOAT32 *ReadNFloats(TFile *fp, uinT16 N, FLOAT32 Buffer[]) { + const int kMaxLineSize = 1024; + char line[kMaxLineSize]; + if (fp->FGets(line, kMaxLineSize) == nullptr) { + tprintf("Hit EOF in ReadNFloats!\n"); + return nullptr; + } bool needs_free = false; - int i; - int NumFloatsRead; if (Buffer == NULL) { Buffer = reinterpret_cast(Emalloc(N * sizeof(FLOAT32))); needs_free = true; } - for (i = 0; i < N; i++) { - NumFloatsRead = tfscanf(File, "%f", &(Buffer[i])); - if (NumFloatsRead != 1) { - if ((NumFloatsRead == EOF) && (i == 0)) { - if (needs_free) { - Efree(Buffer); - } - return NULL; - } else { - DoError(ILLEGALFLOAT, "Illegal float specification"); - } + char *startptr = line; + for (int i = 0; i < N; i++) { + char *endptr; + Buffer[i] = strtof(startptr, &endptr); + if (endptr == startptr) { + tprintf("Read of %d floats failed!\n", N); + if (needs_free) Efree(Buffer); + return nullptr; } + startptr = endptr; } return 
Buffer; } @@ -315,8 +256,7 @@ FLOAT32* ReadNFloats(FILE * File, uinT16 N, FLOAT32 Buffer[]) { * @note Exceptions: None * @note History: 6/6/89, DSJ, Created. */ -void -WriteParamDesc (FILE * File, uinT16 N, PARAM_DESC ParamDesc[]) { +void WriteParamDesc(FILE *File, uinT16 N, const PARAM_DESC ParamDesc[]) { int i; for (i = 0; i < N; i++) { @@ -446,15 +386,10 @@ void WriteProtoStyle(FILE *File, PROTOSTYLE ProtoStyle) { * @note History: 6/12/89, DSJ, Created. */ -void WriteProtoList( - FILE *File, - uinT16 N, - PARAM_DESC ParamDesc[], - LIST ProtoList, - BOOL8 WriteSigProtos, - BOOL8 WriteInsigProtos) -{ - PROTOTYPE *Proto; +void WriteProtoList(FILE *File, uinT16 N, PARAM_DESC ParamDesc[], + LIST ProtoList, BOOL8 WriteSigProtos, + BOOL8 WriteInsigProtos) { + PROTOTYPE *Proto; /* write file header */ fprintf(File,"%0d\n",N); @@ -464,8 +399,8 @@ void WriteProtoList( iterate(ProtoList) { Proto = (PROTOTYPE *) first_node ( ProtoList ); - if (( Proto->Significant && WriteSigProtos ) || - ( ! Proto->Significant && WriteInsigProtos ) ) - WritePrototype( File, N, Proto ); + if ((Proto->Significant && WriteSigProtos) || + (!Proto->Significant && WriteInsigProtos)) + WritePrototype(File, N, Proto); } } diff --git a/classify/clusttool.h b/classify/clusttool.h index e82fa1ef..e4c22690 100644 --- a/classify/clusttool.h +++ b/classify/clusttool.h @@ -1,10 +1,10 @@ /****************************************************************************** - ** Filename: clusttool.h - ** Purpose: Definition of clustering utility tools - ** Author: Dan Johnson - ** History: 6/6/89, DSJ, Created. + ** Filename: clusttool.h + ** Purpose: Definition of clustering utility tools + ** Author: Dan Johnson + ** History: 6/6/89, DSJ, Created. ** - ** (c) Copyright Hewlett-Packard Company, 1988. + ** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. 
** You may obtain a copy of the License at @@ -15,28 +15,30 @@ ** See the License for the specific language governing permissions and ** limitations under the License. ******************************************************************************/ -#ifndef __CLUSTERTOOL__ -#define __CLUSTERTOOL__ + +#ifndef TESSERACT_CLASSIFY_CLUSTTOOL_H_ +#define TESSERACT_CLASSIFY_CLUSTTOOL_H_ //--------------------------Include Files--------------------------------------- -#include "host.h" -#include "cluster.h" #include +#include "cluster.h" +#include "host.h" +#include "serialis.h" /*------------------------------------------------------------------------- Public Function Prototype --------------------------------------------------------------------------*/ -uinT16 ReadSampleSize(FILE *File); +uinT16 ReadSampleSize(tesseract::TFile *fp); -PARAM_DESC *ReadParamDesc(FILE *File, uinT16 N); +PARAM_DESC *ReadParamDesc(tesseract::TFile *fp, uinT16 N); -PROTOTYPE *ReadPrototype(FILE *File, uinT16 N); +PROTOTYPE *ReadPrototype(tesseract::TFile *fp, uinT16 N); -PROTOSTYLE ReadProtoStyle(FILE *File); +PROTOSTYLE ReadProtoStyle(const char *style); -FLOAT32 *ReadNFloats (FILE * File, uinT16 N, FLOAT32 Buffer[]); +FLOAT32 *ReadNFloats(tesseract::TFile *fp, uinT16 N, FLOAT32 Buffer[]); -void WriteParamDesc (FILE * File, uinT16 N, PARAM_DESC ParamDesc[]); +void WriteParamDesc(FILE *File, uinT16 N, const PARAM_DESC ParamDesc[]); void WritePrototype(FILE *File, uinT16 N, PROTOTYPE *Proto); @@ -44,13 +46,9 @@ void WriteNFloats (FILE * File, uinT16 N, FLOAT32 Array[]); void WriteProtoStyle(FILE *File, PROTOSTYLE ProtoStyle); -void WriteProtoList( - FILE *File, - uinT16 N, - PARAM_DESC ParamDesc[], - LIST ProtoList, - BOOL8 WriteSigProtos, - BOOL8 WriteInsigProtos); +void WriteProtoList(FILE *File, uinT16 N, PARAM_DESC ParamDesc[], + LIST ProtoList, BOOL8 WriteSigProtos, + BOOL8 WriteInsigProtos); //--------------Global Data Definitions and Declarations--------------------- // define errors that 
can be trapped @@ -65,4 +63,4 @@ void WriteProtoList( #define ILLEGALDISTRIBUTION 5008 #define ILLEGALFLOAT 5009 #define ILLEGALESSENTIALSPEC 5013 -#endif +#endif // TESSERACT_CLASSIFY_CLUSTTOOL_H_ diff --git a/classify/cutoffs.cpp b/classify/cutoffs.cpp index 4f641714..cdcaf361 100644 --- a/classify/cutoffs.cpp +++ b/classify/cutoffs.cpp @@ -1,10 +1,10 @@ /****************************************************************************** - ** Filename: cutoffs.c - ** Purpose: Routines to manipulate an array of class cutoffs. - ** Author: Dan Johnson - ** History: Wed Feb 20 09:28:51 1991, DSJ, Created. + ** Filename: cutoffs.c + ** Purpose: Routines to manipulate an array of class cutoffs. + ** Author: Dan Johnson + ** History: Wed Feb 20 09:28:51 1991, DSJ, Created. ** - ** (c) Copyright Hewlett-Packard Company, 1988. + ** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at @@ -49,7 +49,7 @@ namespace tesseract { * @note Exceptions: none * @note History: Wed Feb 20 09:38:26 1991, DSJ, Created. 
*/ -void Classify::ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset, +void Classify::ReadNewCutoffs(TFile* fp, bool swap, CLASS_CUTOFF_ARRAY Cutoffs) { char Class[UNICHAR_LEN + 1]; CLASS_ID ClassId; @@ -57,23 +57,24 @@ void Classify::ReadNewCutoffs(FILE *CutoffFile, bool swap, inT64 end_offset, int i; if (shape_table_ != NULL) { - if (!shapetable_cutoffs_.DeSerialize(swap, CutoffFile)) { + if (!shapetable_cutoffs_.DeSerialize(swap, fp)) { tprintf("Error during read of shapetable pffmtable!\n"); } } for (i = 0; i < MAX_NUM_CLASSES; i++) Cutoffs[i] = MAX_CUTOFF; - while ((end_offset < 0 || ftell(CutoffFile) < end_offset) && - tfscanf(CutoffFile, "%" REALLY_QUOTE_IT(UNICHAR_LEN) "s %d", - Class, &Cutoff) == 2) { + const int kMaxLineSize = 100; + char line[kMaxLineSize]; + while (fp->FGets(line, kMaxLineSize) != nullptr && + sscanf(line, "%" REALLY_QUOTE_IT(UNICHAR_LEN) "s %d", Class, + &Cutoff) == 2) { if (strcmp(Class, "NULL") == 0) { ClassId = unicharset.unichar_to_id(" "); } else { ClassId = unicharset.unichar_to_id(Class); } Cutoffs[ClassId] = Cutoff; - SkipNewline(CutoffFile); } } diff --git a/classify/featdefs.cpp b/classify/featdefs.cpp index ad7b7996..dd31f91d 100644 --- a/classify/featdefs.cpp +++ b/classify/featdefs.cpp @@ -1,10 +1,10 @@ /****************************************************************************** - ** Filename: featdefs.c - ** Purpose: Definitions of currently defined feature types. - ** Author: Dan Johnson - ** History: Mon May 21 10:26:21 1990, DSJ, Created. + ** Filename: featdefs.c + ** Purpose: Definitions of currently defined feature types. + ** Author: Dan Johnson + ** History: Mon May 21 10:26:21 1990, DSJ, Created. ** - ** (c) Copyright Hewlett-Packard Company, 1988. + ** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. 
** You may obtain a copy of the License at @@ -289,13 +289,13 @@ CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, * the feature type for the feature with the specified short * name. Trap an error if the specified name is not found. * - * Globals: + * Globals: * - none * * @param FeatureDefs definitions of feature types/extractors * @param ShortName short name of a feature type * @return Feature type which corresponds to ShortName. - * @note Exceptions: + * @note Exceptions: * - ILLEGAL_SHORT_NAME * @note History: Wed May 23 15:36:05 1990, DSJ, Created. */ diff --git a/classify/featdefs.h b/classify/featdefs.h index 704bbdfd..7c168f3d 100644 --- a/classify/featdefs.h +++ b/classify/featdefs.h @@ -77,7 +77,7 @@ int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, Global Data Definitions and Declarations ----------------------------------------------------------------------------**/ extern const FEATURE_DESC_STRUCT MicroFeatureDesc; -extern const FEATURE_DESC_STRUCT PicoFeatDesc; +extern TESS_API const FEATURE_DESC_STRUCT PicoFeatDesc; extern const FEATURE_DESC_STRUCT CharNormDesc; extern const FEATURE_DESC_STRUCT OutlineFeatDesc; extern const FEATURE_DESC_STRUCT IntFeatDesc; diff --git a/classify/fpoint.cpp b/classify/fpoint.cpp index 854bea7b..ff5b7b7c 100644 --- a/classify/fpoint.cpp +++ b/classify/fpoint.cpp @@ -1,10 +1,10 @@ /****************************************************************************** - ** Filename: fpoint.c - ** Purpose: Abstract data type for a 2D point (floating point coords) - ** Author: Dan Johnson - ** History: Thu Apr 12 10:44:15 1990, DSJ, Created. + ** Filename: fpoint.c + ** Purpose: Abstract data type for a 2D point (floating point coords) + ** Author: Dan Johnson + ** History: Thu Apr 12 10:44:15 1990, DSJ, Created. ** - ** (c) Copyright Hewlett-Packard Company, 1988. + ** (c) Copyright Hewlett-Packard Company, 1988. 
** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at @@ -58,5 +58,4 @@ FLOAT32 NormalizedAngleFrom(FPOINT *Point1, if (Angle < 0.0 || Angle >= FullScale) Angle = 0.0; return (Angle); - } diff --git a/classify/intfeaturemap.h b/classify/intfeaturemap.h index 55c5b5cf..5c5a54b8 100644 --- a/classify/intfeaturemap.h +++ b/classify/intfeaturemap.h @@ -18,8 +18,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CLASSIFY_INTFEATUREMAP_H__ -#define TESSERACT_CLASSIFY_INTFEATUREMAP_H__ +#ifndef TESSERACT_CLASSIFY_INTFEATUREMAP_H_ +#define TESSERACT_CLASSIFY_INTFEATUREMAP_H_ #include "intfeaturespace.h" #include "indexmapbidi.h" @@ -160,4 +160,4 @@ class IntFeatureMap { } // namespace tesseract. -#endif // TESSERACT_CLASSIFY_INTFEATUREMAP_H__ +#endif // TESSERACT_CLASSIFY_INTFEATUREMAP_H_ diff --git a/classify/intfeaturespace.h b/classify/intfeaturespace.h index e1e8e6ec..2b84e390 100644 --- a/classify/intfeaturespace.h +++ b/classify/intfeaturespace.h @@ -17,8 +17,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_CLASSIFY_INTFEATURESPACE_H__ -#define TESSERACT_CLASSIFY_INTFEATURESPACE_H__ +#ifndef TESSERACT_CLASSIFY_INTFEATURESPACE_H_ +#define TESSERACT_CLASSIFY_INTFEATURESPACE_H_ #include "genericvector.h" #include "intproto.h" @@ -106,5 +106,4 @@ class IntFeatureSpace { } // namespace tesseract. 
- -#endif // TESSERACT_CLASSIFY_INTFEATURESPACE_H__ +#endif // TESSERACT_CLASSIFY_INTFEATURESPACE_H_ diff --git a/classify/intfx.cpp b/classify/intfx.cpp index 78aa59bb..9c9870a2 100644 --- a/classify/intfx.cpp +++ b/classify/intfx.cpp @@ -520,7 +520,7 @@ bool ExtractIntFeat(const TBLOB& blob, tesseract::Classify::ExtractFeatures(blob, nonlinear_norm, &bl_features, &cn_features, results, NULL); - if (bl_features.size() == 0 || cn_features.size() == 0 || + if (bl_features.empty() || cn_features.empty() || bl_features.size() > MAX_NUM_INT_FEATURES || cn_features.size() > MAX_NUM_INT_FEATURES) { return false; // Feature extraction failed. diff --git a/classify/intmatcher.cpp b/classify/intmatcher.cpp index 8fc135ea..ff999608 100644 --- a/classify/intmatcher.cpp +++ b/classify/intmatcher.cpp @@ -295,7 +295,8 @@ class ClassPruner { HeapSort(num_classes_, sort_key_, sort_index_); } - /** Prints debug info on the class pruner matches for the pruned classes only. */ + /** Prints debug info on the class pruner matches for the pruned classes only. + */ void DebugMatch(const Classify& classify, const INT_TEMPLATES_STRUCT* int_templates, const INT_FEATURE_STRUCT* features) const { @@ -370,8 +371,9 @@ class ClassPruner { private: /** Array[rounded_classes_] of initial counts for each class. */ int *class_count_; - /// Array[rounded_classes_] of modified counts for each class after normalizing - /// for expected number of features, disabled classes, fragments, and xheights. + /// Array[rounded_classes_] of modified counts for each class after + /// normalizing for expected number of features, disabled classes, fragments, + /// and xheights. int *norm_count_; /** Array[rounded_classes_ +1] of pruned counts that gets sorted */ int *sort_key_; @@ -402,8 +404,9 @@ class ClassPruner { * normalization process (by CLASS_INDEX) * @param expected_num_features Array of expected number of features * for each class (by CLASS_INDEX) - * @param results Sorted Array of pruned classes. 
Must be an array - * of size at least int_templates->NumClasses. + * @param results Sorted Array of pruned classes. Must be an + * array of size at least + * int_templates->NumClasses. * @param keep_this */ int Classify::PruneClasses(const INT_TEMPLATES_STRUCT* int_templates, @@ -606,7 +609,6 @@ int IntegerMatcher::FindGoodProtos( return NumGoodProtos; } - /** * FindBadFeatures finds all features with maximum feature-evidence < * AdaptFeatureThresh. The list is ordered by increasing feature number. @@ -701,7 +703,6 @@ void IntegerMatcher::Init(tesseract::IntParam *classify_debug_level) { evidence_mult_mask_ = ((1 << kIntEvidenceTruncBits) - 1); } - /*---------------------------------------------------------------------------- Private Code ----------------------------------------------------------------------------*/ @@ -717,8 +718,6 @@ void ScratchEvidence::ClearFeatureEvidence(const INT_CLASS class_template) { class_template->NumConfigs * sizeof(feature_evidence_[0])); } - - /** * Print debugging information for Configuations * @return none @@ -742,7 +741,6 @@ void IMDebugConfiguration(int FeatureNum, cprintf ("\n"); } - /** * Print debugging information for Configuations * @return none @@ -795,10 +793,10 @@ int IntegerMatcher::UpdateTablesForFeature( uinT32 XFeatureAddress; uinT32 YFeatureAddress; uinT32 ThetaFeatureAddress; - uinT8 *UINT8Pointer; + uinT8* UINT8Pointer; int ProtoIndex; uinT8 Temp; - int *IntPointer; + int* IntPointer; int ConfigNum; inT32 M3; inT32 A3; @@ -916,7 +914,6 @@ int IntegerMatcher::UpdateTablesForFeature( return SumOverConfigs; } - /** * Print debugging information for Configuations * @return none @@ -1165,8 +1162,6 @@ void ScratchEvidence::UpdateSumOfProtoEvidences( } } - - /** * Normalize Sum of Proto and Feature Evidence by dividing by the sum of * the Feature Lengths and the Proto Lengths for each configuration. 
@@ -1180,7 +1175,6 @@ void ScratchEvidence::NormalizeSums( } } - /** * Find the best match for the current class and update the Result * with the configuration and match rating. diff --git a/classify/intmatcher.h b/classify/intmatcher.h index 46dbfc5a..df678d75 100644 --- a/classify/intmatcher.h +++ b/classify/intmatcher.h @@ -1,10 +1,10 @@ /****************************************************************************** - ** Filename: intmatcher.h - ** Purpose: Interface to high level generic classifier routines. - ** Author: Robert Moss - ** History: Wed Feb 13 15:24:15 MST 1991, RWM, Created. + ** Filename: intmatcher.h + ** Purpose: Interface to high level generic classifier routines. + ** Author: Robert Moss + ** History: Wed Feb 13 15:24:15 MST 1991, RWM, Created. ** - ** (c) Copyright Hewlett-Packard Company, 1988. + ** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at diff --git a/classify/intproto.cpp b/classify/intproto.cpp index 4c2f0d95..c20992b7 100644 --- a/classify/intproto.cpp +++ b/classify/intproto.cpp @@ -46,9 +46,7 @@ #include "config_auto.h" #endif -using tesseract::FontInfo; using tesseract::FontSet; -using tesseract::FontSpacingInfo; /* match debug display constants*/ #define PROTO_PRUNER_SCALE (4.0) @@ -326,10 +324,8 @@ int AddIntProto(INT_CLASS Class) { Word < Proto->Configs + WERDS_PER_CONFIG_VEC; *Word++ = 0); return (Index); - } - /** * This routine adds Proto to the class pruning tables * for the specified class in Templates. 
@@ -372,7 +368,6 @@ void AddProtoToClassPruner (PROTO Proto, CLASS_ID ClassId, } } /* AddProtoToClassPruner */ - /** * This routine updates the proto pruner lookup tables * for Class to include a new proto identified by ProtoId @@ -432,7 +427,6 @@ void AddProtoToProtoPruner(PROTO Proto, int ProtoId, FillPPLinearBits(ProtoSet->ProtoPruner[PRUNER_Y], Index, Y, Pad, debug); } /* AddProtoToProtoPruner */ - /** * Returns a quantized bucket for the given param shifted by offset, * notionally (param + offset) * num_buckets, but clipped and casted to the @@ -550,7 +544,6 @@ void Classify::ConvertProto(PROTO Proto, int ProtoId, INT_CLASS Class) { P->A, P->B, P->C, Class->ProtoLengths[ProtoId]); } /* ConvertProto */ - /** * This routine converts from the old floating point format * to the new integer format. @@ -627,7 +620,7 @@ INT_TEMPLATES Classify::CreateIntTemplates(CLASSES FloatProtos, * @note Exceptions: none * @note History: Thu Mar 21 14:45:04 1991, DSJ, Created. */ -void DisplayIntFeature(const INT_FEATURE_STRUCT* Feature, FLOAT32 Evidence) { +void DisplayIntFeature(const INT_FEATURE_STRUCT *Feature, FLOAT32 Evidence) { ScrollView::Color color = GetMatchColorFor(Evidence); RenderIntFeature(IntMatchWindow, Feature, color); if (FeatureDisplayWindow) { @@ -635,7 +628,6 @@ void DisplayIntFeature(const INT_FEATURE_STRUCT* Feature, FLOAT32 Evidence) { } } /* DisplayIntFeature */ - /** * This routine renders the specified proto into a * global display list. @@ -720,7 +712,6 @@ void free_int_class(INT_CLASS int_class) { Efree(int_class); } - /** * This routine allocates a new set of integer templates * initialized to hold 0 classes. @@ -767,9 +758,8 @@ namespace tesseract { * @note Exceptions: none * @note History: Wed Feb 27 11:48:46 1991, DSJ, Created. 
*/ -INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) { +INT_TEMPLATES Classify::ReadIntTemplates(bool swap, TFile *fp) { int i, j, w, x, y, z; - BOOL8 swap; int nread; int unicharset_size; int version_id = 0; @@ -795,29 +785,19 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) { /* first read the high level template struct */ Templates = NewIntTemplates(); // Read Templates in parts for 64 bit compatibility. - if (fread(&unicharset_size, sizeof(int), 1, File) != 1) - cprintf("Bad read of inttemp!\n"); - if (fread(&Templates->NumClasses, - sizeof(Templates->NumClasses), 1, File) != 1 || - fread(&Templates->NumClassPruners, - sizeof(Templates->NumClassPruners), 1, File) != 1) - cprintf("Bad read of inttemp!\n"); - // Swap status is determined automatically. - swap = Templates->NumClassPruners < 0 || - Templates->NumClassPruners > MAX_NUM_CLASS_PRUNERS; - if (swap) { - Reverse32(&Templates->NumClassPruners); - Reverse32(&Templates->NumClasses); - Reverse32(&unicharset_size); - } + if (fp->FReadEndian(&unicharset_size, sizeof(unicharset_size), 1, swap) != 1) + tprintf("Bad read of inttemp!\n"); + if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses), 1, + swap) != 1 || + fp->FReadEndian(&Templates->NumClassPruners, + sizeof(Templates->NumClassPruners), 1, swap) != 1) + tprintf("Bad read of inttemp!\n"); if (Templates->NumClasses < 0) { // This file has a version id! 
version_id = -Templates->NumClasses; - if (fread(&Templates->NumClasses, sizeof(Templates->NumClasses), - 1, File) != 1) - cprintf("Bad read of inttemp!\n"); - if (swap) - Reverse32(&Templates->NumClasses); + if (fp->FReadEndian(&Templates->NumClasses, sizeof(Templates->NumClasses), + 1, swap) != 1) + tprintf("Bad read of inttemp!\n"); } if (version_id < 3) { @@ -826,39 +806,24 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) { } if (version_id < 2) { - for (i = 0; i < unicharset_size; ++i) { - if (fread(&IndexFor[i], sizeof(inT16), 1, File) != 1) - cprintf("Bad read of inttemp!\n"); + if (fp->FReadEndian(IndexFor, sizeof(IndexFor[0]), unicharset_size, swap) != + unicharset_size) { + tprintf("Bad read of inttemp!\n"); } - for (i = 0; i < Templates->NumClasses; ++i) { - if (fread(&ClassIdFor[i], sizeof(CLASS_ID), 1, File) != 1) - cprintf("Bad read of inttemp!\n"); - } - if (swap) { - for (i = 0; i < Templates->NumClasses; i++) - Reverse16(&IndexFor[i]); - for (i = 0; i < Templates->NumClasses; i++) - Reverse32(&ClassIdFor[i]); + if (fp->FReadEndian(ClassIdFor, sizeof(ClassIdFor[0]), + Templates->NumClasses, swap) != Templates->NumClasses) { + tprintf("Bad read of inttemp!\n"); } } /* then read in the class pruners */ + const int kNumBuckets = + NUM_CP_BUCKETS * NUM_CP_BUCKETS * NUM_CP_BUCKETS * WERDS_PER_CP_VECTOR; for (i = 0; i < Templates->NumClassPruners; i++) { Pruner = new CLASS_PRUNER_STRUCT; - if ((nread = - fread(Pruner, 1, sizeof(CLASS_PRUNER_STRUCT), - File)) != sizeof(CLASS_PRUNER_STRUCT)) - cprintf("Bad read of inttemp!\n"); - if (swap) { - for (x = 0; x < NUM_CP_BUCKETS; x++) { - for (y = 0; y < NUM_CP_BUCKETS; y++) { - for (z = 0; z < NUM_CP_BUCKETS; z++) { - for (w = 0; w < WERDS_PER_CP_VECTOR; w++) { - Reverse32(&Pruner->p[x][y][z][w]); - } - } - } - } + if (fp->FReadEndian(Pruner, sizeof(Pruner->p[0][0][0][0]), kNumBuckets, + swap) != kNumBuckets) { + tprintf("Bad read of inttemp!\n"); } if (version_id < 2) { TempClassPruner[i] = Pruner; @@ 
-923,39 +888,24 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) { for (i = 0; i < Templates->NumClasses; i++) { /* first read in the high level struct for the class */ Class = (INT_CLASS) Emalloc (sizeof (INT_CLASS_STRUCT)); - if (fread(&Class->NumProtos, sizeof(Class->NumProtos), 1, File) != 1 || - fread(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1, File) != 1 || - fread(&Class->NumConfigs, sizeof(Class->NumConfigs), 1, File) != 1) - cprintf ("Bad read of inttemp!\n"); + if (fp->FReadEndian(&Class->NumProtos, sizeof(Class->NumProtos), 1, swap) != + 1 || + fp->FRead(&Class->NumProtoSets, sizeof(Class->NumProtoSets), 1) != 1 || + fp->FRead(&Class->NumConfigs, sizeof(Class->NumConfigs), 1) != 1) + tprintf("Bad read of inttemp!\n"); if (version_id == 0) { // Only version 0 writes 5 pointless pointers to the file. for (j = 0; j < 5; ++j) { - int junk; - if (fread(&junk, sizeof(junk), 1, File) != 1) - cprintf ("Bad read of inttemp!\n"); + inT32 junk; + if (fp->FRead(&junk, sizeof(junk), 1) != 1) + tprintf("Bad read of inttemp!\n"); } } - if (version_id < 4) { - for (j = 0; j < MaxNumConfigs; ++j) { - if (fread(&Class->ConfigLengths[j], sizeof(uinT16), 1, File) != 1) - cprintf ("Bad read of inttemp!\n"); - } - if (swap) { - Reverse16(&Class->NumProtos); - for (j = 0; j < MaxNumConfigs; j++) - Reverse16(&Class->ConfigLengths[j]); - } - } else { - ASSERT_HOST(Class->NumConfigs < MaxNumConfigs); - for (j = 0; j < Class->NumConfigs; ++j) { - if (fread(&Class->ConfigLengths[j], sizeof(uinT16), 1, File) != 1) - cprintf ("Bad read of inttemp!\n"); - } - if (swap) { - Reverse16(&Class->NumProtos); - for (j = 0; j < MaxNumConfigs; j++) - Reverse16(&Class->ConfigLengths[j]); - } + int num_configs = version_id < 4 ? 
MaxNumConfigs : Class->NumConfigs; + ASSERT_HOST(num_configs <= MaxNumConfigs); + if (fp->FReadEndian(Class->ConfigLengths, sizeof(uinT16), num_configs, + swap) != num_configs) { + tprintf("Bad read of inttemp!\n"); } if (version_id < 2) { ClassForClassId (Templates, ClassIdFor[i]) = Class; @@ -967,59 +917,41 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) { Lengths = NULL; if (MaxNumIntProtosIn (Class) > 0) { Lengths = (uinT8 *)Emalloc(sizeof(uinT8) * MaxNumIntProtosIn(Class)); - if ((nread = - fread((char *)Lengths, sizeof(uinT8), - MaxNumIntProtosIn(Class), File)) != MaxNumIntProtosIn (Class)) - cprintf ("Bad read of inttemp!\n"); + if (fp->FRead(Lengths, sizeof(uinT8), MaxNumIntProtosIn(Class)) != + MaxNumIntProtosIn(Class)) + tprintf("Bad read of inttemp!\n"); } Class->ProtoLengths = Lengths; /* then read in the proto sets */ for (j = 0; j < Class->NumProtoSets; j++) { ProtoSet = (PROTO_SET)Emalloc(sizeof(PROTO_SET_STRUCT)); - if (version_id < 3) { - if ((nread = - fread((char *) &ProtoSet->ProtoPruner, 1, - sizeof(PROTO_PRUNER), File)) != sizeof(PROTO_PRUNER)) + int num_buckets = NUM_PP_PARAMS * NUM_PP_BUCKETS * WERDS_PER_PP_VECTOR; + if (fp->FReadEndian(&ProtoSet->ProtoPruner, + sizeof(ProtoSet->ProtoPruner[0][0][0]), num_buckets, + swap) != num_buckets) + tprintf("Bad read of inttemp!\n"); + for (x = 0; x < PROTOS_PER_PROTO_SET; x++) { + if (fp->FRead(&ProtoSet->Protos[x].A, sizeof(ProtoSet->Protos[x].A), + 1) != 1 || + fp->FRead(&ProtoSet->Protos[x].B, sizeof(ProtoSet->Protos[x].B), + 1) != 1 || + fp->FRead(&ProtoSet->Protos[x].C, sizeof(ProtoSet->Protos[x].C), + 1) != 1 || + fp->FRead(&ProtoSet->Protos[x].Angle, + sizeof(ProtoSet->Protos[x].Angle), 1) != 1) + tprintf("Bad read of inttemp!\n"); + if (fp->FReadEndian(&ProtoSet->Protos[x].Configs, + sizeof(ProtoSet->Protos[x].Configs[0]), + WerdsPerConfigVec, swap) != WerdsPerConfigVec) cprintf("Bad read of inttemp!\n"); - for (x = 0; x < PROTOS_PER_PROTO_SET; x++) { - if ((nread = fread((char *) 
&ProtoSet->Protos[x].A, 1, - sizeof(inT8), File)) != sizeof(inT8) || - (nread = fread((char *) &ProtoSet->Protos[x].B, 1, - sizeof(uinT8), File)) != sizeof(uinT8) || - (nread = fread((char *) &ProtoSet->Protos[x].C, 1, - sizeof(inT8), File)) != sizeof(inT8) || - (nread = fread((char *) &ProtoSet->Protos[x].Angle, 1, - sizeof(uinT8), File)) != sizeof(uinT8)) - cprintf("Bad read of inttemp!\n"); - for (y = 0; y < WerdsPerConfigVec; y++) - if ((nread = fread((char *) &ProtoSet->Protos[x].Configs[y], 1, - sizeof(uinT32), File)) != sizeof(uinT32)) - cprintf("Bad read of inttemp!\n"); - } - } else { - if ((nread = - fread((char *) ProtoSet, 1, sizeof(PROTO_SET_STRUCT), - File)) != sizeof(PROTO_SET_STRUCT)) - cprintf("Bad read of inttemp!\n"); - } - if (swap) { - for (x = 0; x < NUM_PP_PARAMS; x++) - for (y = 0; y < NUM_PP_BUCKETS; y++) - for (z = 0; z < WERDS_PER_PP_VECTOR; z++) - Reverse32(&ProtoSet->ProtoPruner[x][y][z]); - for (x = 0; x < PROTOS_PER_PROTO_SET; x++) - for (y = 0; y < WerdsPerConfigVec; y++) - Reverse32(&ProtoSet->Protos[x].Configs[y]); } Class->ProtoSets[j] = ProtoSet; } - if (version_id < 4) + if (version_id < 4) { Class->font_set_id = -1; - else { - fread(&Class->font_set_id, sizeof(int), 1, File); - if (swap) - Reverse32(&Class->font_set_id); + } else { + fp->FReadEndian(&Class->font_set_id, sizeof(Class->font_set_id), 1, swap); } } @@ -1046,13 +978,12 @@ INT_TEMPLATES Classify::ReadIntTemplates(FILE *File) { } } if (version_id >= 4) { - this->fontinfo_table_.read(File, NewPermanentTessCallback(read_info), swap); + this->fontinfo_table_.read(fp, NewPermanentTessCallback(read_info), swap); if (version_id >= 5) { - this->fontinfo_table_.read(File, - NewPermanentTessCallback(read_spacing_info), - swap); + this->fontinfo_table_.read( + fp, NewPermanentTessCallback(read_spacing_info), swap); } - this->fontset_table_.read(File, NewPermanentTessCallback(read_set), swap); + this->fontset_table_.read(fp, NewPermanentTessCallback(read_set), swap); } // Clean 
up. @@ -1218,7 +1149,6 @@ FLOAT32 BucketStart(int Bucket, FLOAT32 Offset, int NumBuckets) { } /* BucketStart */ - /** * This routine returns the parameter value which * corresponds to the end of the specified bucket. @@ -1236,7 +1166,6 @@ FLOAT32 BucketEnd(int Bucket, FLOAT32 Offset, int NumBuckets) { return (((FLOAT32) (Bucket + 1) / NumBuckets) - Offset); } /* BucketEnd */ - /** * This routine fills in the section of a class pruner * corresponding to a single x value for a single proto of @@ -1284,7 +1213,6 @@ void DoFill(FILL_SPEC *FillSpec, } } /* DoFill */ - /** * Return TRUE if the specified table filler is done, i.e. * if it has no more lines to fill. @@ -1306,7 +1234,6 @@ BOOL8 FillerDone(TABLE_FILLER *Filler) { } /* FillerDone */ - /** * This routine sets Bit in each bit vector whose * bucket lies within the range Center +- Spread. The fill @@ -1349,7 +1276,6 @@ void FillPPCircularBits(uinT32 ParamTable[NUM_PP_BUCKETS][WERDS_PER_PP_VECTOR], } /* FillPPCircularBits */ - /** * This routine sets Bit in each bit vector whose * bucket lies within the range Center +- Spread. The fill @@ -1516,7 +1442,6 @@ void GetCPPadsForLevel(int Level, } /* GetCPPadsForLevel */ - /** * @param Evidence evidence value to return color for * @return Color which corresponds to specified Evidence value. @@ -1538,7 +1463,6 @@ ScrollView::Color GetMatchColorFor(FLOAT32 Evidence) { return ScrollView::BLUE; } /* GetMatchColorFor */ - /** * This routine returns (in Fill) the specification of * the next line to be filled from Filler. 
FillerDone() should @@ -1589,7 +1513,6 @@ void GetNextFill(TABLE_FILLER *Filler, FILL_SPEC *Fill) { } /* GetNextFill */ - /** * This routine computes a data structure (Filler) * which can be used to fill in a rectangle surrounding @@ -1723,8 +1646,10 @@ void InitTableFiller (FLOAT32 EndPad, FLOAT32 SidePad, /* translate into bucket positions and deltas */ Filler->X = Bucket8For(Start.x, XS, NB); - Filler->StartDelta = -(inT16) ((Sin / Cos) * 256); - Filler->EndDelta = (inT16) ((Cos / Sin) * 256); + Filler->StartDelta = static_cast(ClipToRange( + -IntCastRounded((Sin / Cos) * 256), MIN_INT16, MAX_INT16)); + Filler->EndDelta = static_cast(ClipToRange( + IntCastRounded((Cos / Sin) * 256), MIN_INT16, MAX_INT16)); XAdjust = BucketEnd(Filler->X, XS, NB) - Start.x; YAdjust = XAdjust * Sin / Cos; @@ -1787,7 +1712,6 @@ void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT* Feature, window->DrawTo(X + Dx, Y + Dy); } /* RenderIntFeature */ - /** * This routine extracts the parameters of the specified * proto from the class description and adds a rendering of diff --git a/classify/intproto.h b/classify/intproto.h index d2c07147..262974b8 100644 --- a/classify/intproto.h +++ b/classify/intproto.h @@ -52,7 +52,7 @@ class FCOORD; #define NUM_CP_BUCKETS 24 #define CLASSES_PER_CP 32 #define NUM_BITS_PER_CLASS 2 -#define CLASS_PRUNER_CLASS_MASK (~(~0 << NUM_BITS_PER_CLASS)) +#define CLASS_PRUNER_CLASS_MASK (~(~0u << NUM_BITS_PER_CLASS)) #define CLASSES_PER_CP_WERD (CLASSES_PER_CP / NUM_BITS_PER_CLASS) #define PROTOS_PER_PP_WERD BITS_PER_WERD #define BITS_PER_CP_VECTOR (CLASSES_PER_CP * NUM_BITS_PER_CLASS) diff --git a/classify/kdtree.cpp b/classify/kdtree.cpp index 61a94f66..6ba7086d 100644 --- a/classify/kdtree.cpp +++ b/classify/kdtree.cpp @@ -70,11 +70,11 @@ class MinK { const Element* elements() { return elements_; } private: - const Key max_key_; //< the maximum possible Key - Element* elements_; //< unsorted array of elements + const Key max_key_; //< the maximum 
possible Key + Element *elements_; //< unsorted array of elements int elements_count_; //< the number of results collected so far - int k_; //< the number of results we want from the search - int max_index_; //< the index of the result with the largest key + int k_; //< the number of results we want from the search + int max_index_; //< the index of the result with the largest key }; template @@ -117,7 +117,8 @@ bool MinK::insert(Key key, Value value) { //----------------------------------------------------------------------------- -/** Helper class for searching for the k closest points to query_point in tree. */ +/** Helper class for searching for the k closest points to query_point in tree. + */ class KDTreeSearch { public: KDTreeSearch(KDTREE* tree, FLOAT32 *query_point, int k_closest); @@ -241,14 +242,13 @@ void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data) { *PtrToNode = MakeKDNode(Tree, Key, (void *) Data, Level); } /* KDStore */ - /** - * This routine deletes a node from Tree. The node to be - * deleted is specified by the Key for the node and the Data - * contents of the node. These two pointers must be identical - * to the pointers that were used for the node when it was - * originally stored in the tree. A node will be deleted from - * the tree only if its key and data pointers are identical + * This routine deletes a node from Tree. The node to be + * deleted is specified by the Key for the node and the Data + * contents of the node. These two pointers must be identical + * to the pointers that were used for the node when it was + * originally stored in the tree. A node will be deleted from + * the tree only if its key and data pointers are identical * to Key and Data respectively. The tree is re-formed by removing * the affected subtree and inserting all elements but the root. 
* @@ -298,7 +298,6 @@ KDDelete (KDTREE * Tree, FLOAT32 Key[], void *Data) { } } /* KDDelete */ - /** * This routine searches the K-D tree specified by Tree and * finds the QuerySize nearest neighbors of Query. All neighbors @@ -442,7 +441,7 @@ void KDTreeSearch::SearchRec(int level, KDNODE *sub_tree) { /*---------------------------------------------------------------------------*/ -/** +/** *Returns the Euclidean distance squared between p1 and p2 for all essential * dimensions. * @param k keys are in k-space @@ -541,7 +540,6 @@ void Walk(KDTREE *tree, void_proc action, void *context, Walk(tree, action, context, sub_tree->Right, NextLevel(tree, level)); } - /** Given a subtree nodes, insert all of its elements into tree. */ void InsertNodes(KDTREE *tree, KDNODE *nodes) { if (nodes == NULL) diff --git a/classify/mastertrainer.cpp b/classify/mastertrainer.cpp index ed8967cd..0492c2a7 100644 --- a/classify/mastertrainer.cpp +++ b/classify/mastertrainer.cpp @@ -86,27 +86,6 @@ bool MasterTrainer::Serialize(FILE* fp) const { return true; } -// Reads from the given file. Returns false in case of error. -// If swap is true, assumes a big/little-endian swap is needed. 
-bool MasterTrainer::DeSerialize(bool swap, FILE* fp) { - if (fread(&norm_mode_, sizeof(norm_mode_), 1, fp) != 1) return false; - if (swap) { - ReverseN(&norm_mode_, sizeof(norm_mode_)); - } - if (!unicharset_.load_from_file(fp)) return false; - charsetsize_ = unicharset_.size(); - if (!feature_space_.DeSerialize(swap, fp)) return false; - feature_map_.Init(feature_space_); - if (!samples_.DeSerialize(swap, fp)) return false; - if (!junk_samples_.DeSerialize(swap, fp)) return false; - if (!verify_samples_.DeSerialize(swap, fp)) return false; - if (!master_shapes_.DeSerialize(swap, fp)) return false; - if (!flat_shapes_.DeSerialize(swap, fp)) return false; - if (!fontinfo_table_.DeSerialize(swap, fp)) return false; - if (!xheights_.DeSerialize(swap, fp)) return false; - return true; -} - // Load an initial unicharset, or set one up if the file cannot be read. void MasterTrainer::LoadUnicharset(const char* filename) { if (!unicharset_.load_from_file(filename)) { @@ -214,10 +193,14 @@ void MasterTrainer::AddSample(bool verification, const char* unichar, // Must be called after ReadTrainingSamples, as the current number of images // is used as an offset for page numbers in the samples. 
void MasterTrainer::LoadPageImages(const char* filename) { + size_t offset = 0; int page; Pix* pix; - for (page = 0; (pix = pixReadTiff(filename, page)) != NULL; ++page) { + for (page = 0;; page++) { + pix = pixReadFromMultipageTiff(filename, &offset); + if (!pix) break; page_images_.push_back(pix); + if (!offset) break; } tprintf("Loaded %d page images from %s\n", page, filename); } @@ -362,9 +345,11 @@ bool MasterTrainer::LoadFontInfo(const char* filename) { fontinfo.name = font_name; fontinfo.properties = 0; fontinfo.universal_id = 0; - if (tfscanf(fp, "%1024s %i %i %i %i %i\n", font_name, - &italic, &bold, &fixed, &serif, &fraktur) != 6) + if (tfscanf(fp, "%1024s %i %i %i %i %i\n", font_name, &italic, &bold, + &fixed, &serif, &fraktur) != 6) { + delete[] font_name; continue; + } fontinfo.properties = (italic << 0) + (bold << 1) + @@ -373,6 +358,8 @@ bool MasterTrainer::LoadFontInfo(const char* filename) { (fraktur << 4); if (!fontinfo_table_.contains(fontinfo)) { fontinfo_table_.push_back(fontinfo); + } else { + delete[] font_name; } } fclose(fp); @@ -877,6 +864,7 @@ void MasterTrainer::ReplaceFragmentedSamples() { if (good_ch != INVALID_UNICHAR_ID) good_junk[good_ch] = true; // We want this one. } + delete frag; } #endif // For now just use all the junk that was from natural fragments. 
@@ -891,6 +879,7 @@ void MasterTrainer::ReplaceFragmentedSamples() { junk_samples_.extract_sample(s); samples_.AddSample(frag_set.id_to_unichar(junk_id), sample); } + delete frag; } junk_samples_.DeleteDeadSamples(); junk_samples_.OrganizeByFontAndClass(); diff --git a/classify/mastertrainer.h b/classify/mastertrainer.h index 8cc7158a..80837c94 100644 --- a/classify/mastertrainer.h +++ b/classify/mastertrainer.h @@ -19,8 +19,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_TRAINING_MASTERTRAINER_H__ -#define TESSERACT_TRAINING_MASTERTRAINER_H__ +#ifndef TESSERACT_TRAINING_MASTERTRAINER_H_ +#define TESSERACT_TRAINING_MASTERTRAINER_H_ /**---------------------------------------------------------------------------- Include Files and Type Defines @@ -74,9 +74,6 @@ class MasterTrainer { // Writes to the given file. Returns false in case of error. bool Serialize(FILE* fp) const; - // Reads from the given file. Returns false in case of error. - // If swap is true, assumes a big/little-endian swap is needed. - bool DeSerialize(bool swap, FILE* fp); // Loads an initial unicharset, or sets one up if the file cannot be read. void LoadUnicharset(const char* filename); @@ -309,4 +306,4 @@ class MasterTrainer { } // namespace tesseract. -#endif +#endif // TESSERACT_TRAINING_MASTERTRAINER_H_ diff --git a/classify/mf.cpp b/classify/mf.cpp index d0c59487..37cd2eca 100644 --- a/classify/mf.cpp +++ b/classify/mf.cpp @@ -1,10 +1,10 @@ /****************************************************************************** - ** Filename: mf.c - ** Purpose: Micro-feature interface to flexible feature extractor. - ** Author: Dan Johnson - ** History: Thu May 24 09:08:38 1990, DSJ, Created. + ** Filename: mf.c + ** Purpose: Micro-feature interface to flexible feature extractor. + ** Author: Dan Johnson + ** History: Thu May 24 09:08:38 1990, DSJ, Created. ** - ** (c) Copyright Hewlett-Packard Company, 1988. 
+ ** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at @@ -36,7 +36,7 @@ * Call the old micro-feature extractor and then copy * the features into the new format. Then deallocate the * old micro-features. - * @param Blob blob to extract micro-features from + * @param Blob blob to extract micro-features from * @param cn_denorm control parameter to feature extractor. * @return Micro-features for Blob. * @note Exceptions: none diff --git a/classify/mfdefs.cpp b/classify/mfdefs.cpp index abe8d0c7..0f225e8b 100644 --- a/classify/mfdefs.cpp +++ b/classify/mfdefs.cpp @@ -1,10 +1,10 @@ /****************************************************************************** - ** Filename: mfdefs.c - ** Purpose: Basic routines for manipulating micro-features - ** Author: Dan Johnson - ** History: Mon Jan 22 08:48:58 1990, DSJ, Created. + ** Filename: mfdefs.c + ** Purpose: Basic routines for manipulating micro-features + ** Author: Dan Johnson + ** History: Mon Jan 22 08:48:58 1990, DSJ, Created. ** - ** (c) Copyright Hewlett-Packard Company, 1988. + ** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at @@ -32,7 +32,7 @@ * @return New MICROFEATURE * @note History: 7/27/89, DSJ, Created. */ -MICROFEATURE NewMicroFeature() { +MICROFEATURE NewMicroFeature() { return ((MICROFEATURE) Emalloc (sizeof (MFBLOCK))); } /* NewMicroFeature */ @@ -41,10 +41,10 @@ MICROFEATURE NewMicroFeature() { /** * This routine deallocates all of the memory consumed by * a list of micro-features. - * @param MicroFeatures list of micro-features to be freed + * @param MicroFeatures list of micro-features to be freed * @return none * @note History: 7/27/89, DSJ, Created. 
*/ -void FreeMicroFeatures(MICROFEATURES MicroFeatures) { +void FreeMicroFeatures(MICROFEATURES MicroFeatures) { destroy_nodes(MicroFeatures, Efree); } /* FreeMicroFeatures */ diff --git a/classify/mfoutline.cpp b/classify/mfoutline.cpp index 511c34d4..59593a85 100644 --- a/classify/mfoutline.cpp +++ b/classify/mfoutline.cpp @@ -35,7 +35,8 @@ ----------------------------------------------------------------------------*/ /*---------------------------------------------------------------------------*/ -/** Convert a blob into a list of MFOUTLINEs (float-based microfeature format). */ +/** Convert a blob into a list of MFOUTLINEs (float-based microfeature format). + */ LIST ConvertBlob(TBLOB *blob) { LIST outlines = NIL_LIST; return (blob == NULL) @@ -344,7 +345,6 @@ void ChangeDirection(MFOUTLINE Start, MFOUTLINE End, DIRECTION Direction) { } /* ChangeDirection */ - /** * This routine normalizes each point in Outline by * translating it to the specified center and scaling it @@ -378,7 +378,6 @@ void CharNormalizeOutline(MFOUTLINE Outline, const DENORM& cn_denorm) { } /* CharNormalizeOutline */ - /** * This routine computes the slope from Start to Finish and * and then computes the approximate direction of the line diff --git a/classify/mfx.cpp b/classify/mfx.cpp index 3da4fb3d..6fd8ed5d 100644 --- a/classify/mfx.cpp +++ b/classify/mfx.cpp @@ -128,7 +128,6 @@ FLOAT32 ComputeOrientation(MFEDGEPT *Start, MFEDGEPT *End) { return (Orientation); } /* ComputeOrientation */ - /** * Convert Outline to MicroFeatures * @param Outline outline to extract micro-features from @@ -164,7 +163,6 @@ MICROFEATURES ConvertToMicroFeatures(MFOUTLINE Outline, return (MicroFeatures); } /* ConvertToMicroFeatures */ - /** * This routine computes the feature parameters which describe * the micro-feature that starts and Start and ends at End. @@ -178,7 +176,7 @@ MICROFEATURES ConvertToMicroFeatures(MFOUTLINE Outline, * @return New micro-feature or NULL if the feature was rejected. 
* @note Globals: none * @note Exceptions: none - * @note History: + * @note History: * - 7/26/89, DSJ, Created. * - 11/17/89, DSJ, Added handling for Start and End same point. */ diff --git a/classify/mfx.h b/classify/mfx.h index 05ce29ce..5ed006dc 100644 --- a/classify/mfx.h +++ b/classify/mfx.h @@ -1,10 +1,10 @@ /****************************************************************************** - ** Filename: mfx.h - ** Purpose: Definition of micro-feature extraction routines - ** Author: Dan Johnson - ** History: 5/29/89, DSJ, Created. + ** Filename: mfx.h + ** Purpose: Definition of micro-feature extraction routines + ** Author: Dan Johnson + ** History: 5/29/89, DSJ, Created. ** - ** (c) Copyright Hewlett-Packard Company, 1988. + ** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at diff --git a/classify/normfeat.cpp b/classify/normfeat.cpp index a4ac672a..f297b3b0 100644 --- a/classify/normfeat.cpp +++ b/classify/normfeat.cpp @@ -1,10 +1,10 @@ /****************************************************************************** - ** Filename: normfeat.c - ** Purpose: Definition of char normalization features. - ** Author: Dan Johnson - ** History: 12/14/90, DSJ, Created. + ** Filename: normfeat.c + ** Purpose: Definition of char normalization features. + ** Author: Dan Johnson + ** History: 12/14/90, DSJ, Created. ** - ** (c) Copyright Hewlett-Packard Company, 1988. + ** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at @@ -33,7 +33,6 @@ FLOAT32 ActualOutlineLength(FEATURE Feature) { return (Feature->Params[CharNormLength] * LENGTH_COMPRESSION); } - /** * Return the character normalization feature for a blob. 
* diff --git a/classify/normmatch.cpp b/classify/normmatch.cpp index 488cd165..a88facee 100644 --- a/classify/normmatch.cpp +++ b/classify/normmatch.cpp @@ -1,10 +1,10 @@ /****************************************************************************** - ** Filename: normmatch.c - ** Purpose: Simple matcher based on character normalization features. - ** Author: Dan Johnson - ** History: Wed Dec 19 16:18:06 1990, DSJ, Created. + ** Filename: normmatch.c + ** Purpose: Simple matcher based on character normalization features. + ** Author: Dan Johnson + ** History: Wed Dec 19 16:18:06 1990, DSJ, Created. ** - ** (c) Copyright Hewlett-Packard Company, 1988. + ** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at @@ -197,10 +197,10 @@ double NormEvidenceOf(register double NormAdj) { /*---------------------------------------------------------------------------*/ /** * This routine dumps out detailed normalization match info. - * @param File open text file to dump match debug info to - * @param NumParams # of parameters in proto and feature - * @param Proto[] array of prototype parameters - * @param Feature[] array of feature parameters + * @param File open text file to dump match debug info to + * @param NumParams # of parameters in proto and feature + * @param Proto[] array of prototype parameters + * @param Feature[] array of feature parameters * Globals: none * @return none * @note Exceptions: none @@ -242,7 +242,7 @@ namespace tesseract { * @note Exceptions: none * @note History: Wed Dec 19 16:38:49 1990, DSJ, Created. 
*/ -NORM_PROTOS *Classify::ReadNormProtos(FILE *File, inT64 end_offset) { +NORM_PROTOS *Classify::ReadNormProtos(TFile *fp) { NORM_PROTOS *NormProtos; int i; char unichar[2 * UNICHAR_LEN + 1]; @@ -258,26 +258,26 @@ NORM_PROTOS *Classify::ReadNormProtos(FILE *File, inT64 end_offset) { NormProtos->Protos[i] = NIL_LIST; /* read file header and save in data structure */ - NormProtos->NumParams = ReadSampleSize (File); - NormProtos->ParamDesc = ReadParamDesc (File, NormProtos->NumParams); + NormProtos->NumParams = ReadSampleSize(fp); + NormProtos->ParamDesc = ReadParamDesc(fp, NormProtos->NumParams); /* read protos for each class into a separate list */ - while ((end_offset < 0 || ftell(File) < end_offset) && - tfscanf(File, "%s %d", unichar, &NumProtos) == 2) { + const int kMaxLineSize = 100; + char line[kMaxLineSize]; + while (fp->FGets(line, kMaxLineSize) != nullptr) { + if (sscanf(line, "%s %d", unichar, &NumProtos) != 2) continue; if (unicharset.contains_unichar(unichar)) { unichar_id = unicharset.unichar_to_id(unichar); Protos = NormProtos->Protos[unichar_id]; for (i = 0; i < NumProtos; i++) - Protos = - push_last (Protos, ReadPrototype (File, NormProtos->NumParams)); + Protos = push_last(Protos, ReadPrototype(fp, NormProtos->NumParams)); NormProtos->Protos[unichar_id] = Protos; } else { - cprintf("Error: unichar %s in normproto file is not in unichar set.\n", + tprintf("Error: unichar %s in normproto file is not in unichar set.\n", unichar); for (i = 0; i < NumProtos; i++) - FreePrototype(ReadPrototype (File, NormProtos->NumParams)); + FreePrototype(ReadPrototype(fp, NormProtos->NumParams)); } - SkipNewline(File); } return (NormProtos); } /* ReadNormProtos */ diff --git a/classify/ocrfeatures.cpp b/classify/ocrfeatures.cpp index 0895ed08..7df81350 100644 --- a/classify/ocrfeatures.cpp +++ b/classify/ocrfeatures.cpp @@ -1,10 +1,10 @@ /****************************************************************************** - ** Filename: features.c - ** Purpose: Generic 
definition of a feature. - ** Author: Dan Johnson - ** History: Mon May 21 10:49:04 1990, DSJ, Created. + ** Filename: features.c + ** Purpose: Generic definition of a feature. + ** Author: Dan Johnson + ** History: Mon May 21 10:49:04 1990, DSJ, Created. ** - ** (c) Copyright Hewlett-Packard Company, 1988. + ** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at @@ -66,12 +66,11 @@ void FreeFeature(FEATURE Feature) { } /* FreeFeature */ - /** * Release the memory consumed by the specified feature * set. This routine also frees the memory consumed by the * features contained in the set. - * @param FeatureSet set of features to be freed + * @param FeatureSet set of features to be freed * @return none * @note History: Mon May 21 13:59:46 1990, DSJ, Created. */ @@ -85,11 +84,10 @@ void FreeFeatureSet(FEATURE_SET FeatureSet) { } } /* FreeFeatureSet */ - /** * Allocate and return a new feature of the specified * type. - * @param FeatureDesc description of feature to be created. + * @param FeatureDesc description of feature to be created. * @return New #FEATURE. * @note History: Mon May 21 14:06:42 1990, DSJ, Created. */ @@ -105,11 +103,10 @@ FEATURE NewFeature(const FEATURE_DESC_STRUCT* FeatureDesc) { } /* NewFeature */ - /** * Allocate and return a new feature set large enough to * hold the specified number of features. - * @param NumFeatures maximum # of features to be put in feature set + * @param NumFeatures maximum # of features to be put in feature set * @return New #FEATURE_SET. * @note History: Mon May 21 14:22:40 1990, DSJ, Created. */ @@ -124,7 +121,6 @@ FEATURE_SET NewFeatureSet(int NumFeatures) { } /* NewFeatureSet */ - /** * Create a new feature of the specified type and read in * the value of its parameters from File. 
The extra penalty @@ -135,10 +131,11 @@ FEATURE_SET NewFeatureSet(int NumFeatures) { * @param File open text file to read feature from * @param FeatureDesc specifies type of feature to read from File * @return New #FEATURE read from File. - * @note Exceptions: #ILLEGAL_FEATURE_PARAM if text file doesn't match expected format + * @note Exceptions: #ILLEGAL_FEATURE_PARAM if text file doesn't match expected + * format * @note History: Wed May 23 08:53:16 1990, DSJ, Created. */ -FEATURE ReadFeature(FILE *File, const FEATURE_DESC_STRUCT* FeatureDesc) { +FEATURE ReadFeature(FILE* File, const FEATURE_DESC_STRUCT* FeatureDesc) { FEATURE Feature; int i; @@ -153,7 +150,6 @@ FEATURE ReadFeature(FILE *File, const FEATURE_DESC_STRUCT* FeatureDesc) { return (Feature); } /* ReadFeature */ - /** * Create a new feature set of the specified type and read in * the features from File. The correct text representation @@ -165,7 +161,7 @@ FEATURE ReadFeature(FILE *File, const FEATURE_DESC_STRUCT* FeatureDesc) { * @return New feature set read from File. * @note History: Wed May 23 09:17:31 1990, DSJ, Created. */ -FEATURE_SET ReadFeatureSet(FILE *File, const FEATURE_DESC_STRUCT* FeatureDesc) { +FEATURE_SET ReadFeatureSet(FILE* File, const FEATURE_DESC_STRUCT* FeatureDesc) { FEATURE_SET FeatureSet; int NumFeatures; int i; @@ -180,7 +176,6 @@ FEATURE_SET ReadFeatureSet(FILE *File, const FEATURE_DESC_STRUCT* FeatureDesc) { return (FeatureSet); } /* ReadFeatureSet */ - /** * Appends a textual representation of Feature to str. * This representation is simply a list of the N parameters @@ -203,7 +198,6 @@ void WriteFeature(FEATURE Feature, STRING* str) { *str += "\n"; } /* WriteFeature */ - /** * Write a textual representation of FeatureSet to File. 
* This representation is an integer specifying the number of @@ -224,7 +218,6 @@ void WriteFeatureSet(FEATURE_SET FeatureSet, STRING* str) { } } /* WriteFeatureSet */ - /** * Write a textual representation of FeatureDesc to File * in the old format (i.e. the format used by the clusterer). @@ -240,7 +233,7 @@ void WriteFeatureSet(FEATURE_SET FeatureSet, STRING* str) { * @return none * @note History: Fri May 25 15:27:18 1990, DSJ, Created. */ -void WriteOldParamDesc(FILE *File, const FEATURE_DESC_STRUCT* FeatureDesc) { +void WriteOldParamDesc(FILE* File, const FEATURE_DESC_STRUCT* FeatureDesc) { int i; fprintf (File, "%d\n", FeatureDesc->NumParams); diff --git a/classify/outfeat.cpp b/classify/outfeat.cpp index b1a4a9be..76597f7c 100644 --- a/classify/outfeat.cpp +++ b/classify/outfeat.cpp @@ -1,10 +1,10 @@ /****************************************************************************** - ** Filename: outfeat.c - ** Purpose: Definition of outline-features. - ** Author: Dan Johnson - ** History: 11/13/90, DSJ, Created. + ** Filename: outfeat.c + ** Purpose: Definition of outline-features. + ** Author: Dan Johnson + ** History: 11/13/90, DSJ, Created. ** - ** (c) Copyright Hewlett-Packard Company, 1988. + ** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at @@ -40,7 +40,7 @@ namespace tesseract { * @return Outline-features for Blob. * @note Globals: none * @note Exceptions: none - * @note History: + * @note History: * - 11/13/90, DSJ, Created. * - 05/24/91, DSJ, Updated for either char or baseline normalize. */ @@ -115,7 +115,7 @@ void AddOutlineFeatureToSet(FPOINT *Start, * @return none (results are returned in FeatureSet) * @note Globals: none * @note Exceptions: none - * @note History: + * @note History: * - 11/13/90, DSJ, Created. * - 5/24/91, DSJ, Added hidden edge capability. 
*/ diff --git a/classify/picofeat.cpp b/classify/picofeat.cpp index 74beb18f..a4a39263 100644 --- a/classify/picofeat.cpp +++ b/classify/picofeat.cpp @@ -1,10 +1,10 @@ /****************************************************************************** - ** Filename: picofeat.c - ** Purpose: Definition of pico-features. - ** Author: Dan Johnson - ** History: 9/4/90, DSJ, Created. + ** Filename: picofeat.c + ** Purpose: Definition of pico-features. + ** Author: Dan Johnson + ** History: 9/4/90, DSJ, Created. ** - ** (c) Copyright Hewlett-Packard Company, 1988. + ** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at @@ -98,7 +98,7 @@ FEATURE_SET Classify::ExtractPicoFeatures(TBLOB *Blob) { * nearest whole number of pico-features. The pico-features * are spaced evenly over the entire segment. * Globals: - * - classify_pico_feature_length length of a single pico-feature + * - classify_pico_feature_length length of a single pico-feature * @param Start starting point of pico-feature * @param End ending point of pico-feature * @param FeatureSet set to add pico-feature to diff --git a/classify/picofeat.h b/classify/picofeat.h index 208b7e77..966ffc32 100644 --- a/classify/picofeat.h +++ b/classify/picofeat.h @@ -61,5 +61,5 @@ extern double_VAR_H(classify_pico_feature_length, 0.05, "Pico Feature Length"); /**---------------------------------------------------------------------------- Global Data Definitions and Declarations ----------------------------------------------------------------------------**/ -extern FLOAT32 PicoFeatureLength; +extern TESS_API FLOAT32 PicoFeatureLength; #endif diff --git a/classify/shapeclassifier.cpp b/classify/shapeclassifier.cpp index a39c8a24..e0ee3373 100644 --- a/classify/shapeclassifier.cpp +++ b/classify/shapeclassifier.cpp @@ -176,7 +176,7 @@ void 
ShapeClassifier::UnicharPrintResults( for (int i = 0; i < results.size(); ++i) { tprintf("%g: c_id=%d=%s", results[i].rating, results[i].unichar_id, GetUnicharset().id_to_unichar(results[i].unichar_id)); - if (results[i].fonts.size() != 0) { + if (!results[i].fonts.empty()) { tprintf(" Font Vector:"); for (int f = 0; f < results[i].fonts.size(); ++f) { tprintf(" %d", results[i].fonts[f].fontinfo_id); diff --git a/classify/shapetable.cpp b/classify/shapetable.cpp index 0800860b..24e26d8c 100644 --- a/classify/shapetable.cpp +++ b/classify/shapetable.cpp @@ -71,10 +71,9 @@ bool UnicharAndFonts::Serialize(FILE* fp) const { } // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. -bool UnicharAndFonts::DeSerialize(bool swap, FILE* fp) { - if (fread(&unichar_id, sizeof(unichar_id), 1, fp) != 1) return false; - if (swap) - ReverseN(&unichar_id, sizeof(unichar_id)); +bool UnicharAndFonts::DeSerialize(bool swap, TFile* fp) { + if (fp->FReadEndian(&unichar_id, sizeof(unichar_id), 1, swap) != 1) + return false; if (!font_ids.DeSerialize(swap, fp)) return false; return true; } @@ -96,10 +95,9 @@ bool Shape::Serialize(FILE* fp) const { } // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. -bool Shape::DeSerialize(bool swap, FILE* fp) { +bool Shape::DeSerialize(bool swap, TFile* fp) { uinT8 sorted; - if (fread(&sorted, sizeof(sorted), 1, fp) != 1) - return false; + if (fp->FRead(&sorted, sizeof(sorted), 1) != 1) return false; unichars_sorted_ = sorted != 0; if (!unichars_.DeSerializeClasses(swap, fp)) return false; return true; @@ -253,7 +251,7 @@ bool ShapeTable::Serialize(FILE* fp) const { } // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. 
-bool ShapeTable::DeSerialize(bool swap, FILE* fp) { +bool ShapeTable::DeSerialize(bool swap, TFile* fp) { if (!shape_table_.DeSerialize(swap, fp)) return false; num_fonts_ = 0; return true; diff --git a/classify/shapetable.h b/classify/shapetable.h index d8faae88..2dc3bee6 100644 --- a/classify/shapetable.h +++ b/classify/shapetable.h @@ -168,7 +168,7 @@ struct UnicharAndFonts { bool Serialize(FILE* fp) const; // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. - bool DeSerialize(bool swap, FILE* fp); + bool DeSerialize(bool swap, TFile* fp); // Sort function to sort a pair of UnicharAndFonts by unichar_id. static int SortByUnicharId(const void* v1, const void* v2); @@ -191,7 +191,7 @@ class Shape { bool Serialize(FILE* fp) const; // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. - bool DeSerialize(bool swap, FILE* fp); + bool DeSerialize(bool swap, TFile* fp); int destination_index() const { return destination_index_; @@ -272,7 +272,7 @@ class ShapeTable { bool Serialize(FILE* fp) const; // Reads from the given file. Returns false in case of error. // If swap is true, assumes a big/little-endian swap is needed. - bool DeSerialize(bool swap, FILE* fp); + bool DeSerialize(bool swap, TFile* fp); // Accessors. int NumShapes() const { diff --git a/classify/trainingsample.cpp b/classify/trainingsample.cpp index 7fe83b77..ee6c9d7f 100644 --- a/classify/trainingsample.cpp +++ b/classify/trainingsample.cpp @@ -209,7 +209,7 @@ void TrainingSample::ExtractCharDesc(int int_feature_type, int geo_type, CHAR_DESC_STRUCT* char_desc) { // Extract the INT features. 
- if (features_ != NULL) delete [] features_; + delete[] features_; FEATURE_SET_STRUCT* char_features = char_desc->FeatureSets[int_feature_type]; if (char_features == NULL) { tprintf("Error: no features to train on of type %s\n", @@ -230,7 +230,7 @@ void TrainingSample::ExtractCharDesc(int int_feature_type, } } // Extract the Micro features. - if (micro_features_ != NULL) delete [] micro_features_; + delete[] micro_features_; char_features = char_desc->FeatureSets[micro_type]; if (char_features == NULL) { tprintf("Error: no features to train on of type %s\n", diff --git a/classify/trainingsample.h b/classify/trainingsample.h index 6df1ce82..251e33b6 100644 --- a/classify/trainingsample.h +++ b/classify/trainingsample.h @@ -13,8 +13,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_TRAINING_TRAININGSAMPLE_H__ -#define TESSERACT_TRAINING_TRAININGSAMPLE_H__ +#ifndef TESSERACT_TRAINING_TRAININGSAMPLE_H_ +#define TESSERACT_TRAINING_TRAININGSAMPLE_H_ #include "elst.h" #include "featdefs.h" @@ -247,4 +247,4 @@ ELISTIZEH(TrainingSample) } // namespace tesseract -#endif // TESSERACT_TRAINING_TRAININGSAMPLE_H__ +#endif // TESSERACT_TRAINING_TRAININGSAMPLE_H_ diff --git a/classify/trainingsampleset.cpp b/classify/trainingsampleset.cpp index afbf3f42..2531e576 100644 --- a/classify/trainingsampleset.cpp +++ b/classify/trainingsampleset.cpp @@ -96,10 +96,8 @@ bool TrainingSampleSet::DeSerialize(bool swap, FILE* fp) { num_raw_samples_ = samples_.size(); if (!unicharset_.load_from_file(fp)) return false; if (!font_id_map_.DeSerialize(swap, fp)) return false; - if (font_class_array_ != NULL) { - delete font_class_array_; - font_class_array_ = NULL; - } + delete font_class_array_; + font_class_array_ = NULL; inT8 not_null; if (fread(¬_null, sizeof(not_null), 1, fp) != 1) return false; if (not_null) { @@ -489,81 +487,6 @@ void TrainingSampleSet::IndexFeatures(const IntFeatureSpace& feature_space) { 
samples_[s]->IndexFeatures(feature_space); } -// Delete outlier samples with few features that are shared with others. -// IndexFeatures must have been called already. -void TrainingSampleSet::DeleteOutliers(const IntFeatureSpace& feature_space, - bool debug) { - if (font_class_array_ == NULL) - OrganizeByFontAndClass(); - Pixa* pixa = NULL; - if (debug) - pixa = pixaCreate(0); - GenericVector feature_counts; - int fs_size = feature_space.Size(); - int font_size = font_id_map_.CompactSize(); - for (int font_index = 0; font_index < font_size; ++font_index) { - for (int c = 0; c < unicharset_size_; ++c) { - // Create a histogram of the features used by all samples of this - // font/class combination. - feature_counts.init_to_size(fs_size, 0); - FontClassInfo& fcinfo = (*font_class_array_)(font_index, c); - int sample_count = fcinfo.samples.size(); - if (sample_count < kMinOutlierSamples) - continue; - for (int i = 0; i < sample_count; ++i) { - int s = fcinfo.samples[i]; - const GenericVector& features = samples_[s]->indexed_features(); - for (int f = 0; f < features.size(); ++f) { - ++feature_counts[features[f]]; - } - } - for (int i = 0; i < sample_count; ++i) { - int s = fcinfo.samples[i]; - const TrainingSample& sample = *samples_[s]; - const GenericVector& features = sample.indexed_features(); - // A feature that has a histogram count of 1 is only used by this - // sample, making it 'bad'. All others are 'good'. - int good_features = 0; - int bad_features = 0; - for (int f = 0; f < features.size(); ++f) { - if (feature_counts[features[f]] > 1) - ++good_features; - else - ++bad_features; - } - // If more than 1/3 features are bad, then this is an outlier. 
- if (bad_features * 2 > good_features) { - tprintf("Deleting outlier sample of %s, %d good, %d bad\n", - SampleToString(sample).string(), - good_features, bad_features); - if (debug) { - pixaAddPix(pixa, sample.RenderToPix(&unicharset_), L_INSERT); - // Add the previous sample as well, so it is easier to see in - // the output what is wrong with this sample. - int t; - if (i == 0) - t = fcinfo.samples[1]; - else - t = fcinfo.samples[i - 1]; - const TrainingSample &csample = *samples_[t]; - pixaAddPix(pixa, csample.RenderToPix(&unicharset_), L_INSERT); - } - // Mark the sample for deletion. - KillSample(samples_[s]); - } - } - } - } - // Truly delete all bad samples and renumber everything. - DeleteDeadSamples(); - if (pixa != NULL) { - Pix* pix = pixaDisplayTiledInRows(pixa, 1, 2600, 1.0, 0, 10, 10); - pixaDestroy(&pixa); - pixWrite("outliers.png", pix, IFF_PNG); - pixDestroy(&pix); - } -} - // Marks the given sample index for deletion. // Deletion is actually completed by DeleteDeadSamples. void TrainingSampleSet::KillSample(TrainingSample* sample) { @@ -584,22 +507,6 @@ bool TrainingSampleSet::DeleteableSample(const TrainingSample* sample) { return sample == NULL || sample->class_id() < 0; } -static Pix* DebugSample(const UNICHARSET& unicharset, - TrainingSample* sample) { - tprintf("\nOriginal features:\n"); - for (int i = 0; i < sample->num_features(); ++i) { - sample->features()[i].print(); - } - if (sample->features_are_mapped()) { - tprintf("\nMapped features:\n"); - for (int i = 0; i < sample->mapped_features().size(); ++i) { - tprintf("%d ", sample->mapped_features()[i]); - } - tprintf("\n"); - } - return sample->RenderToPix(&unicharset); -} - // Construct an array to access the samples by font,class pair. 
void TrainingSampleSet::OrganizeByFontAndClass() { // Font indexes are sparse, so we used a map to compact them, so we can @@ -747,12 +654,6 @@ void TrainingSampleSet::ComputeCanonicalSamples(const IntFeatureMap& map, if (debug) { tprintf("Global worst dist = %g, between sample %d and %d\n", global_worst_dist, worst_s1, worst_s2); - Pix* pix1 = DebugSample(unicharset_, samples_[worst_s1]); - Pix* pix2 = DebugSample(unicharset_, samples_[worst_s2]); - pixOr(pix1, pix1, pix2); - pixWrite("worstpair.png", pix1, IFF_PNG); - pixDestroy(&pix1); - pixDestroy(&pix2); } } diff --git a/classify/trainingsampleset.h b/classify/trainingsampleset.h index ad92c6c4..f48ad5ff 100644 --- a/classify/trainingsampleset.h +++ b/classify/trainingsampleset.h @@ -13,8 +13,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_TRAINING_TRAININGSAMPLESET_H__ -#define TESSERACT_TRAINING_TRAININGSAMPLESET_H__ +#ifndef TESSERACT_TRAINING_TRAININGSAMPLESET_H_ +#define TESSERACT_TRAINING_TRAININGSAMPLESET_H_ #include "bitvector.h" #include "genericvector.h" @@ -171,10 +171,6 @@ class TrainingSampleSet { // Generates indexed features for all samples with the supplied feature_space. void IndexFeatures(const IntFeatureSpace& feature_space); - // Delete outlier samples with few features that are shared with others. - // IndexFeatures must have been called already. - void DeleteOutliers(const IntFeatureSpace& feature_space, bool debug); - // Marks the given sample for deletion. // Deletion is actually completed by DeleteDeadSamples. void KillSample(TrainingSample* sample); diff --git a/cmake/BuildFunctions.cmake b/cmake/BuildFunctions.cmake index eea5a396..39fd6d70 100644 --- a/cmake/BuildFunctions.cmake +++ b/cmake/BuildFunctions.cmake @@ -1,3 +1,12 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. ################################################################################ # # macros and functions diff --git a/cmake/Configure.cmake b/cmake/Configure.cmake index d7f4ac6a..bd5b80c5 100644 --- a/cmake/Configure.cmake +++ b/cmake/Configure.cmake @@ -1,3 +1,12 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. ################################################################################ # # configure diff --git a/cmake/FindICU.cmake b/cmake/FindICU.cmake index cd6bf926..8381c2eb 100644 --- a/cmake/FindICU.cmake +++ b/cmake/FindICU.cmake @@ -1,3 +1,12 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. # This module can find the International Components for Unicode (ICU) Library # # Requirements: diff --git a/cmake/SourceGroups.cmake b/cmake/SourceGroups.cmake index ca87e808..4bc6e3fa 100644 --- a/cmake/SourceGroups.cmake +++ b/cmake/SourceGroups.cmake @@ -1,3 +1,12 @@ +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. #include(SourceGroups) set(SSRC ${CMAKE_SOURCE_DIR}) @@ -14,14 +23,14 @@ set(H_CPP "(${H}|${CPP})") source_group("Resource files" ".*\\.(rc|ico)") source_group("api" "${SSRC}/api/${H_CPP}") +source_group("arch" "${SSRC}/arch/${H_CPP}") source_group("ccmain" "${SSRC}/ccmain/${H_CPP}") source_group("ccstruct" "${SSRC}/ccstruct/${H_CPP}") source_group("ccutil" "${SSRC}/ccutil/${H_CPP}") source_group("classify" "${SSRC}/classify/${H_CPP}") -source_group("cube" "${SSRC}/cube/${H_CPP}") source_group("cutil" "${SSRC}/cutil/${H_CPP}") source_group("dict" "${SSRC}/dict/${H_CPP}") -source_group("neural" "${SSRC}/neural_networks/runtime/${H_CPP}") +source_group("lstm" "${SSRC}/lstm/${H_CPP}") source_group("opencl" "${SSRC}/opencl/${H_CPP}") source_group("textord" "${SSRC}/textord/${H_CPP}") source_group("viewer" "${SSRC}/viewer/${H_CPP}") diff --git a/configure.ac b/configure.ac index e0e4cf3a..8951b0aa 100644 --- a/configure.ac +++ b/configure.ac @@ -5,8 +5,8 @@ # ---------------------------------------- # Initialization # ---------------------------------------- 
-AC_PREREQ([2.50]) -AC_INIT([tesseract], [3.05.00dev], [https://github.com/tesseract-ocr/tesseract/issues]) +AC_PREREQ([2.59]) +AC_INIT([tesseract], [4.00.00dev], [https://github.com/tesseract-ocr/tesseract/issues]) AC_PROG_CXX([g++ clang++]) AC_LANG([C++]) AC_LANG_COMPILER_REQUIRE @@ -18,8 +18,8 @@ AC_PREFIX_DEFAULT([/usr/local]) # Define date of package, etc. Could be useful in auto-generated # documentation. -PACKAGE_YEAR=2015 -PACKAGE_DATE="07/11" +PACKAGE_YEAR=2016 +PACKAGE_DATE="11/11" abs_top_srcdir=`AS_DIRNAME([$0])` gitrev="`git --git-dir=${abs_top_srcdir}/.git --work-tree=${abs_top_srcdir} describe --always --tags`" @@ -42,8 +42,8 @@ AC_SUBST([PACKAGE_DATE]) GENERIC_LIBRARY_NAME=tesseract # Release versioning -GENERIC_MAJOR_VERSION=3 -GENERIC_MINOR_VERSION=4 +GENERIC_MAJOR_VERSION=4 +GENERIC_MINOR_VERSION=0 GENERIC_MICRO_VERSION=0 # API version (often = GENERIC_MAJOR_VERSION.GENERIC_MINOR_VERSION) @@ -75,9 +75,10 @@ AM_CONDITIONAL([T_WIN], false) AM_CONDITIONAL([MINGW], false) AM_CONDITIONAL([OSX], false) AM_CONDITIONAL([GRAPHICS_DISABLED], false) +AC_SUBST([AM_CPPFLAGS]) OPENCL_INC="/opt/AMDAPP/include" -OPENCL_LIBS="-lOpenCL" +OPENCL_LIBS="-lOpenCL -ltiff" ############################# # # Platform specific setup @@ -114,10 +115,24 @@ case "${host_os}" in ;; esac +## Checks for supported compiler options. 
+AM_CONDITIONAL([AVX_OPT], false) +AM_CONDITIONAL([SSE41_OPT], false) + +AX_CHECK_COMPILE_FLAG([-mavx], [avx=true], [avx=false]) +if $avx; then + AM_CONDITIONAL([AVX_OPT], true) +fi + +AX_CHECK_COMPILE_FLAG([-msse4.1], [sse41=true], [sse41=false]) +if $sse41; then + AM_CONDITIONAL([SSE41_OPT], true) +fi + includedir="${includedir}/tesseract" AC_ARG_WITH([extra-includes], - [AC_HELP_STRING([--with-extra-includes=DIR], + [AS_HELP_STRING([--with-extra-includes=DIR], [Define an additional directory for include files])], [if test -d "$withval" ; then CFLAGS="$CFLAGS -I$withval" @@ -126,7 +141,7 @@ AC_ARG_WITH([extra-includes], fi]) AC_ARG_WITH([extra-libraries], - [AC_HELP_STRING([--with-extra-libraries=DIR], + [AS_HELP_STRING([--with-extra-libraries=DIR], [Define an additional directory for library files])], [if test -d "$withval" ; then LDFLAGS="$LDFLAGS -L$withval" @@ -136,8 +151,8 @@ AC_ARG_WITH([extra-libraries], AC_MSG_CHECKING([--enable-graphics argument]) AC_ARG_ENABLE([graphics], - [AC_HELP_STRING([--enable-graphics],[enable graphics (ScrollView) (default)]) -AC_HELP_STRING([--disable-graphics],[disable graphics (ScrollView)])], + [AS_HELP_STRING([--enable-graphics],[enable graphics (ScrollView) (default)]) +AS_HELP_STRING([--disable-graphics],[disable graphics (ScrollView)])], [enable_graphics=$enableval], [enable_graphics="yes"]) AC_MSG_RESULT([$enable_graphics]) @@ -146,17 +161,6 @@ if test "$enable_graphics" = "no"; then AM_CONDITIONAL([GRAPHICS_DISABLED], true) fi -# Check if cube should be disabled -AC_MSG_CHECKING([whether to disable cube]) -AC_ARG_ENABLE([cube], - [AC_HELP_STRING([--disable-cube], [don't build cube support (experimental)])], - [disable_cube="yes"], [disable_cube="no"]) -AC_MSG_RESULT([$disable_cube]) -AM_CONDITIONAL([NO_CUBE_BUILD], [test "$disable_cube" = "yes"]) -if test "$disable_cube" = "yes"; then - AC_SUBST([AM_CPPFLAGS], [-DNO_CUBE_BUILD]) -fi - # check whether to build embedded version AC_MSG_CHECKING([--enable-embedded 
argument]) AC_ARG_ENABLE([embedded], @@ -166,18 +170,11 @@ AC_ARG_ENABLE([embedded], AC_MSG_RESULT([$enable_embedded]) AM_CONDITIONAL([EMBEDDED], [test "$enable_embedded" = "yes"]) if test "$enable_embedded" = "yes"; then - AC_SUBST([AM_CPPFLAGS], [-DEMBEDDED]) + AM_CPPFLAGS="-DEMBEDDED $AM_CPPFLAGS" fi # check whether to build OpenMP support -AM_CONDITIONAL([OPENMP], false) AC_OPENMP -AS_IF([test "x$OPENMP_CFLAGS" != "x"], - [AM_CONDITIONAL([OPENMP], true) - AC_SUBST([AM_CPPFLAGS], ["$OPENMP_CXXFLAGS"]) - AC_DEFINE([OPENMP], [], [Defined when compiled with OpenMP support])] -) - # check whether to build opencl version AC_MSG_CHECKING([--enable-opencl argument]) @@ -227,9 +224,9 @@ case "${host_os}" in if !($have_opencl_lib); then AC_MSG_ERROR([Required OpenCL library not found!]) fi - AC_SUBST([AM_CPPFLAGS], [-DUSE_OPENCL]) + AM_CPPFLAGS="-DUSE_OPENCL $AM_CPPFLAGS" OPENCL_CPPFLAGS="" - OPENCL_LDFLAGS="-framework OpenCL" + OPENCL_LDFLAGS="-framework OpenCL -ltiff" fi ;; *) @@ -246,7 +243,7 @@ case "${host_os}" in if !($have_tiff); then AC_MSG_ERROR([Required TIFF headers not found! Try to install libtiff-dev?? 
package.]) fi - AC_SUBST([AM_CPPFLAGS], [-DUSE_OPENCL]) + AM_CPPFLAGS="-DUSE_OPENCL $AM_CPPFLAGS" OPENCL_CPPFLAGS="-I${OPENCL_INC}" OPENCL_LDFLAGS="${OPENCL_LIBS}" fi @@ -261,7 +258,7 @@ AC_SUBST([OPENCL_LDFLAGS]) # http://groups.google.com/group/tesseract-dev/browse_thread/thread/976645ae98189127 AC_MSG_CHECKING([--enable-visibility argument]) AC_ARG_ENABLE([visibility], - [AC_HELP_STRING([--enable-visibility],[enable experimental build with fvisibility (default=no)])], + [AS_HELP_STRING([--enable-visibility],[enable experimental build with fvisibility (default=no)])], [enable_visibility=$enableval], [enable_visibility="no"]) AC_MSG_RESULT([$enable_visibility]) @@ -270,7 +267,7 @@ AM_CONDITIONAL([VISIBILITY], [test "$enable_visibility" = "yes"]) # check whether to build multiple libraries AC_MSG_CHECKING([--enable-multiple-libraries argument]) AC_ARG_ENABLE([multiple-libraries], - [AC_HELP_STRING([--enable-multiple-libraries],[enable multiple libraries (default=no)])], + [AS_HELP_STRING([--enable-multiple-libraries],[enable multiple libraries (default=no)])], [enable_mlibs=$enableval], [enable_mlibs="no"]) AC_MSG_RESULT([$enable_mlibs]) @@ -279,7 +276,7 @@ AM_CONDITIONAL([USING_MULTIPLELIBS], [test "$enable_mlibs" = "yes"]) # Check if tessdata-prefix is disabled AC_MSG_CHECKING([whether to use tessdata-prefix]) AC_ARG_ENABLE([tessdata-prefix], - [AC_HELP_STRING([--disable-tessdata-prefix], + [AS_HELP_STRING([--disable-tessdata-prefix], [don't set TESSDATA-PREFIX during compile])], [tessdata_prefix="no"], [tessdata_prefix="yes"]) AC_MSG_RESULT([$tessdata_prefix]) @@ -288,7 +285,7 @@ AM_CONDITIONAL([NO_TESSDATA_PREFIX], [test "$tessdata_prefix" = "no"]) # Check whether enable debuging AC_MSG_CHECKING([whether to enable debugging]) AC_ARG_ENABLE([debug], - [AC_HELP_STRING([--enable-debug], + [AS_HELP_STRING([--enable-debug], [turn on debugging (default=no)])], [debug=$enableval], [debug="no"]) @@ -373,15 +370,35 @@ AC_COMPILE_IFELSE( ]]) ], [ AC_MSG_RESULT(yes) - 
has_cpp11=yes ], [ AC_MSG_RESULT(no) - has_cpp11=no + AC_MSG_ERROR([Your compiler does not have the necessary c++11 support! Cannot proceed.]) ]) AC_CHECK_FUNCS([snprintf],, [snprintfworks=yes]) CXXFLAGS="$OLD_CXXFLAGS" + +# set c++11 support based on platform/compiler +case "${host_os}" in + cygwin*) + CXXFLAGS="$CXXFLAGS -std=gnu++11" + ;; + *-darwin* | *-macos10*) + if test "x$CLANG" = "xyes"; then + CXXFLAGS="$CXXFLAGS -std=c++11 " + LDFLAGS="$LDFLAGS -stdlib=libc++" + else + CXXFLAGS="$CXXFLAGS -std=c++11" + fi + ;; + *) + # default + CXXFLAGS="$CXXFLAGS -std=c++11" + ;; +esac + + # ---------------------------------------- # Check for libraries # ---------------------------------------- @@ -420,53 +437,27 @@ AC_CHECK_TYPES([mbstate_t],,, [#include "wchar.h"]) # Test auxiliary packages # ---------------------------------------- -# Check location of leptonica/liblept headers. -AC_MSG_CHECKING([for leptonica]) -AC_ARG_VAR([LIBLEPT_HEADERSDIR], [Leptonica headers directory]) - -have_lept=no -if test "$LIBLEPT_HEADERSDIR" = "" ; then - LIBLEPT_HEADERSDIR="/usr/local/include /usr/include /opt/local/include/leptonica" -fi -for incd in $LIBLEPT_HEADERSDIR -do - for lept in . leptonica liblept - do - if test -r "$incd/$lept/allheaders.h" ; then - CPPFLAGS="$CPPFLAGS -I$incd/$lept" - have_lept=yes - fi - done -done - -if test "$have_lept" = yes ; then - AC_MSG_RESULT(yes) - AC_CHECK_LIB([lept], [l_generateCIDataForPdf], [], - [AC_MSG_ERROR([leptonica library with pdf support (>= 1.71) is missing])]) +PKG_CHECK_MODULES([LEPTONICA], [lept >= 1.74], [have_lept=true], [have_lept=false]) +if $have_lept; then + CPPFLAGS="$CPPFLAGS $LEPTONICA_CFLAGS" else - AC_MSG_ERROR([leptonica not found]) + AC_MSG_ERROR([Leptonica 1.74 or higher is required. 
Try to install libleptonica-dev package.]) fi -AC_MSG_CHECKING([leptonica headers version >= 1.71]) -AC_PREPROC_IFELSE( - [AC_LANG_PROGRAM([#include "allheaders.h"], -[#if (LIBLEPT_MAJOR_VERSION >= 1) && (LIBLEPT_MINOR_VERSION >= 71) -int i = 0; -#else -#error You need to upgrade your leptonica library! -#endif])], - [AC_MSG_RESULT(yes)], - [AC_MSG_FAILURE([leptonica 1.71 or higher is required])]) - AM_CONDITIONAL([ENABLE_TRAINING], true) # Check location of icu headers -have_icu=false -AC_CHECK_HEADERS([unicode/uchar.h], [have_icu=true], [have_icu=false]) -if !($have_icu); then +PKG_CHECK_MODULES([ICU_UC], [icu-uc], [have_icu_uc=true], [have_icu_uc=false]) +PKG_CHECK_MODULES([ICU_I18N], [icu-i18n], [have_icu_i18n=true], [have_icu_i18n=false]) +if !($have_icu_uc && $have_icu_i18n); then + AC_CHECK_HEADERS([unicode/uchar.h], [have_icu=true], [have_icu=false]) + if !($have_icu); then AC_MSG_WARN([Training tools WILL NOT be built because of missing icu library.]) AC_MSG_WARN([Try to install libicu-devel package.]) AM_CONDITIONAL([ENABLE_TRAINING], false) + else + ICU_UC_LIBS="-licui18n -licuuc" + fi fi # Check location of pango headers @@ -489,29 +480,6 @@ else CPPFLAGS="$CPPFLAGS $cairo_CFLAGS" fi -# set c++11 support based on platform/compiler -if test "x$has_cpp11" = "xyes"; then - case "${host_os}" in - cygwin*) - CXXFLAGS="$CXXFLAGS -std=gnu++11" - ;; - *-darwin* | *-macos10*) - if test "x$CLANG" = "xyes"; then - CXXFLAGS="$CXXFLAGS -std=c++11 " - LDFLAGS="$LDFLAGS -stdlib=libc++" - else - CXXFLAGS="$CXXFLAGS -std=c++11" - fi - ;; - *) - # default - CXXFLAGS="$CXXFLAGS -std=c++11" - ;; - esac -else - AC_MSG_WARN([Training tools WILL NOT be built because of missing c++11 support.]) - AM_CONDITIONAL([ENABLE_TRAINING], false) -fi # ---------------------------------------- # Final Tasks and Output @@ -520,15 +488,15 @@ fi # Output files AC_CONFIG_FILES([Makefile tesseract.pc]) AC_CONFIG_FILES([api/Makefile]) +AC_CONFIG_FILES([arch/Makefile]) 
AC_CONFIG_FILES([ccmain/Makefile]) AC_CONFIG_FILES([opencl/Makefile]) AC_CONFIG_FILES([ccstruct/Makefile]) AC_CONFIG_FILES([ccutil/Makefile]) AC_CONFIG_FILES([classify/Makefile]) -AC_CONFIG_FILES([cube/Makefile]) AC_CONFIG_FILES([cutil/Makefile]) AC_CONFIG_FILES([dict/Makefile]) -AC_CONFIG_FILES([neural_networks/runtime/Makefile]) +AC_CONFIG_FILES([lstm/Makefile]) AC_CONFIG_FILES([textord/Makefile]) AC_CONFIG_FILES([viewer/Makefile]) AC_CONFIG_FILES([wordrec/Makefile]) diff --git a/contrib/genlangdata.pl b/contrib/genlangdata.pl new file mode 100644 index 00000000..53e3431e --- /dev/null +++ b/contrib/genlangdata.pl @@ -0,0 +1,264 @@ +#!/usr/bin/perl + +use warnings; +use strict; +use utf8; + +use Getopt::Std; + +=pod + +=head1 NAME + +genwordlists.pl - generate word lists for Tesseract + +=head1 SYNOPSIS + +genwordlists.pl -i large_text_file -d outdir -p lang + +=head1 DESCRIPTION + + genwordlists.pl -i large_text_file -d outdir -p lang + +Creates 4 files in C: F, +F, F, and +F, which (when sorted) can be used with +C for Tesseract's language data. + +The script can also run as a filter. Given a set of files created +by WikiExtractor (L), +use: + + find WikiExtractor -type f | while read i; do \ + pfx=$(echo $i|tr '/' '_'); cat $i | \ + perl genwordlists.pl -d OUTDIR -p $pfx; done + +This will create a set of output files to match each of the files +WikiExtractor created. 
+ +To combine these files: + + for i in word.bigrams.unsorted word.numbers.unsorted \ + word.punc.unsorted wordlist.unsorted; do \ + find OUTDIR -name "*$i" -exec cat '{}' \; |\ + perl -CS -ane 'BEGIN{my %c=();} chomp; + my($a,$b)=split/\t/;if(defined $c{$a}){$c{$a}+=$b} + else {$c{$a} = $b;} END{while(my($k,$v)=each %c) + {print "$v\t$k\n";}}'|sort -nr > tmp.$i ;done + +Followed by: + + for i in word.punc.unsorted word.bigrams.unsorted \ + word.numbers.unsorted;do cat tmp.$i \ + awk -F'\t' '{print $2 "\t" $1}' > real.$i ; done + cat tmp.wordlist.unsorted | awk -F'\t' '{print $2}' \ + > real.wordlist.unsorted + +Note that, although the langdata repository contains the +counts of each item in most of the punctuation, number, and +bigram files, these files must be filtered to only contain +the first column, otherwise C will fail to write +the output file. + +=head1 CAVEATS + +The format of the output files, and how the data are extracted, +is based only on staring at the input files and taking a guess. +They may be wildly inaccurate. + +The only part I can say for certain is correct is that digits +are replaced with '?' in the .numbers wordlist. (See F +in the Tesseract source). + +=head1 COPYRIGHT + +Copyright 2014 Jim O'Regan + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + +L + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+ +=head1 SEE ALSO + +L + +=cut + +# I haven't looked into this too much +my %lig = ( + # Longest first + 'ffi' => 'ffi', + 'ct' => "\N{U+E003}", + 'ff' => 'ff', + 'fi' => 'fi', + 'fl' => 'fl', + 'st' => 'st', +); + +my %punct; +my %num; +my %bigrams; +my %opts; +my %words; + +my $do_ligatures = 0; + +getopts("hli:p:d:", \%opts); + +if (defined $opts{h}) { + print "Usage: genwordlists [options]\n"; + print "-h\tPrints a brief help message\n"; + print "-d\tSet the output directory (default is current)\n"; + print "-b\tSet the prefix for the language data (e.g., eng for English)\n"; + print "-l\tProcess ligatures\n"; + print "-i\tSet the input file. If not set, reads from stdin\n"; + exit; +} + +if (defined $opts{l}) { + $do_ligatures = 1; +} + +my $prefix = ''; +if (!defined $opts{p}) { + print "Prefix (-p) must be set!\n"; + exit; +} else { + if (defined $opts{d}) { + $prefix = $opts{d}; + $prefix =~ s/\/$//; + $prefix .= '/'; + } + $prefix .= $opts{p}; + # Easiest is to drop it, if present, and readd + $prefix =~ s/\.$//; + $prefix .= "."; +} + +my $input; +if (defined $opts{i}) { + open ($input, "<", $opts{i}) or die $!; +#} elsif ($#ARGV > 0) { +# open ($input, "<", $ARGV[0]) or die $!; +} else { + $input = *STDIN; +} +binmode $input, ":utf8"; + +while (<$input>) { + chomp; + tr/\t/ /; + + next if (/^ 0) { + my @first = @bitoksraw; + my $discard = shift @bitoksraw; + for (my $j = 0; $j != $#first; $j++) { + if ($bitoksraw[$j] ne '' && $first[$j] ne '') { + my $tok = $first[$j] . " " . 
$bitoksraw[$j]; + #Not keeping count of these, but this can be useful for trimming + if(defined($bigrams{$tok})) { + $bigrams{$tok}++; + } else { + $bigrams{$tok} = 1; + } + if($do_ligatures == 1) { + my $other = do_lig($tok); + if ($other ne $tok) { + if(defined($bigrams{$other})) { + $bigrams{$other}++; + } else { + $bigrams{$other} = 1; + } + } + } + } + } + } + my @wordl = grep { !/[0-9 \p{Punct}]/ } split (/[ \p{Punct}]+/); + if ($#wordl >= 0) { + for my $word (@wordl) { + if (defined $words{$word}) { + $words{$word}++; + } else { + $words{$word} = 1; + } + } + } +} + +if (defined $opts{i}) { + close $input; +} + +open(BIGRAMS, ">", "${prefix}word.bigrams.unsorted"); +binmode BIGRAMS, ":utf8"; +while (my($k, $v) = each %bigrams) { + print BIGRAMS "$k\t$v\n"; +} +close BIGRAMS; +%bigrams = (); + +open(PUNCT, ">", "${prefix}word.punc.unsorted"); +binmode PUNCT, ":utf8"; +while (my($k, $v) = each %punct) { + print PUNCT "$k\t$v\n"; +} +close PUNCT; +%punct = (); + +open(NUMS, ">", "${prefix}word.numbers.unsorted"); +binmode NUMS, ":utf8"; +while (my($k, $v) = each %num) { + print NUMS "$k\t$v\n"; +} +close NUMS; +%num = (); + +open(WORDS, ">", "${prefix}wordlist.unsorted"); +binmode WORDS, ":utf8"; +while (my($k, $v) = each %words) { + print WORDS "$k\t$v\n"; +} +close WORDS; +%words = (); + +sub do_lig { + my $word = shift; + while (my($k, $v) = each %lig) { + $word =~ s/$k/$v/g; + } + $word; +} diff --git a/cppan.yml b/cppan.yml index 8065da79..99d1b6c4 100644 --- a/cppan.yml +++ b/cppan.yml @@ -1,124 +1,270 @@ local_settings: - cppan_dir: cppan + #use_shared_libs: true + #generator: Visual Studio 14 2015 Win64 + silent: false + #copy_import_libs: true + build: + c_flags: /W0 + cxx_flags: /W0 + + dependencies: + pvt.cppan.demo.danbloomberg.leptonica: 1 + pvt.cppan.demo.unicode.icu.i18n: "*" + +root_project: pvt.cppan.demo.google.tesseract + +common_settings: + c++: 11 + +projects: + libtesseract: + type: lib + export_all_symbols: true + files: + - api/.*\.cpp + 
- arch/.*\.cpp + - ccmain/.*\.cpp + - ccstruct/.*\.cpp + - ccutil/.*\.cpp + - classify/.*\.cpp + - cutil/.*\.cpp + - dict/.*\.cpp + - lstm/.*\.cpp + - opencl/.*\.cpp + - textord/.*\.cpp + - viewer/.*\.cpp + - wordrec/.*\.cpp + + - api/.*\.h + - arch/.*\.h + - ccmain/.*\.h + - ccstruct/.*\.h + - ccutil/.*\.h + - classify/.*\.h + - cutil/.*\.h + - dict/.*\.h + - lstm/.*\.h + - opencl/.*\.h + - textord/.*\.h + - viewer/.*\.h + - wordrec/.*\.h + + - vs2010/port/.* + + exclude_from_build: + - api/tesseractmain.cpp + - viewer/svpaint.cpp + + include_directories: + public: + #private: + - arch + - classify + - cutil + - ccutil + - dict + - lstm + - opencl + - textord + - vs2010/port + - viewer + - wordrec + #public: + - api + - ccmain + - ccstruct + - ccutil + + check_function_exists: + - getline + + check_symbol_exists: + snprintf: stdio.h + + check_include_exists: + - dlfcn.h + - inttypes.h + - limits.h + - malloc.h + - memory.h + - stdbool.h + - stdint.h + - stdlib.h + - strings.h + - string.h + - sys/ipc.h + - sys/shm.h + - sys/stat.h + - sys/types.h + - sys/wait.h + - tiffio.h + - unistd.h + + check_type_size: + - long long int + - off_t + - mbstate_t + - wchar_t + - _Bool + + pre_sources: | + file_write_once(${BDIR}/config_auto.h "") + + post_sources: | + if (WIN32) + set_source_files_properties( + ${SDIR}/arch/dotproductsse.cpp + PROPERTIES COMPILE_DEFINITIONS __SSE4_1__) + if (MSVC) + set_source_files_properties( + ${SDIR}/arch/dotproductavx.cpp + PROPERTIES COMPILE_FLAGS "/arch:AVX") + endif() + else() + remove_src_dir(vs2010/port/*) + endif() + + options: + any: + definitions: + public: + - HAVE_CONFIG_H + - _SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS=1 + - USE_STD_NAMESPACE=1 + - WINDLLNAME="tesseract" + shared: + definitions: + private: + - TESS_EXPORTS + interface: + - TESS_IMPORTS + + dependencies: + pvt.cppan.demo.danbloomberg.leptonica: 1 + + tesseract: + files: api/tesseractmain.cpp + dependencies: + - libtesseract + + tessopt: + type: lib + static_only: 
true + files: training/tessopt.* + include_directories: training + dependencies: libtesseract + + common_training: + type: lib + static_only: true + files: + - training/commandlineflags.cpp + - training/commandlineflags.h + - training/commontraining.cpp + - training/commontraining.h + include_directories: training + dependencies: + - tessopt + + ambiguous_words: + files: training/ambiguous_words.cpp + dependencies: + - libtesseract + + classifier_tester: + files: training/classifier_tester.cpp + dependencies: common_training + + combine_tessdata: + files: training/combine_tessdata.cpp + dependencies: libtesseract + + cntraining: + files: training/cntraining.cpp + dependencies: common_training + + dawg2wordlist: + files: training/dawg2wordlist.cpp + dependencies: libtesseract + + mftraining: + files: + - training/mftraining.cpp + - training/mergenf.* + dependencies: common_training + + shapeclustering: + files: training/shapeclustering.cpp + dependencies: common_training + + unicharset_extractor: + files: training/unicharset_extractor.cpp + dependencies: tessopt + + wordlist2dawg: + files: training/wordlist2dawg.cpp + dependencies: libtesseract + + unicharset_training: + type: lib + static_only: true + files: + - training/fileio.* + - training/icuerrorcode.h + - training/lstmtester.* + - training/normstrngs.* + - training/unicharset_training_utils.* + include_directories: training + dependencies: + - common_training + - pvt.cppan.demo.unicode.icu.i18n + + lstmeval: + files: training/lstmeval.cpp + dependencies: unicharset_training + + lstmtraining: + files: training/lstmtraining.cpp + dependencies: unicharset_training + + set_unicharset_properties: + files: training/set_unicharset_properties.cpp + dependencies: unicharset_training + + text2image: + files: + - training/text2image.cpp + - training/boxchar.cpp + - training/boxchar.h + - training/degradeimage.cpp + - training/degradeimage.h + - training/ligature_table.cpp + - training/ligature_table.h + - 
training/normstrngs.cpp + - training/normstrngs.h + - training/pango_font_info.cpp + - training/pango_font_info.h + - training/stringrenderer.cpp + - training/stringrenderer.h + - training/tlog.cpp + - training/tlog.h + - training/util.h + - training/icuerrorcode.h + + dependencies: + - unicharset_training + - pvt.cppan.demo.gnome.pango.pangocairo: 1 + + -files: - - api/.*\.cpp - - ccmain/.*\.cpp - - ccstruct/.*\.cpp - - ccutil/.*\.cpp - - classify/.*\.cpp - - cube/.*\.cpp - - cutil/.*\.cpp - - dict/.*\.cpp - - neural_networks/runtime/.*\.cpp - - opencl/.*\.cpp - - textord/.*\.cpp - - viewer/.*\.cpp - - wordrec/.*\.cpp - - api/.*\.h - - ccmain/.*\.h - - ccstruct/.*\.h - - ccutil/.*\.h - - classify/.*\.h - - cube/.*\.h - - cutil/.*\.h - - dict/.*\.h - - neural_networks/runtime/.*\.h - - opencl/.*\.h - - textord/.*\.h - - viewer/.*\.h - - wordrec/.*\.h - - vs2010/port/.* -include_directories: - private: - - classify - - cube - - cutil - - dict - - neural_networks/runtime - - opencl - - textord - - vs2010/port - - viewer - - wordrec - public: - - api - - ccmain - - ccstruct - - ccutil -check_function_exists: - - getline -check_symbol_exists: - snprintf: stdio.h -check_include_exists: - - dlfcn.h - - inttypes.h - - limits.h - - malloc.h - - memory.h - - stdbool.h - - stdint.h - - stdlib.h - - strings.h - - string.h - - sys/ipc.h - - sys/shm.h - - sys/stat.h - - sys/types.h - - sys/wait.h - - tiffio.h - - unistd.h - - cairo/cairo-version.h - - CL/cl.h - - OpenCL/cl.h - - pango-1.0/pango/pango-features.h - - unicode/uchar.h -check_type_size: - - long long int - - off_t - - mbstate_t - - wchar_t - - _Bool -pre_sources: | - # dummy config file - if (NOT EXISTS ${CMAKE_CURRENT_BINARY_DIR}/config_auto.h) - file(WRITE ${CMAKE_CURRENT_BINARY_DIR}/config_auto.h) - endif() -post_sources: | - if (NOT WIN32) - list(REMOVE_ITEM src "${CMAKE_CURRENT_SOURCE_DIR}/vs2010/port/gettimeofday.cpp") - list(REMOVE_ITEM src "${CMAKE_CURRENT_SOURCE_DIR}/vs2010/port/gettimeofday.h") - 
list(REMOVE_ITEM src "${CMAKE_CURRENT_SOURCE_DIR}/vs2010/port/mathfix.h") - list(REMOVE_ITEM src "${CMAKE_CURRENT_SOURCE_DIR}/vs2010/port/strcasestr.cpp") - list(REMOVE_ITEM src "${CMAKE_CURRENT_SOURCE_DIR}/vs2010/port/strcasestr.h") - list(REMOVE_ITEM src "${CMAKE_CURRENT_SOURCE_DIR}/vs2010/port/strtok_r.cpp") - list(REMOVE_ITEM src "${CMAKE_CURRENT_SOURCE_DIR}/vs2010/port/strtok_r.h") - list(REMOVE_ITEM src "${CMAKE_CURRENT_SOURCE_DIR}/vs2010/port/vcsversion.h") - endif() -options: - any: - definitions: - public: - - HAVE_CONFIG_H - - _SILENCE_STDEXT_HASH_DEPRECATION_WARNINGS=1 - - USE_STD_NAMESPACE=1 - - WINDLLNAME="tesseract" - shared: - definitions: - public: TESS_EXPORTS -dependencies: - private: - # tesseract uses leptonica only internally - # and does not expose its interface to users - pvt.cppan.demo.leptonica: master diff --git a/cube/Makefile.am b/cube/Makefile.am deleted file mode 100644 index b551d33f..00000000 --- a/cube/Makefile.am +++ /dev/null @@ -1,55 +0,0 @@ -AM_CPPFLAGS += \ - -DUSE_STD_NAMESPACE \ - -I$(top_srcdir)/cutil -I$(top_srcdir)/ccutil \ - -I$(top_srcdir)/ccstruct -I$(top_srcdir)/dict \ - -I$(top_srcdir)/ccmain -I$(top_srcdir)/classify \ - -I$(top_srcdir)/textord -I$(top_srcdir)/wordrec \ - -I$(top_srcdir)/neural_networks/runtime \ - -I$(top_srcdir)/viewer - -if VISIBILITY -AM_CPPFLAGS += -DTESS_EXPORTS \ - -fvisibility=hidden -fvisibility-inlines-hidden -endif - -noinst_HEADERS = \ - altlist.h beam_search.h bmp_8.h cached_file.h \ - char_altlist.h char_bigrams.h char_samp.h char_samp_enum.h \ - char_samp_set.h char_set.h classifier_base.h classifier_factory.h \ - con_comp.h cube_const.h conv_net_classifier.h cube_line_object.h \ - cube_line_segmenter.h cube_object.h cube_search_object.h \ - cube_tuning_params.h cube_utils.h feature_base.h feature_bmp.h \ - feature_chebyshev.h feature_hybrid.h hybrid_neural_net_classifier.h \ - lang_mod_edge.h lang_model.h search_column.h search_node.h \ - search_object.h string_32.h 
tess_lang_mod_edge.h tess_lang_model.h \ - tuning_params.h word_altlist.h word_list_lang_model.h word_size_model.h \ - word_unigrams.h - -if !USING_MULTIPLELIBS -noinst_LTLIBRARIES = libtesseract_cube.la -else -lib_LTLIBRARIES = libtesseract_cube.la -libtesseract_cube_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION) -libtesseract_cube_la_LIBADD = \ - ../ccstruct/libtesseract_ccstruct.la \ - ../ccutil/libtesseract_ccutil.la \ - ../neural_networks/runtime/libtesseract_neural.la \ - ../viewer/libtesseract_viewer.la \ - ../wordrec/libtesseract_wordrec.la \ - ../cutil/libtesseract_cutil.la \ - ../classify/libtesseract_classify.la \ - ../dict/libtesseract_dict.la -endif - -libtesseract_cube_la_SOURCES = \ - altlist.cpp beam_search.cpp bmp_8.cpp cached_file.cpp \ - char_altlist.cpp char_bigrams.cpp char_samp.cpp char_samp_enum.cpp \ - char_samp_set.cpp char_set.cpp classifier_factory.cpp \ - con_comp.cpp conv_net_classifier.cpp cube_line_object.cpp \ - cube_line_segmenter.cpp cube_object.cpp cube_search_object.cpp \ - cube_tuning_params.cpp cube_utils.cpp feature_bmp.cpp \ - feature_chebyshev.cpp feature_hybrid.cpp hybrid_neural_net_classifier.cpp \ - search_column.cpp search_node.cpp \ - tess_lang_mod_edge.cpp tess_lang_model.cpp \ - word_altlist.cpp word_list_lang_model.cpp word_size_model.cpp \ - word_unigrams.cpp diff --git a/cube/altlist.cpp b/cube/altlist.cpp deleted file mode 100644 index b96796ee..00000000 --- a/cube/altlist.cpp +++ /dev/null @@ -1,60 +0,0 @@ -/********************************************************************** - * File: alt_list.cpp - * Description: Class to abstarct a list of alternate results - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include "altlist.h" -#include - -namespace tesseract { - -AltList::AltList(int max_alt) { - max_alt_ = max_alt; - alt_cnt_ = 0; - alt_cost_ = NULL; - alt_tag_ = NULL; -} - -AltList::~AltList() { - if (alt_cost_ != NULL) { - delete []alt_cost_; - alt_cost_ = NULL; - } - - if (alt_tag_ != NULL) { - delete []alt_tag_; - alt_tag_ = NULL; - } -} - -// return the best possible cost and index of corresponding alternate -int AltList::BestCost(int *best_alt) const { - if (alt_cnt_ <= 0) { - (*best_alt) = -1; - return -1; - } - - int best_alt_idx = 0; - for (int alt_idx = 1; alt_idx < alt_cnt_; alt_idx++) { - if (alt_cost_[alt_idx] < alt_cost_[best_alt_idx]) { - best_alt_idx = alt_idx; - } - } - (*best_alt) = best_alt_idx; - return alt_cost_[best_alt_idx]; -} -} diff --git a/cube/altlist.h b/cube/altlist.h deleted file mode 100644 index 3aebf313..00000000 --- a/cube/altlist.h +++ /dev/null @@ -1,61 +0,0 @@ -/********************************************************************** - * File: alt_list.h - * Description: Class to abstarct a list of alternate results - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The AltList class is the base class for the list of alternate recognition -// results. Each alternate has a cost an an optional tag associated with it - -#ifndef ALT_LIST_H -#define ALT_LIST_H - -#include -#include "cube_utils.h" - -namespace tesseract { -class AltList { - public: - explicit AltList(int max_alt); - virtual ~AltList(); - // sort the list of alternates based - virtual void Sort() = 0; - // return the best possible cost and index of corresponding alternate - int BestCost (int *best_alt) const; - // return the count of alternates - inline int AltCount() const { return alt_cnt_; } - // returns the cost (-ve log prob) of an alternate - inline int AltCost(int alt_idx) const { return alt_cost_[alt_idx]; } - // returns the prob of an alternate - inline double AltProb(int alt_idx) const { - return CubeUtils::Cost2Prob(AltCost(alt_idx)); - } - // returns the alternate tag - inline void *AltTag(int alt_idx) const { return alt_tag_[alt_idx]; } - - protected: - // max number of alternates the list can hold - int max_alt_; - // actual alternate count - int alt_cnt_; - // array of alternate costs - int *alt_cost_; - // array of alternate tags - void **alt_tag_; -}; -} - -#endif // ALT_LIST_H diff --git a/cube/beam_search.cpp b/cube/beam_search.cpp deleted file mode 100644 index fd17a1d5..00000000 --- a/cube/beam_search.cpp +++ /dev/null @@ -1,487 +0,0 @@ -/********************************************************************** - * File: beam_search.cpp - * 
Description: Class to implement Beam Word Search Algorithm - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include - -#include "beam_search.h" -#include "tesseractclass.h" - -namespace tesseract { - -BeamSearch::BeamSearch(CubeRecoContext *cntxt, bool word_mode) { - cntxt_ = cntxt; - seg_pt_cnt_ = 0; - col_cnt_ = 1; - col_ = NULL; - word_mode_ = word_mode; -} - -// Cleanup the lattice corresponding to the last search -void BeamSearch::Cleanup() { - if (col_ != NULL) { - for (int col = 0; col < col_cnt_; col++) { - if (col_[col]) - delete col_[col]; - } - delete []col_; - } - col_ = NULL; -} - -BeamSearch::~BeamSearch() { - Cleanup(); -} - -// Creates a set of children nodes emerging from a parent node based on -// the character alternate list and the language model. 
-void BeamSearch::CreateChildren(SearchColumn *out_col, LangModel *lang_mod, - SearchNode *parent_node, - LangModEdge *lm_parent_edge, - CharAltList *char_alt_list, int extra_cost) { - // get all the edges from this parent - int edge_cnt; - LangModEdge **lm_edges = lang_mod->GetEdges(char_alt_list, - lm_parent_edge, &edge_cnt); - if (lm_edges) { - // add them to the ending column with the appropriate parent - for (int edge = 0; edge < edge_cnt; edge++) { - // add a node to the column if the current column is not the - // last one, or if the lang model edge indicates it is valid EOW - if (!cntxt_->NoisyInput() && out_col->ColIdx() >= seg_pt_cnt_ && - !lm_edges[edge]->IsEOW()) { - // free edge since no object is going to own it - delete lm_edges[edge]; - continue; - } - - // compute the recognition cost of this node - int recognition_cost = MIN_PROB_COST; - if (char_alt_list && char_alt_list->AltCount() > 0) { - recognition_cost = MAX(0, char_alt_list->ClassCost( - lm_edges[edge]->ClassID())); - // Add the no space cost. This should zero in word mode - recognition_cost += extra_cost; - } - - // Note that the edge will be freed inside the column if - // AddNode is called - if (recognition_cost >= 0) { - out_col->AddNode(lm_edges[edge], recognition_cost, parent_node, - cntxt_); - } else { - delete lm_edges[edge]; - } - } // edge - // free edge array - delete []lm_edges; - } // lm_edges -} - -// Performs a beam search in the specified search using the specified -// language model; returns an alternate list of possible words as a result. 
-WordAltList * BeamSearch::Search(SearchObject *srch_obj, LangModel *lang_mod) { - // verifications - if (!lang_mod) - lang_mod = cntxt_->LangMod(); - if (!lang_mod) { - fprintf(stderr, "Cube ERROR (BeamSearch::Search): could not construct " - "LangModel\n"); - return NULL; - } - - // free existing state - Cleanup(); - - // get seg pt count - seg_pt_cnt_ = srch_obj->SegPtCnt(); - if (seg_pt_cnt_ < 0) { - return NULL; - } - col_cnt_ = seg_pt_cnt_ + 1; - - // disregard suspicious cases - if (seg_pt_cnt_ > 128) { - fprintf(stderr, "Cube ERROR (BeamSearch::Search): segment point count is " - "suspiciously high; bailing out\n"); - return NULL; - } - - // alloc memory for columns - col_ = new SearchColumn *[col_cnt_]; - if (!col_) { - fprintf(stderr, "Cube ERROR (BeamSearch::Search): could not construct " - "SearchColumn array\n"); - return NULL; - } - memset(col_, 0, col_cnt_ * sizeof(*col_)); - - // for all possible segments - for (int end_seg = 1; end_seg <= (seg_pt_cnt_ + 1); end_seg++) { - // create a search column - col_[end_seg - 1] = new SearchColumn(end_seg - 1, - cntxt_->Params()->BeamWidth()); - if (!col_[end_seg - 1]) { - fprintf(stderr, "Cube ERROR (BeamSearch::Search): could not construct " - "SearchColumn for column %d\n", end_seg - 1); - return NULL; - } - - // for all possible start segments - int init_seg = MAX(0, end_seg - cntxt_->Params()->MaxSegPerChar()); - for (int strt_seg = init_seg; strt_seg < end_seg; strt_seg++) { - int parent_nodes_cnt; - SearchNode **parent_nodes; - - // for the root segment, we do not have a parent - if (strt_seg == 0) { - parent_nodes_cnt = 1; - parent_nodes = NULL; - } else { - // for all the existing nodes in the starting column - parent_nodes_cnt = col_[strt_seg - 1]->NodeCount(); - parent_nodes = col_[strt_seg - 1]->Nodes(); - } - - // run the shape recognizer - CharAltList *char_alt_list = srch_obj->RecognizeSegment(strt_seg - 1, - end_seg - 1); - // for all the possible parents - for (int parent_idx = 0; parent_idx < 
parent_nodes_cnt; parent_idx++) { - // point to the parent node - SearchNode *parent_node = !parent_nodes ? NULL - : parent_nodes[parent_idx]; - LangModEdge *lm_parent_edge = !parent_node ? lang_mod->Root() - : parent_node->LangModelEdge(); - - // compute the cost of not having spaces within the segment range - int contig_cost = srch_obj->NoSpaceCost(strt_seg - 1, end_seg - 1); - - // In phrase mode, compute the cost of not having a space before - // this character - int no_space_cost = 0; - if (!word_mode_ && strt_seg > 0) { - no_space_cost = srch_obj->NoSpaceCost(strt_seg - 1); - } - - // if the no space cost is low enough - if ((contig_cost + no_space_cost) < MIN_PROB_COST) { - // Add the children nodes - CreateChildren(col_[end_seg - 1], lang_mod, parent_node, - lm_parent_edge, char_alt_list, - contig_cost + no_space_cost); - } - - // In phrase mode and if not starting at the root - if (!word_mode_ && strt_seg > 0) { // parent_node must be non-NULL - // consider starting a new word for nodes that are valid EOW - if (parent_node->LangModelEdge()->IsEOW()) { - // get the space cost - int space_cost = srch_obj->SpaceCost(strt_seg - 1); - // if the space cost is low enough - if ((contig_cost + space_cost) < MIN_PROB_COST) { - // Restart the language model and add nodes as children to the - // space node. - CreateChildren(col_[end_seg - 1], lang_mod, parent_node, NULL, - char_alt_list, contig_cost + space_cost); - } - } - } - } // parent - } // strt_seg - - // prune the column nodes - col_[end_seg - 1]->Prune(); - - // Free the column hash table. No longer needed - col_[end_seg - 1]->FreeHashTable(); - } // end_seg - - WordAltList *alt_list = CreateWordAltList(srch_obj); - return alt_list; -} - -// Creates a Word alternate list from the results in the lattice. 
-WordAltList *BeamSearch::CreateWordAltList(SearchObject *srch_obj) { - // create an alternate list of all the nodes in the last column - int node_cnt = col_[col_cnt_ - 1]->NodeCount(); - SearchNode **srch_nodes = col_[col_cnt_ - 1]->Nodes(); - CharBigrams *bigrams = cntxt_->Bigrams(); - WordUnigrams *word_unigrams = cntxt_->WordUnigramsObj(); - - // Save the index of the best-cost node before the alt list is - // sorted, so that we can retrieve it from the node list when backtracking. - best_presorted_node_idx_ = 0; - int best_cost = -1; - - if (node_cnt <= 0) - return NULL; - - // start creating the word alternate list - WordAltList *alt_list = new WordAltList(node_cnt + 1); - for (int node_idx = 0; node_idx < node_cnt; node_idx++) { - // recognition cost - int recognition_cost = srch_nodes[node_idx]->BestCost(); - // compute the size cost of the alternate - char_32 *ch_buff = NULL; - int size_cost = SizeCost(srch_obj, srch_nodes[node_idx], &ch_buff); - // accumulate other costs - if (ch_buff) { - int cost = 0; - // char bigram cost - int bigram_cost = !bigrams ? 0 : - bigrams->Cost(ch_buff, cntxt_->CharacterSet()); - // word unigram cost - int unigram_cost = !word_unigrams ? 0 : - word_unigrams->Cost(ch_buff, cntxt_->LangMod(), - cntxt_->CharacterSet()); - // overall cost - cost = static_cast( - (size_cost * cntxt_->Params()->SizeWgt()) + - (bigram_cost * cntxt_->Params()->CharBigramWgt()) + - (unigram_cost * cntxt_->Params()->WordUnigramWgt()) + - (recognition_cost * cntxt_->Params()->RecoWgt())); - - // insert into word alt list - alt_list->Insert(ch_buff, cost, - static_cast(srch_nodes[node_idx])); - // Note that strict < is necessary because WordAltList::Sort() - // uses it in a bubble sort to swap entries. 
- if (best_cost < 0 || cost < best_cost) { - best_presorted_node_idx_ = node_idx; - best_cost = cost; - } - delete []ch_buff; - } - } - - // sort the alternates based on cost - alt_list->Sort(); - return alt_list; -} - -// Returns the lattice column corresponding to the specified column index. -SearchColumn *BeamSearch::Column(int col) const { - if (col < 0 || col >= col_cnt_ || !col_) - return NULL; - return col_[col]; -} - -// Returns the best node in the last column of last performed search. -SearchNode *BeamSearch::BestNode() const { - if (col_cnt_ < 1 || !col_ || !col_[col_cnt_ - 1]) - return NULL; - - int node_cnt = col_[col_cnt_ - 1]->NodeCount(); - SearchNode **srch_nodes = col_[col_cnt_ - 1]->Nodes(); - if (node_cnt < 1 || !srch_nodes || !srch_nodes[0]) - return NULL; - return srch_nodes[0]; -} - -// Returns the string corresponding to the specified alt. -char_32 *BeamSearch::Alt(int alt) const { - // get the last column of the lattice - if (col_cnt_ <= 0) - return NULL; - - SearchColumn *srch_col = col_[col_cnt_ - 1]; - if (!srch_col) - return NULL; - - // point to the last node in the selected path - if (alt >= srch_col->NodeCount() || srch_col->Nodes() == NULL) { - return NULL; - } - - SearchNode *srch_node = srch_col->Nodes()[alt]; - if (!srch_node) - return NULL; - - // get string - char_32 *str32 = srch_node->PathString(); - if (!str32) - return NULL; - - return str32; -} - -// Backtracks from the specified node index and returns the corresponding -// character mapped segments and character count. Optional return -// arguments are the char_32 result string and character bounding -// boxes, if non-NULL values are passed in. 
-CharSamp **BeamSearch::BackTrack(SearchObject *srch_obj, int node_index, - int *char_cnt, char_32 **str32, - Boxa **char_boxes) const { - // get the last column of the lattice - if (col_cnt_ <= 0) - return NULL; - SearchColumn *srch_col = col_[col_cnt_ - 1]; - if (!srch_col) - return NULL; - - // point to the last node in the selected path - if (node_index >= srch_col->NodeCount() || !srch_col->Nodes()) - return NULL; - - SearchNode *srch_node = srch_col->Nodes()[node_index]; - if (!srch_node) - return NULL; - return BackTrack(srch_obj, srch_node, char_cnt, str32, char_boxes); -} - -// Backtracks from the specified node index and returns the corresponding -// character mapped segments and character count. Optional return -// arguments are the char_32 result string and character bounding -// boxes, if non-NULL values are passed in. -CharSamp **BeamSearch::BackTrack(SearchObject *srch_obj, SearchNode *srch_node, - int *char_cnt, char_32 **str32, - Boxa **char_boxes) const { - if (!srch_node) - return NULL; - - if (str32) { - if (*str32) - delete [](*str32); // clear existing value - *str32 = srch_node->PathString(); - if (!*str32) - return NULL; - } - - if (char_boxes && *char_boxes) { - boxaDestroy(char_boxes); // clear existing value - } - - CharSamp **chars; - chars = SplitByNode(srch_obj, srch_node, char_cnt, char_boxes); - if (!chars && str32) - delete []*str32; - return chars; -} - -// Backtracks from the given lattice node and return the corresponding -// char mapped segments and character count. The character bounding -// boxes are optional return arguments, if non-NULL values are passed in. 
-CharSamp **BeamSearch::SplitByNode(SearchObject *srch_obj, - SearchNode *srch_node, - int *char_cnt, - Boxa **char_boxes) const { - // Count the characters (could be less than the path length when in - // phrase mode) - *char_cnt = 0; - SearchNode *node = srch_node; - while (node) { - node = node->ParentNode(); - (*char_cnt)++; - } - - if (*char_cnt == 0) - return NULL; - - // Allocate box array - if (char_boxes) { - if (*char_boxes) - boxaDestroy(char_boxes); // clear existing value - *char_boxes = boxaCreate(*char_cnt); - if (*char_boxes == NULL) - return NULL; - } - - // Allocate memory for CharSamp array. - CharSamp **chars = new CharSamp *[*char_cnt]; - if (!chars) { - if (char_boxes) - boxaDestroy(char_boxes); - return NULL; - } - - int ch_idx = *char_cnt - 1; - int seg_pt_cnt = srch_obj->SegPtCnt(); - bool success=true; - while (srch_node && ch_idx >= 0) { - // Parent node (could be null) - SearchNode *parent_node = srch_node->ParentNode(); - - // Get the seg pts corresponding to the search node - int st_col = !parent_node ? 0 : parent_node->ColIdx() + 1; - int st_seg_pt = st_col <= 0 ? -1 : st_col - 1; - int end_col = srch_node->ColIdx(); - int end_seg_pt = end_col >= seg_pt_cnt ? seg_pt_cnt : end_col; - - // Get a char sample corresponding to the segmentation points - CharSamp *samp = srch_obj->CharSample(st_seg_pt, end_seg_pt); - if (!samp) { - success = false; - break; - } - samp->SetLabel(srch_node->NodeString()); - chars[ch_idx] = samp; - if (char_boxes) { - // Create the corresponding character bounding box - Box *char_box = boxCreate(samp->Left(), samp->Top(), - samp->Width(), samp->Height()); - if (!char_box) { - success = false; - break; - } - boxaAddBox(*char_boxes, char_box, L_INSERT); - } - srch_node = parent_node; - ch_idx--; - } - if (!success) { - delete []chars; - if (char_boxes) - boxaDestroy(char_boxes); - return NULL; - } - - // Reverse the order of boxes. 
- if (char_boxes) { - int char_boxa_size = boxaGetCount(*char_boxes); - int limit = char_boxa_size / 2; - for (int i = 0; i < limit; ++i) { - int box1_idx = i; - int box2_idx = char_boxa_size - 1 - i; - Box *box1 = boxaGetBox(*char_boxes, box1_idx, L_CLONE); - Box *box2 = boxaGetBox(*char_boxes, box2_idx, L_CLONE); - boxaReplaceBox(*char_boxes, box2_idx, box1); - boxaReplaceBox(*char_boxes, box1_idx, box2); - } - } - return chars; -} - -// Returns the size cost of a string for a lattice path that -// ends at the specified lattice node. -int BeamSearch::SizeCost(SearchObject *srch_obj, SearchNode *node, - char_32 **str32) const { - CharSamp **chars = NULL; - int char_cnt = 0; - if (!node) - return 0; - // Backtrack to get string and character segmentation - chars = BackTrack(srch_obj, node, &char_cnt, str32, NULL); - if (!chars) - return WORST_COST; - int size_cost = (cntxt_->SizeModel() == NULL) ? 0 : - cntxt_->SizeModel()->Cost(chars, char_cnt); - delete []chars; - return size_cost; -} -} // namespace tesesract diff --git a/cube/beam_search.h b/cube/beam_search.h deleted file mode 100644 index cd8fc011..00000000 --- a/cube/beam_search.h +++ /dev/null @@ -1,126 +0,0 @@ -/********************************************************************** - * File: beam_search.h - * Description: Declaration of Beam Word Search Algorithm Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -// The Beam Search class implements a Beam Search algorithm for the -// N-best paths through the lattice of a search object using a language model -// The search object is a segmented bitmap of a word image. The language model -// is a state machine that defines valid sequences of characters -// The cost of each path is the combined (product) probabilities of the -// characters along the path. The character probabilities are computed using -// the character classifier member of the RecoContext -// The BeamSearch class itself holds the state of the last search it performed -// using its "Search" method. Subsequent class to the Search method erase the -// states of previously done searches - -#ifndef BEAM_SEARCH_H -#define BEAM_SEARCH_H - -#include "search_column.h" -#include "word_altlist.h" -#include "search_object.h" -#include "lang_model.h" -#include "cube_utils.h" -#include "cube_reco_context.h" -#include "allheaders.h" - -namespace tesseract { - -class BeamSearch { - public: - explicit BeamSearch(CubeRecoContext *cntxt, bool word_mode = true); - ~BeamSearch(); - // Performs a beam search in the specified search using the specified - // language model; returns an alternate list of possible words as a result. - WordAltList *Search(SearchObject *srch_obj, LangModel *lang_mod = NULL); - // Returns the best node in the last column of last performed search. - SearchNode *BestNode() const; - // Returns the string corresponding to the specified alt. - char_32 *Alt(int alt) const; - // Backtracks from the specified lattice node and returns the corresponding - // character-mapped segments, character count, char_32 result string, and - // character bounding boxes (if char_boxes is not NULL). If the segments - // cannot be constructed, returns NULL, and all result arguments - // will be NULL. 
- CharSamp **BackTrack(SearchObject *srch_obj, int node_index, - int *char_cnt, char_32 **str32, Boxa **char_boxes) const; - // Same as above, except it takes a pointer to a search node object - // instead of node index. - CharSamp **BackTrack(SearchObject *srch_obj, SearchNode *node, - int *char_cnt, char_32 **str32, Boxa **char_boxes) const; - // Returns the size cost of a specified string of a lattice - // path that ends at the specified lattice node. - int SizeCost(SearchObject *srch_obj, SearchNode *node, - char_32 **str32 = NULL) const; - // Returns the word unigram cost of the given string, possibly - // stripping out a single trailing punctuation character. - int WordUnigramCost(char_32 *str32, WordUnigrams* word_unigrams) const; - - // Supplementary functions needed for visualization - // Return column count of the lattice. - inline int ColCnt() const { return col_cnt_; } - // Returns the lattice column corresponding to the specified column index. - SearchColumn *Column(int col_idx) const; - // Return the index of the best node in the last column of the - // best-cost path before the alternates list is sorted. - inline int BestPresortedNodeIndex() const { - return best_presorted_node_idx_; - }; - - private: - // Maximum reasonable segmentation point count - static const int kMaxSegPointCnt = 128; - // Recognition context object; the context holds the character classifier - // and the tuning parameters object - CubeRecoContext *cntxt_; - // Count of segmentation pts - int seg_pt_cnt_; - // Lattice column count; currently redundant with respect to seg_pt_cnt_ - // but that might change in the future - int col_cnt_; - // Array of lattice columns - SearchColumn **col_; - // Run in word or phrase mode - bool word_mode_; - // Node index of best-cost node, before alternates are merged and sorted - int best_presorted_node_idx_; - // Cleans up beam search state - void Cleanup(); - // Creates a Word alternate list from the results in the lattice. 
- // This function computes a cost for each node in the final column - // of the lattice, which is a weighted average of several costs: - // size cost, character bigram cost, word unigram cost, and - // recognition cost from the beam search. The weights are the - // CubeTuningParams, which are learned together with the character - // classifiers. - WordAltList *CreateWordAltList(SearchObject *srch_obj); - // Creates a set of children nodes emerging from a parent node based on - // the character alternate list and the language model. - void CreateChildren(SearchColumn *out_col, LangModel *lang_mod, - SearchNode *parent_node, LangModEdge *lm_parent_edge, - CharAltList *char_alt_list, int extra_cost); - // Backtracks from the given lattice node and returns the corresponding - // char mapped segments, character count, and character bounding boxes (if - // char_boxes is not NULL). If the segments cannot be constructed, - // returns NULL, and all result arguments will be NULL. - CharSamp **SplitByNode(SearchObject *srch_obj, SearchNode *srch_node, - int* char_cnt, Boxa **char_boxes) const; -}; -} - -#endif // BEAM_SEARCH_H diff --git a/cube/bmp_8.cpp b/cube/bmp_8.cpp deleted file mode 100644 index f7b6e0a1..00000000 --- a/cube/bmp_8.cpp +++ /dev/null @@ -1,1144 +0,0 @@ -/********************************************************************** - * File: bmp_8.cpp - * Description: Implementation of an 8-bit Bitmap class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include -#include -#include -#include -#include "bmp_8.h" -#include "con_comp.h" -#include "platform.h" -#ifdef USE_STD_NAMESPACE -using std::min; -using std::max; -#endif - -namespace tesseract { - -const int Bmp8::kDeslantAngleCount = (1 + static_cast(0.5f + - (kMaxDeslantAngle - kMinDeslantAngle) / kDeslantAngleDelta)); -float *Bmp8::tan_table_ = NULL; - -Bmp8::Bmp8(unsigned short wid, unsigned short hgt) - : wid_(wid) - , hgt_(hgt) { - line_buff_ = CreateBmpBuffer(); -} - -Bmp8::~Bmp8() { - FreeBmpBuffer(line_buff_); -} - -// free buffer -void Bmp8::FreeBmpBuffer(unsigned char **buff) { - if (buff != NULL) { - if (buff[0] != NULL) { - delete []buff[0]; - } - delete []buff; - } -} - -void Bmp8::FreeBmpBuffer(unsigned int **buff) { - if (buff != NULL) { - if (buff[0] != NULL) { - delete []buff[0]; - } - delete []buff; - } -} - -// init bmp buffers -unsigned char **Bmp8::CreateBmpBuffer(unsigned char init_val) { - unsigned char **buff; - - // Check valid sizes - if (!hgt_ || !wid_) - return NULL; - - // compute stride (align on 4 byte boundries) - stride_ = ((wid_ % 4) == 0) ? 
wid_ : (4 * (1 + (wid_ / 4))); - - buff = (unsigned char **) new unsigned char *[hgt_ * sizeof(*buff)]; - if (!buff) { - delete []buff; - return NULL; - } - - // alloc and init memory for buffer and line buffer - buff[0] = (unsigned char *) - new unsigned char[stride_ * hgt_ * sizeof(*buff[0])]; - if (!buff[0]) { - return NULL; - } - - memset(buff[0], init_val, stride_ * hgt_ * sizeof(*buff[0])); - - for (int y = 1; y < hgt_; y++) { - buff[y] = buff[y -1] + stride_; - } - - return buff; -} - -// init bmp buffers -unsigned int ** Bmp8::CreateBmpBuffer(int wid, int hgt, - unsigned char init_val) { - unsigned int **buff; - - // compute stride (align on 4 byte boundries) - buff = (unsigned int **) new unsigned int *[hgt * sizeof(*buff)]; - if (!buff) { - delete []buff; - return NULL; - } - - // alloc and init memory for buffer and line buffer - buff[0] = (unsigned int *) new unsigned int[wid * hgt * sizeof(*buff[0])]; - if (!buff[0]) { - return NULL; - } - - memset(buff[0], init_val, wid * hgt * sizeof(*buff[0])); - - for (int y = 1; y < hgt; y++) { - buff[y] = buff[y -1] + wid; - } - - return buff; -} - -// clears the contents of the bmp -bool Bmp8::Clear() { - if (line_buff_ == NULL) { - return false; - } - - memset(line_buff_[0], 0xff, stride_ * hgt_ * sizeof(*line_buff_[0])); - return true; -} - -bool Bmp8::LoadFromCharDumpFile(CachedFile *fp) { - unsigned short wid; - unsigned short hgt; - unsigned short x; - unsigned short y; - int buf_size; - int pix; - int pix_cnt; - unsigned int val32; - unsigned char *buff; - - // read and check 32 bit marker - if (fp->Read(&val32, sizeof(val32)) != sizeof(val32)) { - return false; - } - - if (val32 != kMagicNumber) { - return false; - } - - // read wid and hgt - if (fp->Read(&wid, sizeof(wid)) != sizeof(wid)) { - return false; - } - - if (fp->Read(&hgt, sizeof(hgt)) != sizeof(hgt)) { - return false; - } - - // read buf size - if (fp->Read(&buf_size, sizeof(buf_size)) != sizeof(buf_size)) { - return false; - } - - // validate 
buf size: for now, only 3 channel (RBG) is supported - pix_cnt = wid * hgt; - if (buf_size != (3 * pix_cnt)) { - return false; - } - - // alloc memory & read the 3 channel buffer - buff = new unsigned char[buf_size]; - if (buff == NULL) { - return false; - } - - if (fp->Read(buff, buf_size) != buf_size) { - delete []buff; - return false; - } - - // create internal buffers - wid_ = wid; - hgt_ = hgt; - - line_buff_ = CreateBmpBuffer(); - if (line_buff_ == NULL) { - delete []buff; - return false; - } - - // copy the data - for (y = 0, pix = 0; y < hgt_; y++) { - for (x = 0; x < wid_; x++, pix += 3) { - // for now we only support gray scale, - // so we expect R = G = B, it this is not the case, bail out - if (buff[pix] != buff[pix + 1] || buff[pix] != buff[pix + 2]) { - delete []buff; - return false; - } - line_buff_[y][x] = buff[pix]; - } - } - - // delete temp buffer - delete[]buff; - - return true; -} - -Bmp8 * Bmp8::FromCharDumpFile(CachedFile *fp) { - // create a Bmp8 object - Bmp8 *bmp_obj = new Bmp8(0, 0); - if (bmp_obj == NULL) { - return NULL; - } - - if (bmp_obj->LoadFromCharDumpFile(fp) == false) { - delete bmp_obj; - return NULL; - } - - return bmp_obj; -} - -bool Bmp8::LoadFromCharDumpFile(FILE *fp) { - unsigned short wid; - unsigned short hgt; - unsigned short x; - unsigned short y; - int buf_size; - int pix; - int pix_cnt; - unsigned int val32; - unsigned char *buff; - - // read and check 32 bit marker - if (fread(&val32, 1, sizeof(val32), fp) != sizeof(val32)) { - return false; - } - - if (val32 != kMagicNumber) { - return false; - } - - // read wid and hgt - if (fread(&wid, 1, sizeof(wid), fp) != sizeof(wid)) { - return false; - } - - if (fread(&hgt, 1, sizeof(hgt), fp) != sizeof(hgt)) { - return false; - } - - // read buf size - if (fread(&buf_size, 1, sizeof(buf_size), fp) != sizeof(buf_size)) { - return false; - } - - // validate buf size: for now, only 3 channel (RBG) is supported - pix_cnt = wid * hgt; - if (buf_size != (3 * pix_cnt)) { - return 
false; - } - - // alloc memory & read the 3 channel buffer - buff = new unsigned char[buf_size]; - if (buff == NULL) { - return false; - } - - if (fread(buff, 1, buf_size, fp) != buf_size) { - delete []buff; - return false; - } - - // create internal buffers - wid_ = wid; - hgt_ = hgt; - - line_buff_ = CreateBmpBuffer(); - if (line_buff_ == NULL) { - delete []buff; - return false; - } - - // copy the data - for (y = 0, pix = 0; y < hgt_; y++) { - for (x = 0; x < wid_; x++, pix += 3) { - // for now we only support gray scale, - // so we expect R = G = B, it this is not the case, bail out - if (buff[pix] != buff[pix + 1] || buff[pix] != buff[pix + 2]) { - delete []buff; - return false; - } - line_buff_[y][x] = buff[pix]; - } - } - - // delete temp buffer - delete[]buff; - - return true; -} - -Bmp8 * Bmp8::FromCharDumpFile(FILE *fp) { - // create a Bmp8 object - Bmp8 *bmp_obj = new Bmp8(0, 0); - if (bmp_obj == NULL) { - return NULL; - } - - if (bmp_obj->LoadFromCharDumpFile(fp) == false) { - delete bmp_obj; - return NULL; - } - - return bmp_obj; -} - -bool Bmp8::IsBlankColumn(int x) const { - for (int y = 0; y < hgt_; y++) { - if (line_buff_[y][x] != 0xff) { - return false; - } - } - - return true; -} - -bool Bmp8::IsBlankRow(int y) const { - for (int x = 0; x < wid_; x++) { - if (line_buff_[y][x] != 0xff) { - return false; - } - } - - return true; -} - -// crop the bitmap returning new dimensions -void Bmp8::Crop(int *xst, int *yst, int *wid, int *hgt) { - (*xst) = 0; - (*yst) = 0; - - int xend = wid_ - 1; - int yend = hgt_ - 1; - - while ((*xst) < (wid_ - 1) && (*xst) <= xend) { - // column is not empty - if (!IsBlankColumn((*xst))) { - break; - } - (*xst)++; - } - - while (xend > 0 && xend >= (*xst)) { - // column is not empty - if (!IsBlankColumn(xend)) { - break; - } - xend--; - } - - while ((*yst) < (hgt_ - 1) && (*yst) <= yend) { - // column is not empty - if (!IsBlankRow((*yst))) { - break; - } - (*yst)++; - } - - while (yend > 0 && yend >= (*yst)) { - // 
column is not empty - if (!IsBlankRow(yend)) { - break; - } - yend--; - } - - (*wid) = xend - (*xst) + 1; - (*hgt) = yend - (*yst) + 1; -} - -// generates a scaled bitmap with dimensions the new bmp will have the -// same aspect ratio and will be centered in the box -bool Bmp8::ScaleFrom(Bmp8 *bmp, bool isotropic) { - int x_num; - int x_denom; - int y_num; - int y_denom; - int xoff; - int yoff; - int xsrc; - int ysrc; - int xdest; - int ydest; - int xst_src = 0; - int yst_src = 0; - int xend_src = bmp->wid_ - 1; - int yend_src = bmp->hgt_ - 1; - int wid_src; - int hgt_src; - - // src dimensions - wid_src = xend_src - xst_src + 1, - hgt_src = yend_src - yst_src + 1; - - // scale to maintain aspect ratio if required - if (isotropic) { - if ((wid_ * hgt_src) > (hgt_ * wid_src)) { - x_num = y_num = hgt_; - x_denom = y_denom = hgt_src; - } else { - x_num = y_num = wid_; - x_denom = y_denom = wid_src; - } - } else { - x_num = wid_; - y_num = hgt_; - x_denom = wid_src; - y_denom = hgt_src; - } - - // compute offsets needed to center new bmp - xoff = (wid_ - ((x_num * wid_src) / x_denom)) / 2; - yoff = (hgt_ - ((y_num * hgt_src) / y_denom)) / 2; - - // scale up - if (y_num > y_denom) { - for (ydest = yoff; ydest < (hgt_ - yoff); ydest++) { - // compute un-scaled y - ysrc = static_cast(0.5 + (1.0 * (ydest - yoff) * - y_denom / y_num)); - if (ysrc < 0 || ysrc >= hgt_src) { - continue; - } - - for (xdest = xoff; xdest < (wid_ - xoff); xdest++) { - // compute un-scaled y - xsrc = static_cast(0.5 + (1.0 * (xdest - xoff) * - x_denom / x_num)); - if (xsrc < 0 || xsrc >= wid_src) { - continue; - } - - line_buff_[ydest][xdest] = - bmp->line_buff_[ysrc + yst_src][xsrc + xst_src]; - } - } - } else { - // or scale down - // scaling down is a bit tricky: we'll accumulate pixels - // and then compute the means - unsigned int **dest_line_buff = CreateBmpBuffer(wid_, hgt_, 0), - **dest_pix_cnt = CreateBmpBuffer(wid_, hgt_, 0); - - for (ysrc = 0; ysrc < hgt_src; ysrc++) { - // compute 
scaled y - ydest = yoff + static_cast(0.5 + (1.0 * ysrc * y_num / y_denom)); - if (ydest < 0 || ydest >= hgt_) { - continue; - } - - for (xsrc = 0; xsrc < wid_src; xsrc++) { - // compute scaled y - xdest = xoff + static_cast(0.5 + (1.0 * xsrc * x_num / x_denom)); - if (xdest < 0 || xdest >= wid_) { - continue; - } - - dest_line_buff[ydest][xdest] += - bmp->line_buff_[ysrc + yst_src][xsrc + xst_src]; - dest_pix_cnt[ydest][xdest]++; - } - } - - for (ydest = 0; ydest < hgt_; ydest++) { - for (xdest = 0; xdest < wid_; xdest++) { - if (dest_pix_cnt[ydest][xdest] > 0) { - unsigned int pixval = - dest_line_buff[ydest][xdest] / dest_pix_cnt[ydest][xdest]; - - line_buff_[ydest][xdest] = - (unsigned char) min((unsigned int)255, pixval); - } - } - } - - // we no longer need these temp buffers - FreeBmpBuffer(dest_line_buff); - FreeBmpBuffer(dest_pix_cnt); - } - - return true; -} - -bool Bmp8::LoadFromRawData(unsigned char *data) { - unsigned char *pline_data = data; - - // copy the data - for (int y = 0; y < hgt_; y++, pline_data += wid_) { - memcpy(line_buff_[y], pline_data, wid_ * sizeof(*pline_data)); - } - - return true; -} - -bool Bmp8::SaveBmp2CharDumpFile(FILE *fp) const { - unsigned short wid; - unsigned short hgt; - unsigned short x; - unsigned short y; - int buf_size; - int pix; - int pix_cnt; - unsigned int val32; - unsigned char *buff; - - // write and check 32 bit marker - val32 = kMagicNumber; - if (fwrite(&val32, 1, sizeof(val32), fp) != sizeof(val32)) { - return false; - } - - // write wid and hgt - wid = wid_; - if (fwrite(&wid, 1, sizeof(wid), fp) != sizeof(wid)) { - return false; - } - - hgt = hgt_; - if (fwrite(&hgt, 1, sizeof(hgt), fp) != sizeof(hgt)) { - return false; - } - - // write buf size - pix_cnt = wid * hgt; - buf_size = 3 * pix_cnt; - if (fwrite(&buf_size, 1, sizeof(buf_size), fp) != sizeof(buf_size)) { - return false; - } - - // alloc memory & write the 3 channel buffer - buff = new unsigned char[buf_size]; - if (buff == NULL) { - return false; 
- } - - // copy the data - for (y = 0, pix = 0; y < hgt_; y++) { - for (x = 0; x < wid_; x++, pix += 3) { - buff[pix] = - buff[pix + 1] = - buff[pix + 2] = line_buff_[y][x]; - } - } - - if (fwrite(buff, 1, buf_size, fp) != buf_size) { - delete []buff; - return false; - } - - // delete temp buffer - delete[]buff; - - return true; -} - -// copy part of the specified bitmap to the top of the bitmap -// does any necessary clipping -void Bmp8::Copy(int x_st, int y_st, int wid, int hgt, Bmp8 *bmp_dest) const { - int x_end = min(x_st + wid, static_cast(wid_)), - y_end = min(y_st + hgt, static_cast(hgt_)); - - for (int y = y_st; y < y_end; y++) { - for (int x = x_st; x < x_end; x++) { - bmp_dest->line_buff_[y - y_st][x - x_st] = - line_buff_[y][x]; - } - } -} - -bool Bmp8::IsIdentical(Bmp8 *pBmp) const { - if (wid_ != pBmp->wid_ || hgt_ != pBmp->hgt_) { - return false; - } - - for (int y = 0; y < hgt_; y++) { - if (memcmp(line_buff_[y], pBmp->line_buff_[y], wid_) != 0) { - return false; - } - } - - return true; -} - -// Detect connected components in the bitmap -ConComp ** Bmp8::FindConComps(int *concomp_cnt, int min_size) const { - (*concomp_cnt) = 0; - - unsigned int **out_bmp_array = CreateBmpBuffer(wid_, hgt_, 0); - if (out_bmp_array == NULL) { - fprintf(stderr, "Cube ERROR (Bmp8::FindConComps): could not allocate " - "bitmap array\n"); - return NULL; - } - - // listed of connected components - ConComp **concomp_array = NULL; - - int x; - int y; - int x_nbr; - int y_nbr; - int concomp_id; - int alloc_concomp_cnt = 0; - - // neighbors to check - const int nbr_cnt = 4; - - // relative coordinates of nbrs - int x_del[nbr_cnt] = {-1, 0, 1, -1}, - y_del[nbr_cnt] = {-1, -1, -1, 0}; - - - for (y = 0; y < hgt_; y++) { - for (x = 0; x < wid_; x++) { - // is this a foreground pix - if (line_buff_[y][x] != 0xff) { - int master_concomp_id = 0; - ConComp *master_concomp = NULL; - - // checkout the nbrs - for (int nbr = 0; nbr < nbr_cnt; nbr++) { - x_nbr = x + x_del[nbr]; - y_nbr = 
y + y_del[nbr]; - - if (x_nbr < 0 || y_nbr < 0 || x_nbr >= wid_ || y_nbr >= hgt_) { - continue; - } - - // is this nbr a foreground pix - if (line_buff_[y_nbr][x_nbr] != 0xff) { - // get its concomp ID - concomp_id = out_bmp_array[y_nbr][x_nbr]; - - // this should not happen - if (concomp_id < 1 || concomp_id > alloc_concomp_cnt) { - fprintf(stderr, "Cube ERROR (Bmp8::FindConComps): illegal " - "connected component id: %d\n", concomp_id); - FreeBmpBuffer(out_bmp_array); - delete []concomp_array; - return NULL; - } - - // if we has previously found a component then merge the two - // and delete the latest one - if (master_concomp != NULL && concomp_id != master_concomp_id) { - // relabel all the pts - ConCompPt *pt_ptr = concomp_array[concomp_id - 1]->Head(); - while (pt_ptr != NULL) { - out_bmp_array[pt_ptr->y()][pt_ptr->x()] = master_concomp_id; - pt_ptr = pt_ptr->Next(); - } - - // merge the two concomp - if (!master_concomp->Merge(concomp_array[concomp_id - 1])) { - fprintf(stderr, "Cube ERROR (Bmp8::FindConComps): could not " - "merge connected component: %d\n", concomp_id); - FreeBmpBuffer(out_bmp_array); - delete []concomp_array; - return NULL; - } - - // delete the merged concomp - delete concomp_array[concomp_id - 1]; - concomp_array[concomp_id - 1] = NULL; - } else { - // this is the first concomp we encounter - master_concomp_id = concomp_id; - master_concomp = concomp_array[master_concomp_id - 1]; - - out_bmp_array[y][x] = master_concomp_id; - - if (!master_concomp->Add(x, y)) { - fprintf(stderr, "Cube ERROR (Bmp8::FindConComps): could not " - "add connected component (%d,%d)\n", x, y); - FreeBmpBuffer(out_bmp_array); - delete []concomp_array; - return NULL; - } - } - } // foreground nbr - } // nbrs - - // if there was no foreground pix, then create a new concomp - if (master_concomp == NULL) { - master_concomp = new ConComp(); - if (master_concomp == NULL || master_concomp->Add(x, y) == false) { - fprintf(stderr, "Cube ERROR (Bmp8::FindConComps): could 
not " - "allocate or add a connected component\n"); - FreeBmpBuffer(out_bmp_array); - delete []concomp_array; - return NULL; - } - - // extend the list of concomps if needed - if ((alloc_concomp_cnt % kConCompAllocChunk) == 0) { - ConComp **temp_con_comp = - new ConComp *[alloc_concomp_cnt + kConCompAllocChunk]; - if (temp_con_comp == NULL) { - fprintf(stderr, "Cube ERROR (Bmp8::FindConComps): could not " - "extend array of connected components\n"); - FreeBmpBuffer(out_bmp_array); - delete []concomp_array; - return NULL; - } - - if (alloc_concomp_cnt > 0) { - memcpy(temp_con_comp, concomp_array, - alloc_concomp_cnt * sizeof(*concomp_array)); - - delete []concomp_array; - } - - concomp_array = temp_con_comp; - } - - concomp_array[alloc_concomp_cnt++] = master_concomp; - out_bmp_array[y][x] = alloc_concomp_cnt; - } - } // foreground pix - } // x - } // y - - // free the concomp bmp - FreeBmpBuffer(out_bmp_array); - - if (alloc_concomp_cnt > 0 && concomp_array != NULL) { - // scan the array of connected components and color - // the o/p buffer with the corresponding concomps - (*concomp_cnt) = 0; - ConComp *concomp = NULL; - - for (int concomp_idx = 0; concomp_idx < alloc_concomp_cnt; concomp_idx++) { - concomp = concomp_array[concomp_idx]; - - // found a concomp - if (concomp != NULL) { - // add the connected component if big enough - if (concomp->PtCnt() > min_size) { - concomp->SetLeftMost(true); - concomp->SetRightMost(true); - concomp->SetID((*concomp_cnt)); - concomp_array[(*concomp_cnt)++] = concomp; - } else { - delete concomp; - } - } - } - } - - return concomp_array; -} - -// precompute the tan table to speedup deslanting -bool Bmp8::ComputeTanTable() { - int ang_idx; - float ang_val; - - // alloc memory for tan table - delete []tan_table_; - tan_table_ = new float[kDeslantAngleCount]; - if (tan_table_ == NULL) { - return false; - } - - for (ang_idx = 0, ang_val = kMinDeslantAngle; - ang_idx < kDeslantAngleCount; ang_idx++) { - tan_table_[ang_idx] = 
tan(ang_val * M_PI / 180.0f); - ang_val += kDeslantAngleDelta; - } - - return true; -} - -// generates a deslanted bitmap from the passed bitmap. -bool Bmp8::Deslant() { - int x; - int y; - int des_x; - int des_y; - int ang_idx; - int best_ang; - int min_des_x; - int max_des_x; - int des_wid; - - // only do deslanting if bitmap is wide enough - // otherwise it slant estimate might not be reliable - if (wid_ < (hgt_ * 2)) { - return true; - } - - // compute tan table if needed - if (tan_table_ == NULL && !ComputeTanTable()) { - return false; - } - - // compute min and max values for x after deslant - min_des_x = static_cast(0.5f + (hgt_ - 1) * tan_table_[0]); - max_des_x = (wid_ - 1) + - static_cast(0.5f + (hgt_ - 1) * tan_table_[kDeslantAngleCount - 1]); - - des_wid = max_des_x - min_des_x + 1; - - // alloc memory for histograms - int **angle_hist = new int*[kDeslantAngleCount]; - for (ang_idx = 0; ang_idx < kDeslantAngleCount; ang_idx++) { - angle_hist[ang_idx] = new int[des_wid]; - if (angle_hist[ang_idx] == NULL) { - delete[] angle_hist; - return false; - } - memset(angle_hist[ang_idx], 0, des_wid * sizeof(*angle_hist[ang_idx])); - } - - // compute histograms - for (y = 0; y < hgt_; y++) { - for (x = 0; x < wid_; x++) { - // find a non-bkgrnd pixel - if (line_buff_[y][x] != 0xff) { - des_y = hgt_ - y - 1; - // stamp all histograms - for (ang_idx = 0; ang_idx < kDeslantAngleCount; ang_idx++) { - des_x = x + static_cast(0.5f + (des_y * tan_table_[ang_idx])); - if (des_x >= min_des_x && des_x <= max_des_x) { - angle_hist[ang_idx][des_x - min_des_x]++; - } - } - } - } - } - - // find the histogram with the lowest entropy - float entropy; - double best_entropy = 0.0f; - double norm_val; - - best_ang = -1; - for (ang_idx = 0; ang_idx < kDeslantAngleCount; ang_idx++) { - entropy = 0.0f; - - for (x = min_des_x; x <= max_des_x; x++) { - if (angle_hist[ang_idx][x - min_des_x] > 0) { - norm_val = (1.0f * angle_hist[ang_idx][x - min_des_x] / hgt_); - entropy += (-1.0f * 
norm_val * log(norm_val)); - } - } - - if (best_ang == -1 || entropy < best_entropy) { - best_ang = ang_idx; - best_entropy = entropy; - } - - // free the histogram - delete[] angle_hist[ang_idx]; - } - delete[] angle_hist; - - // deslant - if (best_ang != -1) { - unsigned char **dest_lines; - int old_wid = wid_; - - // create a new buffer - wid_ = des_wid; - dest_lines = CreateBmpBuffer(); - if (dest_lines == NULL) { - return false; - } - - for (y = 0; y < hgt_; y++) { - for (x = 0; x < old_wid; x++) { - // find a non-bkgrnd pixel - if (line_buff_[y][x] != 0xff) { - des_y = hgt_ - y - 1; - // compute new pos - des_x = x + static_cast(0.5f + (des_y * tan_table_[best_ang])); - dest_lines[y][des_x - min_des_x] = 0; - } - } - } - - // free old buffer - FreeBmpBuffer(line_buff_); - line_buff_ = dest_lines; - } - return true; -} - -// Load dimensions & contents of bitmap from raw data -bool Bmp8::LoadFromCharDumpFile(unsigned char **raw_data_ptr) { - unsigned short wid; - unsigned short hgt; - unsigned short x; - unsigned short y; - unsigned char *raw_data = (*raw_data_ptr); - int buf_size; - int pix; - unsigned int val32; - - // read and check 32 bit marker - memcpy(&val32, raw_data, sizeof(val32)); - raw_data += sizeof(val32); - - if (val32 != kMagicNumber) { - return false; - } - - // read wid and hgt - memcpy(&wid, raw_data, sizeof(wid)); - raw_data += sizeof(wid); - - memcpy(&hgt, raw_data, sizeof(hgt)); - raw_data += sizeof(hgt); - - // read buf size - memcpy(&buf_size, raw_data, sizeof(buf_size)); - raw_data += sizeof(buf_size); - - // validate buf size: for now, only 3 channel (RBG) is supported - if (buf_size != (3 * wid * hgt)) { - return false; - } - - wid_ = wid; - hgt_ = hgt; - - line_buff_ = CreateBmpBuffer(); - if (line_buff_ == NULL) { - return false; - } - - // copy the data - for (y = 0, pix = 0; y < hgt_; y++) { - for (x = 0; x < wid_; x++, pix += 3) { - // for now we only support gray scale, - // so we expect R = G = B, it this is not the case, bail 
out - if (raw_data[pix] != raw_data[pix + 1] || - raw_data[pix] != raw_data[pix + 2]) { - return false; - } - - line_buff_[y][x] = raw_data[pix]; - } - } - - (*raw_data_ptr) = raw_data + buf_size; - return true; -} - -float Bmp8::ForegroundRatio() const { - int fore_cnt = 0; - - if (wid_ == 0 || hgt_ == 0) { - return 1.0; - } - - for (int y = 0; y < hgt_; y++) { - for (int x = 0; x < wid_; x++) { - fore_cnt += (line_buff_[y][x] == 0xff ? 0 : 1); - } - } - - return (1.0 * (fore_cnt / hgt_) / wid_); -} - -// generates a deslanted bitmap from the passed bitmap -bool Bmp8::HorizontalDeslant(double *deslant_angle) { - int x; - int y; - int des_y; - int ang_idx; - int best_ang; - int min_des_y; - int max_des_y; - int des_hgt; - - // compute tan table if necess. - if (tan_table_ == NULL && !ComputeTanTable()) { - return false; - } - - // compute min and max values for x after deslant - min_des_y = min(0, static_cast((wid_ - 1) * tan_table_[0])); - max_des_y = (hgt_ - 1) + - max(0, static_cast((wid_ - 1) * tan_table_[kDeslantAngleCount - 1])); - - des_hgt = max_des_y - min_des_y + 1; - - // alloc memory for histograms - int **angle_hist = new int*[kDeslantAngleCount]; - for (ang_idx = 0; ang_idx < kDeslantAngleCount; ang_idx++) { - angle_hist[ang_idx] = new int[des_hgt]; - if (angle_hist[ang_idx] == NULL) { - delete[] angle_hist; - return false; - } - memset(angle_hist[ang_idx], 0, des_hgt * sizeof(*angle_hist[ang_idx])); - } - - // compute histograms - for (y = 0; y < hgt_; y++) { - for (x = 0; x < wid_; x++) { - // find a non-bkgrnd pixel - if (line_buff_[y][x] != 0xff) { - // stamp all histograms - for (ang_idx = 0; ang_idx < kDeslantAngleCount; ang_idx++) { - des_y = y - static_cast(x * tan_table_[ang_idx]); - if (des_y >= min_des_y && des_y <= max_des_y) { - angle_hist[ang_idx][des_y - min_des_y]++; - } - } - } - } - } - - // find the histogram with the lowest entropy - float entropy; - float best_entropy = 0.0f; - float norm_val; - - best_ang = -1; - for (ang_idx = 
0; ang_idx < kDeslantAngleCount; ang_idx++) { - entropy = 0.0f; - - for (y = min_des_y; y <= max_des_y; y++) { - if (angle_hist[ang_idx][y - min_des_y] > 0) { - norm_val = (1.0f * angle_hist[ang_idx][y - min_des_y] / wid_); - entropy += (-1.0f * norm_val * log(norm_val)); - } - } - - if (best_ang == -1 || entropy < best_entropy) { - best_ang = ang_idx; - best_entropy = entropy; - } - - // free the histogram - delete[] angle_hist[ang_idx]; - } - delete[] angle_hist; - - (*deslant_angle) = 0.0; - - // deslant - if (best_ang != -1) { - unsigned char **dest_lines; - int old_hgt = hgt_; - - // create a new buffer - min_des_y = min(0, static_cast((wid_ - 1) * -tan_table_[best_ang])); - max_des_y = (hgt_ - 1) + - max(0, static_cast((wid_ - 1) * -tan_table_[best_ang])); - hgt_ = max_des_y - min_des_y + 1; - dest_lines = CreateBmpBuffer(); - if (dest_lines == NULL) { - return false; - } - - for (y = 0; y < old_hgt; y++) { - for (x = 0; x < wid_; x++) { - // find a non-bkgrnd pixel - if (line_buff_[y][x] != 0xff) { - // compute new pos - des_y = y - static_cast((x * tan_table_[best_ang])); - dest_lines[des_y - min_des_y][x] = 0; - } - } - } - - // free old buffer - FreeBmpBuffer(line_buff_); - line_buff_ = dest_lines; - - (*deslant_angle) = kMinDeslantAngle + (best_ang * kDeslantAngleDelta); - } - - return true; -} - -float Bmp8::MeanHorizontalHistogramEntropy() const { - float entropy = 0.0f; - - // compute histograms - for (int y = 0; y < hgt_; y++) { - int pix_cnt = 0; - - for (int x = 0; x < wid_; x++) { - // find a non-bkgrnd pixel - if (line_buff_[y][x] != 0xff) { - pix_cnt++; - } - } - - if (pix_cnt > 0) { - float norm_val = (1.0f * pix_cnt / wid_); - entropy += (-1.0f * norm_val * log(norm_val)); - } - } - - return entropy / hgt_; -} - -int *Bmp8::HorizontalHistogram() const { - int *hist = new int[hgt_]; - if (hist == NULL) { - return NULL; - } - - // compute histograms - for (int y = 0; y < hgt_; y++) { - hist[y] = 0; - - for (int x = 0; x < wid_; x++) { - // find 
a non-bkgrnd pixel - if (line_buff_[y][x] != 0xff) { - hist[y]++; - } - } - } - - return hist; -} - -} // namespace tesseract diff --git a/cube/bmp_8.h b/cube/bmp_8.h deleted file mode 100644 index 7200d7da..00000000 --- a/cube/bmp_8.h +++ /dev/null @@ -1,122 +0,0 @@ -/********************************************************************** - * File: bmp_8.h - * Description: Declaration of an 8-bit Bitmap class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#ifndef BMP8_H -#define BMP8_H - -// The Bmp8 class is an 8-bit bitmap that represents images of -// words, characters and segments throughout Cube -// It is meant to provide fast access to the bitmap bits and provide -// fast scaling, cropping, deslanting, connected components detection, -// loading and saving functionality - -#include -#include -#include "con_comp.h" -#include "cached_file.h" - -namespace tesseract { - -// Non-integral deslanting parameters. 
-static const float kMinDeslantAngle = -30.0f; -static const float kMaxDeslantAngle = 30.0f; -static const float kDeslantAngleDelta = 0.5f; - -class Bmp8 { - public: - Bmp8(unsigned short wid, unsigned short hgt); - ~Bmp8(); - // Clears the bitmap - bool Clear(); - // accessors to bitmap dimensions - inline unsigned short Width() const { return wid_; } - inline unsigned short Stride() const { return stride_; } - inline unsigned short Height() const { return hgt_; } - inline unsigned char *RawData() const { - return (line_buff_ == NULL ? NULL : line_buff_[0]); - } - // creates a scaled version of the specified bitmap - // Optionally, scaling can be isotropic (preserving aspect ratio) or not - bool ScaleFrom(Bmp8 *bmp, bool isotropic = true); - // Deslant the bitmap vertically - bool Deslant(); - // Deslant the bitmap horizontally - bool HorizontalDeslant(double *deslant_angle); - // Create a bitmap object from a file - static Bmp8 *FromCharDumpFile(CachedFile *fp); - static Bmp8 *FromCharDumpFile(FILE *fp); - // are two bitmaps identical - bool IsIdentical(Bmp8 *pBmp) const; - // Detect connected components - ConComp ** FindConComps(int *concomp_cnt, int min_size) const; - // compute the foreground ratio - float ForegroundRatio() const; - // returns the mean horizontal histogram entropy of the bitmap - float MeanHorizontalHistogramEntropy() const; - // returns the horizontal histogram of the bitmap - int *HorizontalHistogram() const; - - private: - // Compute a look up tan table that will be used for fast slant computation - static bool ComputeTanTable(); - // create a bitmap buffer (two flavors char & int) and init contents - unsigned char ** CreateBmpBuffer(unsigned char init_val = 0xff); - static unsigned int ** CreateBmpBuffer(int wid, int hgt, - unsigned char init_val = 0xff); - // Free a bitmap buffer - static void FreeBmpBuffer(unsigned char **buff); - static void FreeBmpBuffer(unsigned int **buff); - - // a static array that holds the tan lookup table - 
static float *tan_table_; - // bitmap 32-bit-aligned stride - unsigned short stride_; - // Bmp8 magic number used to validate saved bitmaps - static const unsigned int kMagicNumber = 0xdeadbeef; - - protected: - // bitmap dimensions - unsigned short wid_; - unsigned short hgt_; - // bitmap contents - unsigned char **line_buff_; - // deslanting parameters - static const int kConCompAllocChunk = 16; - static const int kDeslantAngleCount; - - // Load dimensions & contents of bitmap from file - bool LoadFromCharDumpFile(CachedFile *fp); - bool LoadFromCharDumpFile(FILE *fp); - // Load dimensions & contents of bitmap from raw data - bool LoadFromCharDumpFile(unsigned char **raw_data); - // Load contents of bitmap from raw data - bool LoadFromRawData(unsigned char *data); - // save bitmap to a file - bool SaveBmp2CharDumpFile(FILE *fp) const; - // checks if a row or a column are entirely blank - bool IsBlankColumn(int x) const; - bool IsBlankRow(int y) const; - // crop the bitmap returning new dimensions - void Crop(int *xst_src, int *yst_src, int *wid, int *hgt); - // copy part of the specified bitmap - void Copy(int x, int y, int wid, int hgt, Bmp8 *bmp_dest) const; -}; -} - -#endif // BMP8_H diff --git a/cube/cached_file.cpp b/cube/cached_file.cpp deleted file mode 100644 index a9a5b2e0..00000000 --- a/cube/cached_file.cpp +++ /dev/null @@ -1,150 +0,0 @@ -/********************************************************************** - * File: cached_file.pp - * Description: Implementation of an Cached File Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include -#include -#include -#include "cached_file.h" - -namespace tesseract { - -CachedFile::CachedFile(string file_name) { - file_name_ = file_name; - buff_ = NULL; - buff_pos_ = 0; - buff_size_ = 0; - file_pos_ = 0; - file_size_ = 0; - fp_ = NULL; -} - -CachedFile::~CachedFile() { - if (fp_ != NULL) { - fclose(fp_); - fp_ = NULL; - } - - if (buff_ != NULL) { - delete []buff_; - buff_ = NULL; - } -} - -// free buffers and init vars -bool CachedFile::Open() { - if (fp_ != NULL) { - return true; - } - - fp_ = fopen(file_name_.c_str(), "rb"); - if (fp_ == NULL) { - return false; - } - - // seek to the end - fseek(fp_, 0, SEEK_END); - // get file size - file_size_ = ftell(fp_); - if (file_size_ < 1) { - return false; - } - // rewind again - rewind(fp_); - // alloc memory for buffer - buff_ = new unsigned char[kCacheSize]; - if (buff_ == NULL) { - return false; - } - // init counters - buff_size_ = 0; - buff_pos_ = 0; - file_pos_ = 0; - return true; -} - -// add a new sample -int CachedFile::Read(void *read_buff, int bytes) { - int read_bytes = 0; - unsigned char *buff = (unsigned char *)read_buff; - - // do we need to read beyond the buffer - if ((buff_pos_ + bytes) > buff_size_) { - // copy as much bytes from the current buffer if any - int copy_bytes = buff_size_ - buff_pos_; - - if (copy_bytes > 0) { - memcpy(buff, buff_ + buff_pos_, copy_bytes); - buff += copy_bytes; - bytes -= copy_bytes; - read_bytes += copy_bytes; - } - - // determine how much to read - buff_size_ 
= kCacheSize; - - if ((file_pos_ + buff_size_) > file_size_) { - buff_size_ = static_cast(file_size_ - file_pos_); - } - - // EOF ? - if (buff_size_ <= 0 || bytes > buff_size_) { - return read_bytes; - } - - // read the first chunck - if (fread(buff_, 1, buff_size_, fp_) != buff_size_) { - return read_bytes; - } - - buff_pos_ = 0; - file_pos_ += buff_size_; - } - - memcpy(buff, buff_ + buff_pos_, bytes); - read_bytes += bytes; - buff_pos_ += bytes; - - return read_bytes; -} - -long CachedFile::Size() { - if (fp_ == NULL && Open() == false) { - return 0; - } - - return file_size_; -} - -long CachedFile::Tell() { - if (fp_ == NULL && Open() == false) { - return 0; - } - - return file_pos_ - buff_size_ + buff_pos_; -} - -bool CachedFile::eof() { - if (fp_ == NULL && Open() == false) { - return true; - } - - return (file_pos_ - buff_size_ + buff_pos_) >= file_size_; -} - -} // namespace tesseract diff --git a/cube/cached_file.h b/cube/cached_file.h deleted file mode 100644 index eb671970..00000000 --- a/cube/cached_file.h +++ /dev/null @@ -1,69 +0,0 @@ -/********************************************************************** - * File: cached_file.h - * Description: Declaration of a Cached File class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -#ifndef CACHED_FILE_H -#define CACHED_FILE_H - -// The CachedFile class provides a large-cache read access to a file -// It is mainly designed for loading large word dump files - -#include -#include -#ifdef USE_STD_NAMESPACE -using std::string; -#endif - -namespace tesseract { -class CachedFile { - public: - explicit CachedFile(string file_name); - ~CachedFile(); - - // reads a specified number of bytes to the specified buffer and - // returns the actual number of bytes read - int Read(void *read_buff, int bytes); - // Returns the file size - long Size(); - // returns the current position in the file - long Tell(); - // End of file flag - bool eof(); - - private: - static const unsigned int kCacheSize = 0x8000000; - // file name - string file_name_; - // internal file buffer - unsigned char *buff_; - // file position - long file_pos_; - // file size - long file_size_; - // position of file within buffer - int buff_pos_; - // buffer size - int buff_size_; - // file handle - FILE *fp_; - // Opens the file - bool Open(); -}; -} - -#endif // CACHED_FILE_H diff --git a/cube/char_altlist.cpp b/cube/char_altlist.cpp deleted file mode 100644 index c0e7776e..00000000 --- a/cube/char_altlist.cpp +++ /dev/null @@ -1,115 +0,0 @@ -/********************************************************************** - * File: char_altlist.cpp - * Description: Implementation of a Character Alternate List Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include "char_altlist.h" - -namespace tesseract { - -// The CharSet is not class owned and must exist for -// the life time of this class -CharAltList::CharAltList(const CharSet *char_set, int max_alt) - : AltList(max_alt) { - char_set_ = char_set; - max_alt_ = max_alt; - class_id_alt_ = NULL; - class_id_cost_ = NULL; -} - -CharAltList::~CharAltList() { - if (class_id_alt_ != NULL) { - delete []class_id_alt_; - class_id_alt_ = NULL; - } - - if (class_id_cost_ != NULL) { - delete []class_id_cost_; - class_id_cost_ = NULL; - } -} - -// Insert a new char alternate -bool CharAltList::Insert(int class_id, int cost, void *tag) { - // validate class ID - if (class_id < 0 || class_id >= char_set_->ClassCount()) { - return false; - } - - // allocate buffers if nedded - if (class_id_alt_ == NULL || alt_cost_ == NULL) { - class_id_alt_ = new int[max_alt_]; - alt_cost_ = new int[max_alt_]; - alt_tag_ = new void *[max_alt_]; - - if (class_id_alt_ == NULL || alt_cost_ == NULL || alt_tag_ == NULL) { - return false; - } - - memset(alt_tag_, 0, max_alt_ * sizeof(*alt_tag_)); - } - - if (class_id_cost_ == NULL) { - int class_cnt = char_set_->ClassCount(); - - class_id_cost_ = new int[class_cnt]; - if (class_id_cost_ == NULL) { - return false; - } - - for (int ich = 0; ich < class_cnt; ich++) { - class_id_cost_[ich] = WORST_COST; - } - } - - if (class_id < 0 || class_id >= char_set_->ClassCount()) { - return false; - } - - // insert the alternate - class_id_alt_[alt_cnt_] = class_id; - 
alt_cost_[alt_cnt_] = cost; - alt_tag_[alt_cnt_] = tag; - - alt_cnt_++; - - class_id_cost_[class_id] = cost; - - return true; -} - -// sort the alternate Desc. based on prob -void CharAltList::Sort() { - for (int alt_idx = 0; alt_idx < alt_cnt_; alt_idx++) { - for (int alt = alt_idx + 1; alt < alt_cnt_; alt++) { - if (alt_cost_[alt_idx] > alt_cost_[alt]) { - int temp = class_id_alt_[alt_idx]; - class_id_alt_[alt_idx] = class_id_alt_[alt]; - class_id_alt_[alt] = temp; - - temp = alt_cost_[alt_idx]; - alt_cost_[alt_idx] = alt_cost_[alt]; - alt_cost_[alt] = temp; - - void *tag = alt_tag_[alt_idx]; - alt_tag_[alt_idx] = alt_tag_[alt]; - alt_tag_[alt] = tag; - } - } - } -} -} diff --git a/cube/char_altlist.h b/cube/char_altlist.h deleted file mode 100644 index ef3a083a..00000000 --- a/cube/char_altlist.h +++ /dev/null @@ -1,70 +0,0 @@ -/********************************************************************** - * File: char_altlist.h - * Description: Declaration of a Character Alternate List Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#ifndef CHAR_ALT_LIST_H -#define CHAR_ALT_LIST_H - -// The CharAltList class holds the list of class alternates returned from -// a character classifier. Each alternate represents a class ID. -// It inherits from the AltList class. 
-// The CharAltList owns a CharSet object that maps a class-id to a string. - -#include "altlist.h" -#include "char_set.h" - -namespace tesseract { -class CharAltList : public AltList { - public: - CharAltList(const CharSet *char_set, int max_alt = kMaxCharAlt); - ~CharAltList(); - - // Sort the alternate list based on cost - void Sort(); - // insert a new alternate with the specified class-id, cost and tag - bool Insert(int class_id, int cost, void *tag = NULL); - // returns the cost of a specific class ID - inline int ClassCost(int class_id) const { - if (class_id_cost_ == NULL || - class_id < 0 || - class_id >= char_set_->ClassCount()) { - return WORST_COST; - } - return class_id_cost_[class_id]; - } - // returns the alternate class-id corresponding to an alternate index - inline int Alt(int alt_idx) const { return class_id_alt_[alt_idx]; } - // set the cost of a certain alternate - void SetAltCost(int alt_idx, int cost) { - alt_cost_[alt_idx] = cost; - class_id_cost_[class_id_alt_[alt_idx]] = cost; - } - - private: - // character set object. Passed at construction time - const CharSet *char_set_; - // array of alternate class-ids - int *class_id_alt_; - // array of alternate costs - int *class_id_cost_; - // default max count of alternates - static const int kMaxCharAlt = 256; -}; -} - -#endif // CHAR_ALT_LIST_H diff --git a/cube/char_bigrams.cpp b/cube/char_bigrams.cpp deleted file mode 100644 index b36b1f6c..00000000 --- a/cube/char_bigrams.cpp +++ /dev/null @@ -1,207 +0,0 @@ -/********************************************************************** - * File: char_bigrams.cpp - * Description: Implementation of a Character Bigrams Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include -#include -#include -#include - -#include "char_bigrams.h" -#include "cube_utils.h" -#include "ndminx.h" -#include "cube_const.h" - -namespace tesseract { - -CharBigrams::CharBigrams() { - memset(&bigram_table_, 0, sizeof(bigram_table_)); -} - -CharBigrams::~CharBigrams() { - if (bigram_table_.char_bigram != NULL) { - for (int ch1 = 0; ch1 <= bigram_table_.max_char; ch1++) { - CharBigram *char_bigram = bigram_table_.char_bigram + ch1; - - if (char_bigram->bigram != NULL) { - delete []char_bigram->bigram; - } - } - delete []bigram_table_.char_bigram; - } -} - -CharBigrams *CharBigrams::Create(const string &data_file_path, - const string &lang) { - string file_name; - string str; - - file_name = data_file_path + lang; - file_name += ".cube.bigrams"; - - // load the string into memory - if (!CubeUtils::ReadFileToString(file_name, &str)) { - return NULL; - } - - // construct a new object - CharBigrams *char_bigrams_obj = new CharBigrams(); - if (char_bigrams_obj == NULL) { - fprintf(stderr, "Cube ERROR (CharBigrams::Create): could not create " - "character bigrams object.\n"); - return NULL; - } - CharBigramTable *table = &char_bigrams_obj->bigram_table_; - - table->total_cnt = 0; - table->max_char = -1; - table->char_bigram = NULL; - - // split into lines - vector str_vec; - CubeUtils::SplitStringUsing(str, "\r\n", &str_vec); - - for (int big = 0; big < str_vec.size(); big++) { - char_32 ch1; - char_32 ch2; - int cnt; - if (sscanf(str_vec[big].c_str(), "%d %x %x", 
&cnt, &ch1, &ch2) != 3) { - fprintf(stderr, "Cube ERROR (CharBigrams::Create): invalid format " - "reading line: %s\n", str_vec[big].c_str()); - delete char_bigrams_obj; - return NULL; - } - - // expand the bigram table - if (ch1 > table->max_char) { - CharBigram *char_bigram = new CharBigram[ch1 + 1]; - if (char_bigram == NULL) { - fprintf(stderr, "Cube ERROR (CharBigrams::Create): error allocating " - "additional memory for character bigram table.\n"); - return NULL; - } - - if (table->char_bigram != NULL && table->max_char >= 0) { - memcpy(char_bigram, table->char_bigram, - (table->max_char + 1) * sizeof(*char_bigram)); - - delete []table->char_bigram; - } - table->char_bigram = char_bigram; - - // init - for (int new_big = table->max_char + 1; new_big <= ch1; new_big++) { - table->char_bigram[new_big].total_cnt = 0; - table->char_bigram[new_big].max_char = -1; - table->char_bigram[new_big].bigram = NULL; - } - table->max_char = ch1; - } - - if (ch2 > table->char_bigram[ch1].max_char) { - Bigram *bigram = new Bigram[ch2 + 1]; - if (bigram == NULL) { - fprintf(stderr, "Cube ERROR (CharBigrams::Create): error allocating " - "memory for bigram.\n"); - delete char_bigrams_obj; - return NULL; - } - - if (table->char_bigram[ch1].bigram != NULL && - table->char_bigram[ch1].max_char >= 0) { - memcpy(bigram, table->char_bigram[ch1].bigram, - (table->char_bigram[ch1].max_char + 1) * sizeof(*bigram)); - delete []table->char_bigram[ch1].bigram; - } - table->char_bigram[ch1].bigram = bigram; - - // init - for (int new_big = table->char_bigram[ch1].max_char + 1; - new_big <= ch2; new_big++) { - table->char_bigram[ch1].bigram[new_big].cnt = 0; - } - table->char_bigram[ch1].max_char = ch2; - } - - table->char_bigram[ch1].bigram[ch2].cnt = cnt; - table->char_bigram[ch1].total_cnt += cnt; - table->total_cnt += cnt; - } - - // compute costs (-log probs) - table->worst_cost = static_cast( - -PROB2COST_SCALE * log(0.5 / table->total_cnt)); - for (char_32 ch1 = 0; ch1 <= 
table->max_char; ch1++) { - for (char_32 ch2 = 0; ch2 <= table->char_bigram[ch1].max_char; ch2++) { - int cnt = table->char_bigram[ch1].bigram[ch2].cnt; - table->char_bigram[ch1].bigram[ch2].cost = - static_cast(-PROB2COST_SCALE * - log(MAX(0.5, static_cast(cnt)) / - table->total_cnt)); - } - } - return char_bigrams_obj; -} - -int CharBigrams::PairCost(char_32 ch1, char_32 ch2) const { - if (ch1 > bigram_table_.max_char) { - return bigram_table_.worst_cost; - } - if (ch2 > bigram_table_.char_bigram[ch1].max_char) { - return bigram_table_.worst_cost; - } - return bigram_table_.char_bigram[ch1].bigram[ch2].cost; -} - -int CharBigrams::Cost(const char_32 *char_32_ptr, CharSet *char_set) const { - if (!char_32_ptr || char_32_ptr[0] == 0) { - return bigram_table_.worst_cost; - } - int cost = MeanCostWithSpaces(char_32_ptr); - if (CubeUtils::StrLen(char_32_ptr) >= kMinLengthCaseInvariant && - CubeUtils::IsCaseInvariant(char_32_ptr, char_set)) { - char_32 *lower_32 = CubeUtils::ToLower(char_32_ptr, char_set); - if (lower_32 && lower_32[0] != 0) { - int cost_lower = MeanCostWithSpaces(lower_32); - cost = MIN(cost, cost_lower); - delete [] lower_32; - } - char_32 *upper_32 = CubeUtils::ToUpper(char_32_ptr, char_set); - if (upper_32 && upper_32[0] != 0) { - int cost_upper = MeanCostWithSpaces(upper_32); - cost = MIN(cost, cost_upper); - delete [] upper_32; - } - } - return cost; -} - -int CharBigrams::MeanCostWithSpaces(const char_32 *char_32_ptr) const { - if (!char_32_ptr) - return bigram_table_.worst_cost; - int len = CubeUtils::StrLen(char_32_ptr); - int cost = 0; - int c = 0; - cost = PairCost(' ', char_32_ptr[0]); - for (c = 1; c < len; c++) { - cost += PairCost(char_32_ptr[c - 1], char_32_ptr[c]); - } - cost += PairCost(char_32_ptr[len - 1], ' '); - return static_cast(cost / static_cast(len + 1)); -} -} // namespace tesseract diff --git a/cube/char_bigrams.h b/cube/char_bigrams.h deleted file mode 100644 index 5d819311..00000000 --- a/cube/char_bigrams.h +++ /dev/null 
@@ -1,89 +0,0 @@ -/********************************************************************** - * File: char_bigrams.h - * Description: Declaration of a Character Bigrams Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The CharBigram class represents the interface to the character bigram -// table used by Cube -// A CharBigram object can be constructed from the Char Bigrams file -// Given a sequence of characters, the "Cost" method returns the Char Bigram -// cost of the string according to the table - -#ifndef CHAR_BIGRAMS_H -#define CHAR_BIGRAMS_H - -#include -#include "char_set.h" - -namespace tesseract { - -// structure representing a single bigram value -struct Bigram { - int cnt; - int cost; -}; - -// structure representing the char bigram array of characters -// following a specific character -struct CharBigram { - int total_cnt; - char_32 max_char; - Bigram *bigram; -}; - -// structure representing the whole bigram table -struct CharBigramTable { - int total_cnt; - int worst_cost; - char_32 max_char; - CharBigram *char_bigram; -}; - -class CharBigrams { - public: - CharBigrams(); - ~CharBigrams(); - // Construct the CharBigrams class from a file - static CharBigrams *Create(const string &data_file_path, - const string &lang); - // Top-level function to return the mean character 
bigram cost of a - // sequence of characters. If char_set is not NULL, use - // tesseract functions to return a case-invariant cost. - // This avoids unnecessarily penalizing all-one-case words or - // capitalized words (first-letter upper-case and remaining letters - // lower-case). - int Cost(const char_32 *str, CharSet *char_set) const; - - protected: - // Returns the character bigram cost of two characters. - int PairCost(char_32 ch1, char_32 ch2) const; - // Returns the mean character bigram cost of a sequence of - // characters. Adds a space at the beginning and end to account for - // cost of starting and ending characters. - int MeanCostWithSpaces(const char_32 *char_32_ptr) const; - - private: - // Only words this length or greater qualify for case-invariant character - // bigram cost. - static const int kMinLengthCaseInvariant = 4; - - - CharBigramTable bigram_table_; -}; -} - -#endif // CHAR_BIGRAMS_H diff --git a/cube/char_samp.cpp b/cube/char_samp.cpp deleted file mode 100644 index c3493fa1..00000000 --- a/cube/char_samp.cpp +++ /dev/null @@ -1,669 +0,0 @@ -/********************************************************************** - * File: char_samp.cpp - * Description: Implementation of a Character Bitmap Sample Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -#include -#include -#include "char_samp.h" -#include "cube_utils.h" - -namespace tesseract { - -#define MAX_LINE_LEN 1024 - -CharSamp::CharSamp() - : Bmp8(0, 0) { - left_ = 0; - top_ = 0; - label32_ = NULL; - page_ = -1; -} - -CharSamp::CharSamp(int wid, int hgt) - : Bmp8(wid, hgt) { - left_ = 0; - top_ = 0; - label32_ = NULL; - page_ = -1; -} - -CharSamp::CharSamp(int left, int top, int wid, int hgt) - : Bmp8(wid, hgt) - , left_(left) - , top_(top) { - label32_ = NULL; - page_ = -1; -} - -CharSamp::~CharSamp() { - if (label32_ != NULL) { - delete []label32_; - label32_ = NULL; - } -} - -// returns a UTF-8 version of the string label -string CharSamp::stringLabel() const { - string str = ""; - if (label32_ != NULL) { - string_32 str32(label32_); - CubeUtils::UTF32ToUTF8(str32.c_str(), &str); - } - return str; -} - -// set a the string label using a UTF encoded string -void CharSamp::SetLabel(string str) { - if (label32_ != NULL) { - delete []label32_; - label32_ = NULL; - } - string_32 str32; - CubeUtils::UTF8ToUTF32(str.c_str(), &str32); - SetLabel(reinterpret_cast(str32.c_str())); -} - -// creates a CharSamp object from file -CharSamp *CharSamp::FromCharDumpFile(CachedFile *fp) { - unsigned short left; - unsigned short top; - unsigned short page; - unsigned short first_char; - unsigned short last_char; - unsigned short norm_top; - unsigned short norm_bottom; - unsigned short norm_aspect_ratio; - unsigned int val32; - - char_32 *label32; - - // read and check 32 bit marker - if (fp->Read(&val32, sizeof(val32)) != sizeof(val32)) { - return NULL; - } - if (val32 != 0xabd0fefe) { - return NULL; - } - // read label length, - if (fp->Read(&val32, sizeof(val32)) != sizeof(val32)) { - return NULL; - } - // the label is not null terminated in the file - if (val32 > 0 && val32 < MAX_UINT32) { - label32 = new char_32[val32 + 1]; - if (label32 == NULL) { - return NULL; - } - // read label - if 
(fp->Read(label32, val32 * sizeof(*label32)) != - (val32 * sizeof(*label32))) { - return NULL; - } - // null terminate - label32[val32] = 0; - } else { - label32 = NULL; - } - // read coordinates - if (fp->Read(&page, sizeof(page)) != sizeof(page)) { - return NULL; - } - if (fp->Read(&left, sizeof(left)) != sizeof(left)) { - return NULL; - } - if (fp->Read(&top, sizeof(top)) != sizeof(top)) { - return NULL; - } - if (fp->Read(&first_char, sizeof(first_char)) != sizeof(first_char)) { - return NULL; - } - if (fp->Read(&last_char, sizeof(last_char)) != sizeof(last_char)) { - return NULL; - } - if (fp->Read(&norm_top, sizeof(norm_top)) != sizeof(norm_top)) { - return NULL; - } - if (fp->Read(&norm_bottom, sizeof(norm_bottom)) != sizeof(norm_bottom)) { - return NULL; - } - if (fp->Read(&norm_aspect_ratio, sizeof(norm_aspect_ratio)) != - sizeof(norm_aspect_ratio)) { - return NULL; - } - // create the object - CharSamp *char_samp = new CharSamp(); - if (char_samp == NULL) { - return NULL; - } - // init - char_samp->label32_ = label32; - char_samp->page_ = page; - char_samp->left_ = left; - char_samp->top_ = top; - char_samp->first_char_ = first_char; - char_samp->last_char_ = last_char; - char_samp->norm_top_ = norm_top; - char_samp->norm_bottom_ = norm_bottom; - char_samp->norm_aspect_ratio_ = norm_aspect_ratio; - // load the Bmp8 part - if (char_samp->LoadFromCharDumpFile(fp) == false) { - delete char_samp; - return NULL; - } - return char_samp; -} - -// Load a Char Samp from a dump file -CharSamp *CharSamp::FromCharDumpFile(FILE *fp) { - unsigned short left; - unsigned short top; - unsigned short page; - unsigned short first_char; - unsigned short last_char; - unsigned short norm_top; - unsigned short norm_bottom; - unsigned short norm_aspect_ratio; - unsigned int val32; - char_32 *label32; - - // read and check 32 bit marker - if (fread(&val32, 1, sizeof(val32), fp) != sizeof(val32)) { - return NULL; - } - if (val32 != 0xabd0fefe) { - return NULL; - } - // read label 
length, - if (fread(&val32, 1, sizeof(val32), fp) != sizeof(val32)) { - return NULL; - } - // the label is not null terminated in the file - if (val32 > 0 && val32 < MAX_UINT32) { - label32 = new char_32[val32 + 1]; - if (label32 == NULL) { - return NULL; - } - // read label - if (fread(label32, 1, val32 * sizeof(*label32), fp) != - (val32 * sizeof(*label32))) { - delete [] label32; - return NULL; - } - // null terminate - label32[val32] = 0; - } else { - label32 = NULL; - } - // read coordinates - if (fread(&page, 1, sizeof(page), fp) != sizeof(page) || - fread(&left, 1, sizeof(left), fp) != sizeof(left) || - fread(&top, 1, sizeof(top), fp) != sizeof(top) || - fread(&first_char, 1, sizeof(first_char), fp) != sizeof(first_char) || - fread(&last_char, 1, sizeof(last_char), fp) != sizeof(last_char) || - fread(&norm_top, 1, sizeof(norm_top), fp) != sizeof(norm_top) || - fread(&norm_bottom, 1, sizeof(norm_bottom), fp) != sizeof(norm_bottom) || - fread(&norm_aspect_ratio, 1, sizeof(norm_aspect_ratio), fp) != - sizeof(norm_aspect_ratio)) { - delete [] label32; - return NULL; - } - // create the object - CharSamp *char_samp = new CharSamp(); - if (char_samp == NULL) { - delete [] label32; - return NULL; - } - // init - char_samp->label32_ = label32; - char_samp->page_ = page; - char_samp->left_ = left; - char_samp->top_ = top; - char_samp->first_char_ = first_char; - char_samp->last_char_ = last_char; - char_samp->norm_top_ = norm_top; - char_samp->norm_bottom_ = norm_bottom; - char_samp->norm_aspect_ratio_ = norm_aspect_ratio; - // load the Bmp8 part - if (char_samp->LoadFromCharDumpFile(fp) == false) { - delete char_samp; // It owns label32. 
- return NULL; - } - return char_samp; -} - -// returns a copy of the charsamp that is scaled to the -// specified width and height -CharSamp *CharSamp::Scale(int wid, int hgt, bool isotropic) { - CharSamp *scaled_samp = new CharSamp(wid, hgt); - if (scaled_samp == NULL) { - return NULL; - } - if (scaled_samp->ScaleFrom(this, isotropic) == false) { - delete scaled_samp; - return NULL; - } - scaled_samp->left_ = left_; - scaled_samp->top_ = top_; - scaled_samp->page_ = page_; - scaled_samp->SetLabel(label32_); - scaled_samp->first_char_ = first_char_; - scaled_samp->last_char_ = last_char_; - scaled_samp->norm_top_ = norm_top_; - scaled_samp->norm_bottom_ = norm_bottom_; - scaled_samp->norm_aspect_ratio_ = norm_aspect_ratio_; - return scaled_samp; -} - -// Load a Char Samp from a dump file -CharSamp *CharSamp::FromRawData(int left, int top, int wid, int hgt, - unsigned char *data) { - // create the object - CharSamp *char_samp = new CharSamp(left, top, wid, hgt); - if (char_samp == NULL) { - return NULL; - } - if (char_samp->LoadFromRawData(data) == false) { - delete char_samp; - return NULL; - } - return char_samp; -} - -// Saves the charsamp to a dump file -bool CharSamp::Save2CharDumpFile(FILE *fp) const { - unsigned int val32; - // write and check 32 bit marker - val32 = 0xabd0fefe; - if (fwrite(&val32, 1, sizeof(val32), fp) != sizeof(val32)) { - return false; - } - // write label length - val32 = (label32_ == NULL) ? 
0 : LabelLen(label32_); - if (fwrite(&val32, 1, sizeof(val32), fp) != sizeof(val32)) { - return false; - } - // write label - if (label32_ != NULL) { - if (fwrite(label32_, 1, val32 * sizeof(*label32_), fp) != - (val32 * sizeof(*label32_))) { - return false; - } - } - // write coordinates - if (fwrite(&page_, 1, sizeof(page_), fp) != sizeof(page_)) { - return false; - } - if (fwrite(&left_, 1, sizeof(left_), fp) != sizeof(left_)) { - return false; - } - if (fwrite(&top_, 1, sizeof(top_), fp) != sizeof(top_)) { - return false; - } - if (fwrite(&first_char_, 1, sizeof(first_char_), fp) != - sizeof(first_char_)) { - return false; - } - if (fwrite(&last_char_, 1, sizeof(last_char_), fp) != sizeof(last_char_)) { - return false; - } - if (fwrite(&norm_top_, 1, sizeof(norm_top_), fp) != sizeof(norm_top_)) { - return false; - } - if (fwrite(&norm_bottom_, 1, sizeof(norm_bottom_), fp) != - sizeof(norm_bottom_)) { - return false; - } - if (fwrite(&norm_aspect_ratio_, 1, sizeof(norm_aspect_ratio_), fp) != - sizeof(norm_aspect_ratio_)) { - return false; - } - if (SaveBmp2CharDumpFile(fp) == false) { - return false; - } - return true; -} - -// Crop the char samp such that there are no white spaces on any side. -// The norm_top_ and norm_bottom_ fields are the character top/bottom -// with respect to whatever context the character is being recognized -// in (e.g. word bounding box) normalized to a standard size of -// 255. Here they default to 0 and 255 (word box boundaries), but -// since they are context dependent, they may need to be reset by the -// calling function. 
-CharSamp *CharSamp::Crop() { - // get the dimesions of the cropped img - int cropped_left = 0; - int cropped_top = 0; - int cropped_wid = wid_; - int cropped_hgt = hgt_; - Bmp8::Crop(&cropped_left, &cropped_top, - &cropped_wid, &cropped_hgt); - - if (cropped_wid == 0 || cropped_hgt == 0) { - return NULL; - } - // create the cropped char samp - CharSamp *cropped_samp = new CharSamp(left_ + cropped_left, - top_ + cropped_top, - cropped_wid, cropped_hgt); - cropped_samp->SetLabel(label32_); - cropped_samp->SetFirstChar(first_char_); - cropped_samp->SetLastChar(last_char_); - // the following 3 fields may/should be reset by the calling function - // using context information, i.e., location of character box - // w.r.t. the word bounding box - cropped_samp->SetNormAspectRatio(255 * - cropped_wid / (cropped_wid + cropped_hgt)); - cropped_samp->SetNormTop(0); - cropped_samp->SetNormBottom(255); - - // copy the bitmap to the cropped img - Copy(cropped_left, cropped_top, cropped_wid, cropped_hgt, cropped_samp); - return cropped_samp; -} - -// segment the char samp to connected components -// based on contiguity and vertical pixel density histogram -ConComp **CharSamp::Segment(int *segment_cnt, bool right_2_left, - int max_hist_wnd, int min_con_comp_size) const { - // init - (*segment_cnt) = 0; - int concomp_cnt = 0; - int seg_cnt = 0; - // find the concomps of the image - ConComp **concomp_array = FindConComps(&concomp_cnt, min_con_comp_size); - if (concomp_cnt <= 0 || !concomp_array) { - if (concomp_array) - delete []concomp_array; - return NULL; - } - ConComp **seg_array = NULL; - // segment each concomp further using vertical histogram - for (int concomp = 0; concomp < concomp_cnt; concomp++) { - int concomp_seg_cnt = 0; - // segment the concomp - ConComp **concomp_seg_array = NULL; - ConComp **concomp_alloc_seg = - concomp_array[concomp]->Segment(max_hist_wnd, &concomp_seg_cnt); - // no segments, add the whole concomp - if (concomp_alloc_seg == NULL) { - 
concomp_seg_cnt = 1; - concomp_seg_array = concomp_array + concomp; - } else { - // delete the original concomp, we no longer need it - concomp_seg_array = concomp_alloc_seg; - delete concomp_array[concomp]; - } - // add the resulting segments - for (int seg_idx = 0; seg_idx < concomp_seg_cnt; seg_idx++) { - // too small of a segment: ignore - if (concomp_seg_array[seg_idx]->Width() < 2 && - concomp_seg_array[seg_idx]->Height() < 2) { - delete concomp_seg_array[seg_idx]; - } else { - // add the new segment - // extend the segment array - if ((seg_cnt % kConCompAllocChunk) == 0) { - ConComp **temp_segm_array = - new ConComp *[seg_cnt + kConCompAllocChunk]; - if (temp_segm_array == NULL) { - fprintf(stderr, "Cube ERROR (CharSamp::Segment): could not " - "allocate additional connected components\n"); - delete []concomp_seg_array; - delete []concomp_array; - delete []seg_array; - return NULL; - } - if (seg_cnt > 0) { - memcpy(temp_segm_array, seg_array, seg_cnt * sizeof(*seg_array)); - delete []seg_array; - } - seg_array = temp_segm_array; - } - seg_array[seg_cnt++] = concomp_seg_array[seg_idx]; - } - } // segment - if (concomp_alloc_seg != NULL) { - delete []concomp_alloc_seg; - } - } // concomp - delete []concomp_array; - - // sort the concomps from Left2Right or Right2Left, based on the reading order - if (seg_cnt > 0 && seg_array != NULL) { - qsort(seg_array, seg_cnt, sizeof(*seg_array), right_2_left ? 
- ConComp::Right2LeftComparer : ConComp::Left2RightComparer); - } - (*segment_cnt) = seg_cnt; - return seg_array; -} - -// builds a char samp from a set of connected components -CharSamp *CharSamp::FromConComps(ConComp **concomp_array, int strt_concomp, - int seg_flags_size, int *seg_flags, - bool *left_most, bool *right_most, - int word_hgt) { - int concomp; - int end_concomp; - int concomp_cnt = 0; - end_concomp = strt_concomp + seg_flags_size; - // determine ID range - bool once = false; - int min_id = -1; - int max_id = -1; - for (concomp = strt_concomp; concomp < end_concomp; concomp++) { - if (!seg_flags || seg_flags[concomp - strt_concomp] != 0) { - if (!once) { - min_id = concomp_array[concomp]->ID(); - max_id = concomp_array[concomp]->ID(); - once = true; - } else { - UpdateRange(concomp_array[concomp]->ID(), &min_id, &max_id); - } - concomp_cnt++; - } - } - if (concomp_cnt < 1 || !once || min_id == -1 || max_id == -1) { - return NULL; - } - // alloc memo for computing leftmost and right most attributes - int id_cnt = max_id - min_id + 1; - bool *id_exist = new bool[id_cnt]; - bool *left_most_exist = new bool[id_cnt]; - bool *right_most_exist = new bool[id_cnt]; - if (!id_exist || !left_most_exist || !right_most_exist) - return NULL; - memset(id_exist, 0, id_cnt * sizeof(*id_exist)); - memset(left_most_exist, 0, id_cnt * sizeof(*left_most_exist)); - memset(right_most_exist, 0, id_cnt * sizeof(*right_most_exist)); - // find the dimensions of the charsamp - once = false; - int left = -1; - int right = -1; - int top = -1; - int bottom = -1; - int unq_ids = 0; - int unq_left_most = 0; - int unq_right_most = 0; - for (concomp = strt_concomp; concomp < end_concomp; concomp++) { - if (!seg_flags || seg_flags[concomp - strt_concomp] != 0) { - if (!once) { - left = concomp_array[concomp]->Left(); - right = concomp_array[concomp]->Right(); - top = concomp_array[concomp]->Top(); - bottom = concomp_array[concomp]->Bottom(); - once = true; - } else { - 
UpdateRange(concomp_array[concomp]->Left(), - concomp_array[concomp]->Right(), &left, &right); - UpdateRange(concomp_array[concomp]->Top(), - concomp_array[concomp]->Bottom(), &top, &bottom); - } - // count unq ids, unq left most and right mosts ids - int concomp_id = concomp_array[concomp]->ID() - min_id; - if (!id_exist[concomp_id]) { - id_exist[concomp_id] = true; - unq_ids++; - } - if (concomp_array[concomp]->LeftMost()) { - if (left_most_exist[concomp_id] == false) { - left_most_exist[concomp_id] = true; - unq_left_most++; - } - } - if (concomp_array[concomp]->RightMost()) { - if (right_most_exist[concomp_id] == false) { - right_most_exist[concomp_id] = true; - unq_right_most++; - } - } - } - } - delete []id_exist; - delete []left_most_exist; - delete []right_most_exist; - if (!once || left == -1 || top == -1 || right == -1 || bottom == -1) { - return NULL; - } - (*left_most) = (unq_left_most >= unq_ids); - (*right_most) = (unq_right_most >= unq_ids); - // create the char sample object - CharSamp *samp = new CharSamp(left, top, right - left + 1, bottom - top + 1); - if (!samp) { - return NULL; - } - - // set the foreground pixels - for (concomp = strt_concomp; concomp < end_concomp; concomp++) { - if (!seg_flags || seg_flags[concomp - strt_concomp] != 0) { - ConCompPt *pt_ptr = concomp_array[concomp]->Head(); - while (pt_ptr) { - samp->line_buff_[pt_ptr->y() - top][pt_ptr->x() - left] = 0; - pt_ptr = pt_ptr->Next(); - } - } - } - return samp; -} - -// clones the object -CharSamp *CharSamp::Clone() const { - // create the cropped char samp - CharSamp *samp = new CharSamp(left_, top_, wid_, hgt_); - samp->SetLabel(label32_); - samp->SetFirstChar(first_char_); - samp->SetLastChar(last_char_); - samp->SetNormTop(norm_top_); - samp->SetNormBottom(norm_bottom_); - samp->SetNormAspectRatio(norm_aspect_ratio_); - // copy the bitmap to the cropped img - Copy(0, 0, wid_, hgt_, samp); - return samp; -} - -// Load a Char Samp from a dump file -CharSamp 
*CharSamp::FromCharDumpFile(unsigned char **raw_data_ptr) { - unsigned int val32; - char_32 *label32; - unsigned char *raw_data = *raw_data_ptr; - - // read and check 32 bit marker - memcpy(&val32, raw_data, sizeof(val32)); - raw_data += sizeof(val32); - if (val32 != 0xabd0fefe) { - return NULL; - } - // read label length, - memcpy(&val32, raw_data, sizeof(val32)); - raw_data += sizeof(val32); - // the label is not null terminated in the file - if (val32 > 0 && val32 < MAX_UINT32) { - label32 = new char_32[val32 + 1]; - if (label32 == NULL) { - return NULL; - } - // read label - memcpy(label32, raw_data, val32 * sizeof(*label32)); - raw_data += (val32 * sizeof(*label32)); - // null terminate - label32[val32] = 0; - } else { - label32 = NULL; - } - - // create the object - CharSamp *char_samp = new CharSamp(); - if (char_samp == NULL) { - return NULL; - } - - // read coordinates - char_samp->label32_ = label32; - memcpy(&char_samp->page_, raw_data, sizeof(char_samp->page_)); - raw_data += sizeof(char_samp->page_); - memcpy(&char_samp->left_, raw_data, sizeof(char_samp->left_)); - raw_data += sizeof(char_samp->left_); - memcpy(&char_samp->top_, raw_data, sizeof(char_samp->top_)); - raw_data += sizeof(char_samp->top_); - memcpy(&char_samp->first_char_, raw_data, sizeof(char_samp->first_char_)); - raw_data += sizeof(char_samp->first_char_); - memcpy(&char_samp->last_char_, raw_data, sizeof(char_samp->last_char_)); - raw_data += sizeof(char_samp->last_char_); - memcpy(&char_samp->norm_top_, raw_data, sizeof(char_samp->norm_top_)); - raw_data += sizeof(char_samp->norm_top_); - memcpy(&char_samp->norm_bottom_, raw_data, sizeof(char_samp->norm_bottom_)); - raw_data += sizeof(char_samp->norm_bottom_); - memcpy(&char_samp->norm_aspect_ratio_, raw_data, - sizeof(char_samp->norm_aspect_ratio_)); - raw_data += sizeof(char_samp->norm_aspect_ratio_); - - // load the Bmp8 part - if (char_samp->LoadFromCharDumpFile(&raw_data) == false) { - delete char_samp; - return NULL; - } - - 
(*raw_data_ptr) = raw_data; - return char_samp; -} - -// computes the features corresponding to the char sample -bool CharSamp::ComputeFeatures(int conv_grid_size, float *features) { - // Create a scaled BMP - CharSamp *scaled_bmp = Scale(conv_grid_size, conv_grid_size); - if (!scaled_bmp) { - return false; - } - // prepare input - unsigned char *buff = scaled_bmp->RawData(); - // bitmap features - int input; - int bmp_size = conv_grid_size * conv_grid_size; - for (input = 0; input < bmp_size; input++) { - features[input] = 255.0f - (1.0f * buff[input]); - } - // word context features - features[input++] = FirstChar(); - features[input++] = LastChar(); - features[input++] = NormTop(); - features[input++] = NormBottom(); - features[input++] = NormAspectRatio(); - delete scaled_bmp; - return true; -} -} // namespace tesseract diff --git a/cube/char_samp.h b/cube/char_samp.h deleted file mode 100644 index a3c3063b..00000000 --- a/cube/char_samp.h +++ /dev/null @@ -1,166 +0,0 @@ -/********************************************************************** - * File: char_samp.h - * Description: Declaration of a Character Bitmap Sample Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -// The CharSamp inherits the Bmp8 class that represents images of -// words, characters and segments throughout Cube -// CharSamp adds more data members to hold the physical location of the image -// in a page, page number in a book if available. -// It also holds the label (GT) of the image that might correspond to a single -// character or a word -// It also provides methods for segmenting, scaling and cropping of the sample - -#ifndef CHAR_SAMP_H -#define CHAR_SAMP_H - -#include -#include -#include -#include "bmp_8.h" -#include "string_32.h" - -namespace tesseract { - -class CharSamp : public Bmp8 { - public: - CharSamp(); - CharSamp(int wid, int hgt); - CharSamp(int left, int top, int wid, int hgt); - ~CharSamp(); - // accessor methods - unsigned short Left() const { return left_; } - unsigned short Right() const { return left_ + wid_; } - unsigned short Top() const { return top_; } - unsigned short Bottom() const { return top_ + hgt_; } - unsigned short Page() const { return page_; } - unsigned short NormTop() const { return norm_top_; } - unsigned short NormBottom() const { return norm_bottom_; } - unsigned short NormAspectRatio() const { return norm_aspect_ratio_; } - unsigned short FirstChar() const { return first_char_; } - unsigned short LastChar() const { return last_char_; } - char_32 Label() const { - if (label32_ == NULL || LabelLen() != 1) { - return 0; - } - return label32_[0]; - } - char_32 * StrLabel() const { return label32_; } - string stringLabel() const; - - void SetLeft(unsigned short left) { left_ = left; } - void SetTop(unsigned short top) { top_ = top; } - void SetPage(unsigned short page) { page_ = page; } - void SetLabel(char_32 label) { - if (label32_ != NULL) { - delete []label32_; - } - label32_ = new char_32[2]; - if (label32_ != NULL) { - label32_[0] = label; - label32_[1] = 0; - } - } - void SetLabel(const char_32 *label32) { - if (label32_ != NULL) { 
- delete []label32_; - label32_ = NULL; - } - if (label32 != NULL) { - // remove any byte order marks if any - if (label32[0] == 0xfeff) { - label32++; - } - int len = LabelLen(label32); - label32_ = new char_32[len + 1]; - if (label32_ != NULL) { - memcpy(label32_, label32, len * sizeof(*label32)); - label32_[len] = 0; - } - } - } - void SetLabel(string str); - void SetNormTop(unsigned short norm_top) { norm_top_ = norm_top; } - void SetNormBottom(unsigned short norm_bottom) { - norm_bottom_ = norm_bottom; - } - void SetNormAspectRatio(unsigned short norm_aspect_ratio) { - norm_aspect_ratio_ = norm_aspect_ratio; - } - void SetFirstChar(unsigned short first_char) { - first_char_ = first_char; - } - void SetLastChar(unsigned short last_char) { - last_char_ = last_char; - } - - // Saves the charsamp to a dump file - bool Save2CharDumpFile(FILE *fp) const; - // Crops the underlying image and returns a new CharSamp with the - // same character information but new dimensions. Warning: does not - // necessarily set the normalized top and bottom correctly since - // those depend on its location within the word (or CubeSearchObject). 
- CharSamp *Crop(); - // Computes the connected components of the char sample - ConComp **Segment(int *seg_cnt, bool right_2_left, int max_hist_wnd, - int min_con_comp_size) const; - // returns a copy of the charsamp that is scaled to the - // specified width and height - CharSamp *Scale(int wid, int hgt, bool isotropic = true); - // returns a Clone of the charsample - CharSamp *Clone() const; - // computes the features corresponding to the char sample - bool ComputeFeatures(int conv_grid_size, float *features); - // Load a Char Samp from a dump file - static CharSamp *FromCharDumpFile(CachedFile *fp); - static CharSamp *FromCharDumpFile(FILE *fp); - static CharSamp *FromCharDumpFile(unsigned char **raw_data); - static CharSamp *FromRawData(int left, int top, int wid, int hgt, - unsigned char *data); - static CharSamp *FromConComps(ConComp **concomp_array, - int strt_concomp, int seg_flags_size, - int *seg_flags, bool *left_most, - bool *right_most, int word_hgt); - static int AuxFeatureCnt() { return (5); } - // Return the length of the label string - int LabelLen() const { return LabelLen(label32_); } - static int LabelLen(const char_32 *label32) { - if (label32 == NULL) { - return 0; - } - int len = 0; - while (label32[++len] != 0); - return len; - } - private: - char_32 * label32_; - unsigned short page_; - unsigned short left_; - unsigned short top_; - // top of sample normalized to a word height of 255 - unsigned short norm_top_; - // bottom of sample normalized to a word height of 255 - unsigned short norm_bottom_; - // 255 * ratio of character width to (width + height) - unsigned short norm_aspect_ratio_; - unsigned short first_char_; - unsigned short last_char_; -}; - -} - -#endif // CHAR_SAMP_H diff --git a/cube/char_samp_enum.cpp b/cube/char_samp_enum.cpp deleted file mode 100644 index 46d9b209..00000000 --- a/cube/char_samp_enum.cpp +++ /dev/null @@ -1,30 +0,0 @@ -/********************************************************************** - * File: 
char_samp_enum.cpp - * Description: Implementation of a Character Sample Enumerator Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include "char_samp_enum.h" - -namespace tesseract { - -CharSampEnum::CharSampEnum() { -} - -CharSampEnum::~CharSampEnum() { -} - -} // namespace ocrlib diff --git a/cube/char_samp_enum.h b/cube/char_samp_enum.h deleted file mode 100644 index 93768e0f..00000000 --- a/cube/char_samp_enum.h +++ /dev/null @@ -1,38 +0,0 @@ -/********************************************************************** - * File: char_samp_enum.h - * Description: Declaration of a Character Sample Enumerator Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -// The CharSampEnum class provides the base class for CharSamp class -// Enumerators. This is typically used to implement dump file readers - -#ifndef CHARSAMP_ENUM_H -#define CHARSAMP_ENUM_H - -#include "char_samp.h" - -namespace tesseract { - -class CharSampEnum { - public: - CharSampEnum(); - virtual ~CharSampEnum(); - virtual bool EnumCharSamp(CharSamp *char_samp, float progress) = 0; -}; -} - -#endif // CHARSAMP_ENUM_H diff --git a/cube/char_samp_set.cpp b/cube/char_samp_set.cpp deleted file mode 100644 index 2a495095..00000000 --- a/cube/char_samp_set.cpp +++ /dev/null @@ -1,182 +0,0 @@ -/********************************************************************** - * File: char_samp_enum.cpp - * Description: Implementation of a Character Sample Set Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -#include -#include -#include "char_samp_set.h" -#include "cached_file.h" - -namespace tesseract { - -CharSampSet::CharSampSet() { - cnt_ = 0; - samp_buff_ = NULL; - own_samples_ = false; -} - -CharSampSet::~CharSampSet() { - Cleanup(); -} - -// free buffers and init vars -void CharSampSet::Cleanup() { - if (samp_buff_ != NULL) { - // only free samples if owned by class - if (own_samples_ == true) { - for (int samp_idx = 0; samp_idx < cnt_; samp_idx++) { - if (samp_buff_[samp_idx] != NULL) { - delete samp_buff_[samp_idx]; - } - } - } - delete []samp_buff_; - } - cnt_ = 0; - samp_buff_ = NULL; -} - -// add a new sample -bool CharSampSet::Add(CharSamp *char_samp) { - if ((cnt_ % SAMP_ALLOC_BLOCK) == 0) { - // create an extended buffer - CharSamp **new_samp_buff = - reinterpret_cast(new CharSamp *[cnt_ + SAMP_ALLOC_BLOCK]); - if (new_samp_buff == NULL) { - return false; - } - // copy old contents - if (cnt_ > 0) { - memcpy(new_samp_buff, samp_buff_, cnt_ * sizeof(*samp_buff_)); - delete []samp_buff_; - } - samp_buff_ = new_samp_buff; - } - samp_buff_[cnt_++] = char_samp; - return true; -} - -// load char samples from file -bool CharSampSet::LoadCharSamples(FILE *fp) { - // free existing - Cleanup(); - // samples are created here and owned by the class - own_samples_ = true; - // start loading char samples - while (feof(fp) == 0) { - CharSamp *new_samp = CharSamp::FromCharDumpFile(fp); - if (new_samp != NULL) { - if (Add(new_samp) == false) { - return false; - } - } - } - return true; -} - -// creates a CharSampSet object from file -CharSampSet * CharSampSet::FromCharDumpFile(string file_name) { - FILE *fp; - unsigned int val32; - // open the file - fp = fopen(file_name.c_str(), "rb"); - if (fp == NULL) { - return NULL; - } - // read and verify marker - if (fread(&val32, 1, sizeof(val32), fp) != sizeof(val32)) { - fclose(fp); - return NULL; - } - if (val32 != 0xfefeabd0) { - fclose(fp); - 
return NULL; - } - // create an object - CharSampSet *samp_set = new CharSampSet(); - if (samp_set == NULL) { - fclose(fp); - return NULL; - } - if (samp_set->LoadCharSamples(fp) == false) { - delete samp_set; - samp_set = NULL; - } - fclose(fp); - return samp_set; -} - -// Create a new Char Dump file -FILE *CharSampSet::CreateCharDumpFile(string file_name) { - FILE *fp; - unsigned int val32; - // create the file - fp = fopen(file_name.c_str(), "wb"); - if (!fp) { - return NULL; - } - // read and verify marker - val32 = 0xfefeabd0; - if (fwrite(&val32, 1, sizeof(val32), fp) != sizeof(val32)) { - fclose(fp); - return NULL; - } - return fp; -} - -// Enumerate the Samples in the set one-by-one calling the enumertor's - // EnumCharSamp method for each sample -bool CharSampSet::EnumSamples(string file_name, CharSampEnum *enum_obj) { - CachedFile *fp_in; - unsigned int val32; - long i64_size, - i64_pos; - // open the file - fp_in = new CachedFile(file_name); - if (fp_in == NULL) { - return false; - } - i64_size = fp_in->Size(); - if (i64_size < 1) { - return false; - } - // read and verify marker - if (fp_in->Read(&val32, sizeof(val32)) != sizeof(val32)) { - return false; - } - if (val32 != 0xfefeabd0) { - return false; - } - // start loading char samples - while (fp_in->eof() == false) { - CharSamp *new_samp = CharSamp::FromCharDumpFile(fp_in); - i64_pos = fp_in->Tell(); - if (new_samp != NULL) { - bool ret_flag = (enum_obj)->EnumCharSamp(new_samp, - (100.0f * i64_pos / i64_size)); - delete new_samp; - if (ret_flag == false) { - break; - } - } - } - delete fp_in; - return true; -} - -} // namespace ocrlib diff --git a/cube/char_samp_set.h b/cube/char_samp_set.h deleted file mode 100644 index d5242825..00000000 --- a/cube/char_samp_set.h +++ /dev/null @@ -1,73 +0,0 @@ -/********************************************************************** - * File: char_samp_set.h - * Description: Declaration of a Character Sample Set Class - * Author: Ahmad Abdulkader - * Created: 2007 
- * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The CharSampSet set encapsulates a set of CharSet objects typically -// but not necessarily loaded from a file -// It provides methods to load samples from File, Create a new file and -// Add new char samples to the set - -#ifndef CHAR_SAMP_SET_H -#define CHAR_SAMP_SET_H - -#include -#include -#include -#include "char_samp.h" -#include "char_samp_enum.h" -#include "char_set.h" - -namespace tesseract { - -// chunks of samp pointers to allocate -#define SAMP_ALLOC_BLOCK 10000 - -class CharSampSet { - public: - CharSampSet(); - ~CharSampSet(); - // return sample count - int SampleCount() const { return cnt_; } - // returns samples buffer - CharSamp ** Samples() const { return samp_buff_; } - // Create a CharSampSet set object from a file - static CharSampSet *FromCharDumpFile(string file_name); - // Enumerate the Samples in the set one-by-one calling the enumertor's - // EnumCharSamp method for each sample - static bool EnumSamples(string file_name, CharSampEnum *enumerator); - // Create a new Char Dump file - static FILE *CreateCharDumpFile(string file_name); - // Add a new sample to the set - bool Add(CharSamp *char_samp); - - private: - // sample count - int cnt_; - // the char samp array - CharSamp **samp_buff_; - // Are the samples owned by the set or not. 
- // Determines whether we should cleanup in the end - bool own_samples_; - // Cleanup - void Cleanup(); - // Load character samples from a file - bool LoadCharSamples(FILE *fp); -}; -} - -#endif // CHAR_SAMP_SET_H diff --git a/cube/char_set.cpp b/cube/char_set.cpp deleted file mode 100644 index 1414d640..00000000 --- a/cube/char_set.cpp +++ /dev/null @@ -1,186 +0,0 @@ -/********************************************************************** - * File: char_samp_enum.cpp - * Description: Implementation of a Character Set Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include - -#include "char_set.h" -#include "cube_utils.h" -#include "tessdatamanager.h" - -namespace tesseract { - -CharSet::CharSet() { - class_cnt_ = 0; - class_strings_ = NULL; - unicharset_map_ = NULL; - init_ = false; - - // init hash table - memset(hash_bin_size_, 0, sizeof(hash_bin_size_)); -} - -CharSet::~CharSet() { - if (class_strings_ != NULL) { - for (int cls = 0; cls < class_cnt_; cls++) { - if (class_strings_[cls] != NULL) { - delete class_strings_[cls]; - } - } - delete []class_strings_; - class_strings_ = NULL; - } - delete []unicharset_map_; -} - -// Creates CharSet object by reading the unicharset from the -// TessDatamanager, and mapping Cube's unicharset to Tesseract's if -// they differ. 
-CharSet *CharSet::Create(TessdataManager *tessdata_manager, - UNICHARSET *tess_unicharset) { - CharSet *char_set = new CharSet(); - if (char_set == NULL) { - return NULL; - } - - // First look for Cube's unicharset; if not there, use tesseract's - bool cube_unicharset_exists; - if (!(cube_unicharset_exists = - tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET)) && - !tessdata_manager->SeekToStart(TESSDATA_UNICHARSET)) { - fprintf(stderr, "Cube ERROR (CharSet::Create): could not find " - "either cube or tesseract unicharset\n"); - return NULL; - } - FILE *charset_fp = tessdata_manager->GetDataFilePtr(); - if (!charset_fp) { - fprintf(stderr, "Cube ERROR (CharSet::Create): could not load " - "a unicharset\n"); - return NULL; - } - - // If we found a cube unicharset separate from tesseract's, load it and - // map its unichars to tesseract's; if only one unicharset exists, - // just load it. - bool loaded; - if (cube_unicharset_exists) { - char_set->cube_unicharset_.load_from_file(charset_fp); - loaded = tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET); - loaded = loaded && char_set->LoadSupportedCharList( - tessdata_manager->GetDataFilePtr(), tess_unicharset); - char_set->unicharset_ = &char_set->cube_unicharset_; - } else { - loaded = char_set->LoadSupportedCharList(charset_fp, NULL); - char_set->unicharset_ = tess_unicharset; - } - if (!loaded) { - delete char_set; - return NULL; - } - - char_set->init_ = true; - return char_set; -} - -// Load the list of supported chars from the given data file pointer. 
-bool CharSet::LoadSupportedCharList(FILE *fp, UNICHARSET *tess_unicharset) { - if (init_) - return true; - - char str_line[256]; - // init hash table - memset(hash_bin_size_, 0, sizeof(hash_bin_size_)); - // read the char count - if (fgets(str_line, sizeof(str_line), fp) == NULL) { - fprintf(stderr, "Cube ERROR (CharSet::InitMemory): could not " - "read char count.\n"); - return false; - } - class_cnt_ = atoi(str_line); - if (class_cnt_ < 2) { - fprintf(stderr, "Cube ERROR (CharSet::InitMemory): invalid " - "class count: %d\n", class_cnt_); - return false; - } - // memory for class strings - class_strings_ = new string_32*[class_cnt_]; - if (class_strings_ == NULL) { - fprintf(stderr, "Cube ERROR (CharSet::InitMemory): could not " - "allocate memory for class strings.\n"); - return false; - } - // memory for unicharset map - if (tess_unicharset) { - unicharset_map_ = new int[class_cnt_]; - if (unicharset_map_ == NULL) { - fprintf(stderr, "Cube ERROR (CharSet::InitMemory): could not " - "allocate memory for unicharset map.\n"); - return false; - } - } - - // Read in character strings and add to hash table - for (int class_id = 0; class_id < class_cnt_; class_id++) { - // Read the class string - if (fgets(str_line, sizeof(str_line), fp) == NULL) { - fprintf(stderr, "Cube ERROR (CharSet::ReadAndHashStrings): " - "could not read class string with class_id=%d.\n", class_id); - return false; - } - // Terminate at space if any - char *p = strchr(str_line, ' '); - if (p != NULL) - *p = '\0'; - // Convert to UTF32 and store - string_32 str32; - // Convert NULL to a space - if (strcmp(str_line, "NULL") == 0) { - strcpy(str_line, " "); - } - CubeUtils::UTF8ToUTF32(str_line, &str32); - class_strings_[class_id] = new string_32(str32); - if (class_strings_[class_id] == NULL) { - fprintf(stderr, "Cube ERROR (CharSet::ReadAndHashStrings): could not " - "allocate memory for class string with class_id=%d.\n", class_id); - return false; - } - - // Add to hash-table - int hash_val = 
Hash(reinterpret_cast(str32.c_str())); - if (hash_bin_size_[hash_val] >= kMaxHashSize) { - fprintf(stderr, "Cube ERROR (CharSet::LoadSupportedCharList): hash " - "table is full.\n"); - return false; - } - hash_bins_[hash_val][hash_bin_size_[hash_val]++] = class_id; - - if (tess_unicharset != NULL) { - // Add class id to unicharset map - UNICHAR_ID tess_id = tess_unicharset->unichar_to_id(str_line); - if (tess_id == INVALID_UNICHAR_ID) { - tess_unicharset->unichar_insert(str_line); - tess_id = tess_unicharset->unichar_to_id(str_line); - } - ASSERT_HOST(tess_id != INVALID_UNICHAR_ID); - unicharset_map_[class_id] = tess_id; - } - } - return true; -} - -} // tesseract diff --git a/cube/char_set.h b/cube/char_set.h deleted file mode 100644 index 12aea82d..00000000 --- a/cube/char_set.h +++ /dev/null @@ -1,174 +0,0 @@ -/********************************************************************** - * File: char_samp_enum.h - * Description: Declaration of a Character Set Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The CharSet class encapsulates the list of 32-bit strings/characters that -// Cube supports for a specific language. 
The char set is loaded from the -// .unicharset file corresponding to a specific language -// Each string has a corresponding int class-id that gets used throughout Cube -// The class provides pass back and forth conversion between the class-id -// and its corresponding 32-bit string. This is done using a hash table that -// maps the string to the class id. - -#ifndef CHAR_SET_H -#define CHAR_SET_H - -#include -#include -#include - -#include "string_32.h" -#include "tessdatamanager.h" -#include "unicharset.h" -#include "cube_const.h" - -namespace tesseract { - -class CharSet { - public: - CharSet(); - ~CharSet(); - - // Returns true if Cube is sharing Tesseract's unicharset. - inline bool SharedUnicharset() { return (unicharset_map_ == NULL); } - - // Returns the class id corresponding to a 32-bit string. Returns -1 - // if the string is not supported. This is done by hashing the - // string and then looking up the string in the hash-bin if there - // are collisions. - inline int ClassID(const char_32 *str) const { - int hash_val = Hash(str); - if (hash_bin_size_[hash_val] == 0) - return -1; - for (int bin = 0; bin < hash_bin_size_[hash_val]; bin++) { - if (class_strings_[hash_bins_[hash_val][bin]]->compare(str) == 0) - return hash_bins_[hash_val][bin]; - } - return -1; - } - // Same as above but using a 32-bit char instead of a string - inline int ClassID(char_32 ch) const { - int hash_val = Hash(ch); - if (hash_bin_size_[hash_val] == 0) - return -1; - for (int bin = 0; bin < hash_bin_size_[hash_val]; bin++) { - if ((*class_strings_[hash_bins_[hash_val][bin]])[0] == ch && - class_strings_[hash_bins_[hash_val][bin]]->length() == 1) { - return hash_bins_[hash_val][bin]; - } - } - return -1; - } - // Retrieve the unicharid in Tesseract's unicharset corresponding - // to a 32-bit string. When Tesseract and Cube share the same - // unicharset, this will just be the class id. 
- inline int UnicharID(const char_32 *str) const { - int class_id = ClassID(str); - if (class_id == INVALID_UNICHAR_ID) - return INVALID_UNICHAR_ID; - int unichar_id; - if (unicharset_map_) - unichar_id = unicharset_map_[class_id]; - else - unichar_id = class_id; - return unichar_id; - } - // Same as above but using a 32-bit char instead of a string - inline int UnicharID(char_32 ch) const { - int class_id = ClassID(ch); - if (class_id == INVALID_UNICHAR_ID) - return INVALID_UNICHAR_ID; - int unichar_id; - if (unicharset_map_) - unichar_id = unicharset_map_[class_id]; - else - unichar_id = class_id; - return unichar_id; - } - // Returns the 32-bit string corresponding to a class id - inline const char_32 * ClassString(int class_id) const { - if (class_id < 0 || class_id >= class_cnt_) { - return NULL; - } - return reinterpret_cast(class_strings_[class_id]->c_str()); - } - // Returns the count of supported strings - inline int ClassCount() const { return class_cnt_; } - - // Creates CharSet object by reading the unicharset from the - // TessDatamanager, and mapping Cube's unicharset to Tesseract's if - // they differ. - static CharSet *Create(TessdataManager *tessdata_manager, - UNICHARSET *tess_unicharset); - - // Return the UNICHARSET cube is using for recognition internally -- - // ClassId() returns unichar_id's in this unicharset. - UNICHARSET *InternalUnicharset() { return unicharset_; } - - private: - // Hash table configuration params. Determined emperically on - // the supported languages so far (Eng, Ara, Hin). 
Might need to be - // tuned for speed when more languages are supported - static const int kHashBins = 3001; - static const int kMaxHashSize = 16; - - // Using djb2 hashing function to hash a 32-bit string - // introduced in http://www.cse.yorku.ca/~oz/hash.html - static inline int Hash(const char_32 *str) { - unsigned long hash = 5381; - int c; - while ((c = *str++)) - hash = ((hash << 5) + hash) + c; - return (hash%kHashBins); - } - // Same as above but for a single char - static inline int Hash(char_32 ch) { - char_32 b[2]; - b[0] = ch; - b[1] = 0; - return Hash(b); - } - - // Load the list of supported chars from the given data file - // pointer. If tess_unicharset is non-NULL, mapping each Cube class - // id to a tesseract unicharid. - bool LoadSupportedCharList(FILE *fp, UNICHARSET *tess_unicharset); - - // class count - int class_cnt_; - // hash-bin sizes array - int hash_bin_size_[kHashBins]; - // hash bins - int hash_bins_[kHashBins][kMaxHashSize]; - // supported strings array - string_32 **class_strings_; - // map from class id to secondary (tesseract's) unicharset's ids - int *unicharset_map_; - // A unicharset which is filled in with a Tesseract-style UNICHARSET for - // cube's data if our unicharset is different from tesseract's. - UNICHARSET cube_unicharset_; - // This points to either the tess_unicharset we're passed or cube_unicharset_, - // depending upon whether we just have one unicharset or one for each - // tesseract and cube, respectively. 
- UNICHARSET *unicharset_; - // has the char set been initialized flag - bool init_; -}; -} - -#endif // CHAR_SET_H diff --git a/cube/classifier_base.h b/cube/classifier_base.h deleted file mode 100644 index 8c2b1bbf..00000000 --- a/cube/classifier_base.h +++ /dev/null @@ -1,100 +0,0 @@ -/********************************************************************** - * File: classifier_base.h - * Description: Declaration of the Base Character Classifier - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The CharClassifier class is the abstract class for any character/grapheme -// classifier. 
- -#ifndef CHAR_CLASSIFIER_BASE_H -#define CHAR_CLASSIFIER_BASE_H - -#include -#include "char_samp.h" -#include "char_altlist.h" -#include "char_set.h" -#include "feature_base.h" -#include "lang_model.h" -#include "tuning_params.h" - -namespace tesseract { -class CharClassifier { - public: - CharClassifier(CharSet *char_set, TuningParams *params, - FeatureBase *feat_extract) { - char_set_ = char_set; - params_ = params; - feat_extract_ = feat_extract; - fold_sets_ = NULL; - fold_set_cnt_ = 0; - fold_set_len_ = NULL; - init_ = false; - case_sensitive_ = true; - } - - virtual ~CharClassifier() { - if (fold_sets_ != NULL) { - for (int fold_set = 0; fold_set < fold_set_cnt_; fold_set++) { - if (fold_sets_[fold_set] != NULL) { - delete []fold_sets_[fold_set]; - } - } - delete []fold_sets_; - fold_sets_ = NULL; - } - if (fold_set_len_ != NULL) { - delete []fold_set_len_; - fold_set_len_ = NULL; - } - if (feat_extract_ != NULL) { - delete feat_extract_; - feat_extract_ = NULL; - } - } - - // pure virtual functions that need to be implemented by any inheriting class - virtual CharAltList * Classify(CharSamp *char_samp) = 0; - virtual int CharCost(CharSamp *char_samp) = 0; - virtual bool Train(CharSamp *char_samp, int ClassID) = 0; - virtual bool SetLearnParam(char *var_name, float val) = 0; - virtual bool Init(const string &data_file_path, const string &lang, - LangModel *lang_mod) = 0; - - // accessors - FeatureBase *FeatureExtractor() {return feat_extract_;} - inline bool CaseSensitive() const { return case_sensitive_; } - inline void SetCaseSensitive(bool case_sensitive) { - case_sensitive_ = case_sensitive; - } - - protected: - virtual void Fold() = 0; - virtual bool LoadFoldingSets(const string &data_file_path, - const string &lang, - LangModel *lang_mod) = 0; - FeatureBase *feat_extract_; - CharSet *char_set_; - TuningParams *params_; - int **fold_sets_; - int *fold_set_len_; - int fold_set_cnt_; - bool init_; - bool case_sensitive_; -}; -} // tesseract - -#endif // 
CHAR_CLASSIFIER_BASE_H diff --git a/cube/classifier_factory.cpp b/cube/classifier_factory.cpp deleted file mode 100644 index a22f0d4e..00000000 --- a/cube/classifier_factory.cpp +++ /dev/null @@ -1,97 +0,0 @@ -/********************************************************************** - * File: classifier_factory.cpp - * Description: Implementation of the Base Character Classifier - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -#include -#include -#include -#include "classifier_factory.h" -#include "conv_net_classifier.h" -#include "feature_base.h" -#include "feature_bmp.h" -#include "feature_chebyshev.h" -#include "feature_hybrid.h" -#include "hybrid_neural_net_classifier.h" - -namespace tesseract { - -// Creates a CharClassifier object of the appropriate type depending on the -// classifier type in the settings file -CharClassifier *CharClassifierFactory::Create(const string &data_file_path, - const string &lang, - LangModel *lang_mod, - CharSet *char_set, - TuningParams *params) { - // create the feature extraction object - FeatureBase *feat_extract; - - switch (params->TypeFeature()) { - case TuningParams::BMP: - feat_extract = new FeatureBmp(params); - break; - case TuningParams::CHEBYSHEV: - feat_extract = new FeatureChebyshev(params); - break; - case TuningParams::HYBRID: - feat_extract = new FeatureHybrid(params); - break; - default: - fprintf(stderr, "Cube ERROR (CharClassifierFactory::Create): invalid " - "feature type.\n"); - return NULL; - } - - if (feat_extract == NULL) { - fprintf(stderr, "Cube ERROR (CharClassifierFactory::Create): unable " - "to instantiate feature extraction object.\n"); - return NULL; - } - - // create the classifier object - CharClassifier *classifier_obj; - switch (params->TypeClassifier()) { - case TuningParams::NN: - classifier_obj = new ConvNetCharClassifier(char_set, params, - feat_extract); - break; - case TuningParams::HYBRID_NN: - classifier_obj = new HybridNeuralNetCharClassifier(char_set, params, - feat_extract); - break; - default: - fprintf(stderr, "Cube ERROR (CharClassifierFactory::Create): invalid " - "classifier type.\n"); - return NULL; - } - - if (classifier_obj == NULL) { - fprintf(stderr, "Cube ERROR (CharClassifierFactory::Create): error " - "allocating memory for character classifier object.\n"); - return NULL; - } - - // Init the classifier - if 
(!classifier_obj->Init(data_file_path, lang, lang_mod)) { - delete classifier_obj; - fprintf(stderr, "Cube ERROR (CharClassifierFactory::Create): unable " - "to Init() character classifier object.\n"); - return NULL; - } - return classifier_obj; -} -} diff --git a/cube/classifier_factory.h b/cube/classifier_factory.h deleted file mode 100644 index f7254d58..00000000 --- a/cube/classifier_factory.h +++ /dev/null @@ -1,43 +0,0 @@ -/********************************************************************** - * File: classifier_factory.h - * Description: Declaration of the Base Character Classifier - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -// The CharClassifierFactory provides a single static method to create an -// instance of the desired classifier - -#ifndef CHAR_CLASSIFIER_FACTORY_H -#define CHAR_CLASSIFIER_FACTORY_H - -#include -#include "classifier_base.h" -#include "lang_model.h" - -namespace tesseract { -class CharClassifierFactory { - public: - // Creates a CharClassifier object of the appropriate type depending on the - // classifier type in the settings file - static CharClassifier *Create(const string &data_file_path, - const string &lang, - LangModel *lang_mod, - CharSet *char_set, - TuningParams *params); -}; -} // tesseract - -#endif // CHAR_CLASSIFIER_FACTORY_H diff --git a/cube/con_comp.cpp b/cube/con_comp.cpp deleted file mode 100644 index 53b1a73b..00000000 --- a/cube/con_comp.cpp +++ /dev/null @@ -1,286 +0,0 @@ -/********************************************************************** - * File: con_comp.cpp - * Description: Implementation of a Connected Component class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -#include -#include -#include "con_comp.h" -#include "cube_const.h" - -namespace tesseract { - -ConComp::ConComp() { - head_ = NULL; - tail_ = NULL; - left_ = 0; - top_ = 0; - right_ = 0; - bottom_ = 0; - left_most_ = false; - right_most_ = false; - id_ = -1; - pt_cnt_ = 0; -} - -ConComp::~ConComp() { - if (head_ != NULL) { - ConCompPt *pt_ptr = head_; - while (pt_ptr != NULL) { - ConCompPt *pptNext = pt_ptr->Next(); - delete pt_ptr; - pt_ptr = pptNext; - } - head_ = NULL; - } -} - -// adds a pt to the conn comp and updates its boundaries -bool ConComp::Add(int x, int y) { - ConCompPt *pt_ptr = new ConCompPt(x, y); - if (pt_ptr == NULL) { - return false; - } - - if (head_ == NULL) { - left_ = x; - right_ = x; - top_ = y; - bottom_ = y; - - head_ = pt_ptr; - } else { - left_ = left_ <= x ? left_ : x; - top_ = top_ <= y ? top_ : y; - right_ = right_ >= x ? right_ : x; - bottom_ = bottom_ >= y ? bottom_ : y; - } - - if (tail_ != NULL) { - tail_->SetNext(pt_ptr); - } - - tail_ = pt_ptr; - pt_cnt_++; - return true; -} - -// merges two connected components -bool ConComp::Merge(ConComp *concomp) { - if (head_ == NULL || tail_ == NULL || - concomp->head_ == NULL || concomp->tail_ == NULL) { - return false; - } - - tail_->SetNext(concomp->head_); - tail_ = concomp->tail_; - left_ = left_ <= concomp->left_ ? left_ : concomp->left_; - top_ = top_ <= concomp->top_ ? top_ : concomp->top_; - right_ = right_ >= concomp->right_ ? right_ : concomp->right_; - bottom_ = bottom_ >= concomp->bottom_ ? 
bottom_ : concomp->bottom_; - pt_cnt_ += concomp->pt_cnt_; - - concomp->head_ = NULL; - concomp->tail_ = NULL; - - return true; -} - -// Creates the x-coord density histogram after spreading -// each x-coord position by the HIST_WND_RATIO fraction of the -// height of the ConComp, but limited to max_hist_wnd -int *ConComp::CreateHistogram(int max_hist_wnd) { - int wid = right_ - left_ + 1, - hgt = bottom_ - top_ + 1, - hist_wnd = static_cast(hgt * HIST_WND_RATIO); - - if (hist_wnd > max_hist_wnd) { - hist_wnd = max_hist_wnd; - } - - // alloc memo for histogram - int *hist_array = new int[wid]; - if (hist_array == NULL) { - return NULL; - } - - memset(hist_array, 0, wid * sizeof(*hist_array)); - - // compute windowed histogram - ConCompPt *pt_ptr = head_; - - while (pt_ptr != NULL) { - int x = pt_ptr->x() - left_, - xw = x - hist_wnd; - - for (int xdel = -hist_wnd; xdel <= hist_wnd; xdel++, xw++) { - if (xw >= 0 && xw < wid) { - hist_array[xw]++; - } - } - - pt_ptr = pt_ptr->Next(); - } - - return hist_array; -} - -// find out the seg pts by looking for local minima in the histogram -int *ConComp::SegmentHistogram(int *hist_array, int *seg_pt_cnt) { - // init - (*seg_pt_cnt) = 0; - - int wid = right_ - left_ + 1, - hgt = bottom_ - top_ + 1; - - int *x_seg_pt = new int[wid]; - if (x_seg_pt == NULL) { - return NULL; - } - - int seg_pt_wnd = static_cast(hgt * SEG_PT_WND_RATIO); - - if (seg_pt_wnd > 1) { - seg_pt_wnd = 1; - } - - for (int x = 2; x < (wid - 2); x++) { - if (hist_array[x] < hist_array[x - 1] && - hist_array[x] < hist_array[x - 2] && - hist_array[x] <= hist_array[x + 1] && - hist_array[x] <= hist_array[x + 2]) { - x_seg_pt[(*seg_pt_cnt)++] = x; - x += seg_pt_wnd; - } else if (hist_array[x] <= hist_array[x - 1] && - hist_array[x] <= hist_array[x - 2] && - hist_array[x] < hist_array[x + 1] && - hist_array[x] < hist_array[x + 2]) { - x_seg_pt[(*seg_pt_cnt)++] = x; - x += seg_pt_wnd; - } - } - - // no segments, nothing to do - if ((*seg_pt_cnt) == 0) { - 
delete []x_seg_pt; - return NULL; - } - - return x_seg_pt; -} - -// segments a concomp based on pixel density histogram local minima -// if there were none found, it returns NULL -// this is more useful than creating a clone of itself -ConComp **ConComp::Segment(int max_hist_wnd, int *concomp_cnt) { - // init - (*concomp_cnt) = 0; - - // No pts - if (head_ == NULL) { - return NULL; - } - - int seg_pt_cnt = 0; - - // create the histogram - int *hist_array = CreateHistogram(max_hist_wnd); - if (hist_array == NULL) { - return NULL; - } - - int *x_seg_pt = SegmentHistogram(hist_array, &seg_pt_cnt); - - // free histogram - delete []hist_array; - - // no segments, nothing to do - if (seg_pt_cnt == 0) { - delete []x_seg_pt; - return NULL; - } - - // create concomp array - ConComp **concomp_array = new ConComp *[seg_pt_cnt + 1]; - if (concomp_array == NULL) { - delete []x_seg_pt; - return NULL; - } - - for (int concomp = 0; concomp <= seg_pt_cnt; concomp++) { - concomp_array[concomp] = new ConComp(); - if (concomp_array[concomp] == NULL) { - delete []x_seg_pt; - delete []concomp_array; - return NULL; - } - - // split concomps inherit the ID this concomp - concomp_array[concomp]->SetID(id_); - } - - // set the left and right most attributes of the - // appropriate concomps - concomp_array[0]->left_most_ = true; - concomp_array[seg_pt_cnt]->right_most_ = true; - - // assign pts to concomps - ConCompPt *pt_ptr = head_; - while (pt_ptr != NULL) { - int seg_pt; - - // find the first seg-pt that exceeds the x value - // of the pt - for (seg_pt = 0; seg_pt < seg_pt_cnt; seg_pt++) { - if ((x_seg_pt[seg_pt] + left_) > pt_ptr->x()) { - break; - } - } - - // add the pt to the proper concomp - if (concomp_array[seg_pt]->Add(pt_ptr->x(), pt_ptr->y()) == false) { - delete []x_seg_pt; - delete []concomp_array; - return NULL; - } - - pt_ptr = pt_ptr->Next(); - } - - delete []x_seg_pt; - - (*concomp_cnt) = (seg_pt_cnt + 1); - - return concomp_array; -} - -// Shifts the co-ordinates of all 
points by the specified x & y deltas -void ConComp::Shift(int dx, int dy) { - ConCompPt *pt_ptr = head_; - - while (pt_ptr != NULL) { - pt_ptr->Shift(dx, dy); - pt_ptr = pt_ptr->Next(); - } - - left_ += dx; - right_ += dx; - top_ += dy; - bottom_ += dy; -} - -} // namespace tesseract diff --git a/cube/con_comp.h b/cube/con_comp.h deleted file mode 100644 index 4d000154..00000000 --- a/cube/con_comp.h +++ /dev/null @@ -1,124 +0,0 @@ -/********************************************************************** - * File: con_comp.h - * Description: Declaration of a Connected Component class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#ifndef CONCOMP_H -#define CONCOMP_H - -// The ConComp class implements the functionality needed for a -// Connected Component object and Connected Component (ConComp) points. -// The points consituting a connected component are kept in a linked-list -// The Concomp class provided methods to: -// 1- Compare components in L2R and R2L reading orders. 
-// 2- Merge ConComps -// 3- Compute the windowed vertical pixel density histogram for a specific -// windows size -// 4- Segment a ConComp based on the local windowed vertical pixel -// density histogram local minima - -namespace tesseract { - -// Implments a ConComp point in a linked list of points -class ConCompPt { - public: - ConCompPt(int x, int y) { - x_ = x; - y_ = y; - next_pt_ = NULL; - } - inline int x() { return x_; } - inline int y() { return y_; } - inline void Shift(int dx, int dy) { - x_ += dx; - y_ += dy; - } - inline ConCompPt * Next() { return next_pt_; } - inline void SetNext(ConCompPt *pt) { next_pt_ = pt; } - - private: - int x_; - int y_; - ConCompPt *next_pt_; -}; - -class ConComp { - public: - ConComp(); - virtual ~ConComp(); - // accessors - inline ConCompPt *Head() { return head_; } - inline int Left() const { return left_; } - inline int Top() const { return top_; } - inline int Right() const { return right_; } - inline int Bottom() const { return bottom_; } - inline int Width() const { return right_ - left_ + 1; } - inline int Height() const { return bottom_ - top_ + 1; } - - // Comparer used for sorting L2R reading order - inline static int Left2RightComparer(const void *comp1, - const void *comp2) { - return (*(reinterpret_cast(comp1)))->left_ + - (*(reinterpret_cast(comp1)))->right_ - - (*(reinterpret_cast(comp2)))->left_ - - (*(reinterpret_cast(comp2)))->right_; - } - - // Comparer used for sorting R2L reading order - inline static int Right2LeftComparer(const void *comp1, - const void *comp2) { - return (*(reinterpret_cast(comp2)))->right_ - - (*(reinterpret_cast(comp1)))->right_; - } - - // accessors for attribues of a ConComp - inline bool LeftMost() const { return left_most_; } - inline bool RightMost() const { return right_most_; } - inline void SetLeftMost(bool left_most) { left_most_ = left_most; } - inline void SetRightMost(bool right_most) { right_most_ = right_most; - } - inline int ID () const { return id_; } - inline 
void SetID(int id) { id_ = id; } - inline int PtCnt () const { return pt_cnt_; } - // Add a new pt - bool Add(int x, int y); - // Merge two connected components in-place - bool Merge(ConComp *con_comp); - // Shifts the co-ordinates of all points by the specified x & y deltas - void Shift(int dx, int dy); - // segments a concomp based on pixel density histogram local minima - ConComp **Segment(int max_hist_wnd, int *concomp_cnt); - // creates the vertical pixel density histogram of the concomp - int *CreateHistogram(int max_hist_wnd); - // find out the seg pts by looking for local minima in the histogram - int *SegmentHistogram(int *hist_array, int *seg_pt_cnt); - - private: - int id_; - bool left_most_; - bool right_most_; - int left_; - int top_; - int right_; - int bottom_; - ConCompPt *head_; - ConCompPt *tail_; - int pt_cnt_; -}; -} - -#endif // CONCOMP_H diff --git a/cube/conv_net_classifier.cpp b/cube/conv_net_classifier.cpp deleted file mode 100644 index ac33cd33..00000000 --- a/cube/conv_net_classifier.cpp +++ /dev/null @@ -1,390 +0,0 @@ -/********************************************************************** - * File: charclassifier.cpp - * Description: Implementation of Convolutional-NeuralNet Character Classifier - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -#include -#include -#include -#include -#include -#include - -#include "char_set.h" -#include "classifier_base.h" -#include "const.h" -#include "conv_net_classifier.h" -#include "cube_utils.h" -#include "feature_base.h" -#include "feature_bmp.h" -#include "tess_lang_model.h" - -namespace tesseract { - -ConvNetCharClassifier::ConvNetCharClassifier(CharSet *char_set, - TuningParams *params, - FeatureBase *feat_extract) - : CharClassifier(char_set, params, feat_extract) { - char_net_ = NULL; - net_input_ = NULL; - net_output_ = NULL; -} - -ConvNetCharClassifier::~ConvNetCharClassifier() { - if (char_net_ != NULL) { - delete char_net_; - char_net_ = NULL; - } - - if (net_input_ != NULL) { - delete []net_input_; - net_input_ = NULL; - } - - if (net_output_ != NULL) { - delete []net_output_; - net_output_ = NULL; - } -} - -/** - * The main training function. Given a sample and a class ID the classifier - * updates its parameters according to its learning algorithm. This function - * is currently not implemented. TODO(ahmadab): implement end-2-end training - */ -bool ConvNetCharClassifier::Train(CharSamp *char_samp, int ClassID) { - return false; -} - -/** - * A secondary function needed for training. Allows the trainer to set the - * value of any train-time parameter. This function is currently not - * implemented. TODO(ahmadab): implement end-2-end training - */ -bool ConvNetCharClassifier::SetLearnParam(char *var_name, float val) { - // TODO(ahmadab): implementation of parameter initializing. 
- return false; -} - -/** - * Folds the output of the NeuralNet using the loaded folding sets - */ -void ConvNetCharClassifier::Fold() { - // in case insensitive mode - if (case_sensitive_ == false) { - int class_cnt = char_set_->ClassCount(); - // fold case - for (int class_id = 0; class_id < class_cnt; class_id++) { - // get class string - const char_32 *str32 = char_set_->ClassString(class_id); - // get the upper case form of the string - string_32 upper_form32 = str32; - for (int ch = 0; ch < upper_form32.length(); ch++) { - if (iswalpha(static_cast(upper_form32[ch])) != 0) { - upper_form32[ch] = towupper(upper_form32[ch]); - } - } - - // find out the upperform class-id if any - int upper_class_id = - char_set_->ClassID(reinterpret_cast( - upper_form32.c_str())); - if (upper_class_id != -1 && class_id != upper_class_id) { - float max_out = MAX(net_output_[class_id], net_output_[upper_class_id]); - net_output_[class_id] = max_out; - net_output_[upper_class_id] = max_out; - } - } - } - - // The folding sets specify how groups of classes should be folded - // Folding involved assigning a min-activation to all the members - // of the folding set. 
The min-activation is a fraction of the max-activation - // of the members of the folding set - for (int fold_set = 0; fold_set < fold_set_cnt_; fold_set++) { - if (fold_set_len_[fold_set] == 0) - continue; - float max_prob = net_output_[fold_sets_[fold_set][0]]; - for (int ch = 1; ch < fold_set_len_[fold_set]; ch++) { - if (net_output_[fold_sets_[fold_set][ch]] > max_prob) { - max_prob = net_output_[fold_sets_[fold_set][ch]]; - } - } - for (int ch = 0; ch < fold_set_len_[fold_set]; ch++) { - net_output_[fold_sets_[fold_set][ch]] = MAX(max_prob * kFoldingRatio, - net_output_[fold_sets_[fold_set][ch]]); - } - } -} - -/** - * Compute the features of specified charsamp and feedforward the - * specified nets - */ -bool ConvNetCharClassifier::RunNets(CharSamp *char_samp) { - if (char_net_ == NULL) { - fprintf(stderr, "Cube ERROR (ConvNetCharClassifier::RunNets): " - "NeuralNet is NULL\n"); - return false; - } - int feat_cnt = char_net_->in_cnt(); - int class_cnt = char_set_->ClassCount(); - - // allocate i/p and o/p buffers if needed - if (net_input_ == NULL) { - net_input_ = new float[feat_cnt]; - if (net_input_ == NULL) { - fprintf(stderr, "Cube ERROR (ConvNetCharClassifier::RunNets): " - "unable to allocate memory for input nodes\n"); - return false; - } - - net_output_ = new float[class_cnt]; - if (net_output_ == NULL) { - fprintf(stderr, "Cube ERROR (ConvNetCharClassifier::RunNets): " - "unable to allocate memory for output nodes\n"); - return false; - } - } - - // compute input features - if (feat_extract_->ComputeFeatures(char_samp, net_input_) == false) { - fprintf(stderr, "Cube ERROR (ConvNetCharClassifier::RunNets): " - "unable to compute features\n"); - return false; - } - - if (char_net_ != NULL) { - if (char_net_->FeedForward(net_input_, net_output_) == false) { - fprintf(stderr, "Cube ERROR (ConvNetCharClassifier::RunNets): " - "unable to run feed-forward\n"); - return false; - } - } else { - return false; - } - Fold(); - return true; -} - -/** - * return 
the cost of being a char - */ -int ConvNetCharClassifier::CharCost(CharSamp *char_samp) { - if (RunNets(char_samp) == false) { - return 0; - } - return CubeUtils::Prob2Cost(1.0f - net_output_[0]); -} - -/** - * classifies a charsamp and returns an alternate list - * of chars sorted by char costs - */ -CharAltList *ConvNetCharClassifier::Classify(CharSamp *char_samp) { - // run the needed nets - if (RunNets(char_samp) == false) { - return NULL; - } - - int class_cnt = char_set_->ClassCount(); - - // create an altlist - CharAltList *alt_list = new CharAltList(char_set_, class_cnt); - if (alt_list == NULL) { - fprintf(stderr, "Cube WARNING (ConvNetCharClassifier::Classify): " - "returning emtpy CharAltList\n"); - return NULL; - } - - for (int out = 1; out < class_cnt; out++) { - int cost = CubeUtils::Prob2Cost(net_output_[out]); - alt_list->Insert(out, cost); - } - - return alt_list; -} - -/** - * Set an external net (for training purposes) - */ -void ConvNetCharClassifier::SetNet(tesseract::NeuralNet *char_net) { - if (char_net_ != NULL) { - delete char_net_; - char_net_ = NULL; - } - char_net_ = char_net; -} - -/** - * This function will return true if the file does not exist. 
- * But will fail if the it did not pass the sanity checks - */ -bool ConvNetCharClassifier::LoadFoldingSets(const string &data_file_path, - const string &lang, - LangModel *lang_mod) { - fold_set_cnt_ = 0; - string fold_file_name; - fold_file_name = data_file_path + lang; - fold_file_name += ".cube.fold"; - - // folding sets are optional - FILE *fp = fopen(fold_file_name.c_str(), "rb"); - if (fp == NULL) { - return true; - } - fclose(fp); - - string fold_sets_str; - if (!CubeUtils::ReadFileToString(fold_file_name, - &fold_sets_str)) { - return false; - } - - // split into lines - vector str_vec; - CubeUtils::SplitStringUsing(fold_sets_str, "\r\n", &str_vec); - fold_set_cnt_ = str_vec.size(); - - fold_sets_ = new int *[fold_set_cnt_]; - if (fold_sets_ == NULL) { - return false; - } - fold_set_len_ = new int[fold_set_cnt_]; - if (fold_set_len_ == NULL) { - fold_set_cnt_ = 0; - return false; - } - - for (int fold_set = 0; fold_set < fold_set_cnt_; fold_set++) { - reinterpret_cast(lang_mod)->RemoveInvalidCharacters( - &str_vec[fold_set]); - - // if all or all but one character are invalid, invalidate this set - if (str_vec[fold_set].length() <= 1) { - fprintf(stderr, "Cube WARNING (ConvNetCharClassifier::LoadFoldingSets): " - "invalidating folding set %d\n", fold_set); - fold_set_len_[fold_set] = 0; - fold_sets_[fold_set] = NULL; - continue; - } - - string_32 str32; - CubeUtils::UTF8ToUTF32(str_vec[fold_set].c_str(), &str32); - fold_set_len_[fold_set] = str32.length(); - fold_sets_[fold_set] = new int[fold_set_len_[fold_set]]; - if (fold_sets_[fold_set] == NULL) { - fprintf(stderr, "Cube ERROR (ConvNetCharClassifier::LoadFoldingSets): " - "could not allocate folding set\n"); - fold_set_cnt_ = fold_set; - return false; - } - for (int ch = 0; ch < fold_set_len_[fold_set]; ch++) { - fold_sets_[fold_set][ch] = char_set_->ClassID(str32[ch]); - } - } - return true; -} - -/** - * Init the classifier provided a data-path and a language string - */ -bool 
ConvNetCharClassifier::Init(const string &data_file_path, - const string &lang, - LangModel *lang_mod) { - if (init_) { - return true; - } - - // load the nets if any. This function will return true if the net file - // does not exist. But will fail if the net did not pass the sanity checks - if (!LoadNets(data_file_path, lang)) { - return false; - } - - // load the folding sets if any. This function will return true if the - // file does not exist. But will fail if the it did not pass the sanity checks - if (!LoadFoldingSets(data_file_path, lang, lang_mod)) { - return false; - } - - init_ = true; - return true; -} - -/** - * Load the classifier's Neural Nets - * This function will return true if the net file does not exist. - * But will fail if the net did not pass the sanity checks - */ -bool ConvNetCharClassifier::LoadNets(const string &data_file_path, - const string &lang) { - string char_net_file; - - // add the lang identifier - char_net_file = data_file_path + lang; - char_net_file += ".cube.nn"; - - // neural network is optional - FILE *fp = fopen(char_net_file.c_str(), "rb"); - if (fp == NULL) { - return true; - } - fclose(fp); - - // load main net - char_net_ = tesseract::NeuralNet::FromFile(char_net_file); - if (char_net_ == NULL) { - fprintf(stderr, "Cube ERROR (ConvNetCharClassifier::LoadNets): " - "could not load %s\n", char_net_file.c_str()); - return false; - } - - // validate net - if (char_net_->in_cnt()!= feat_extract_->FeatureCnt()) { - fprintf(stderr, "Cube ERROR (ConvNetCharClassifier::LoadNets): " - "could not validate net %s\n", char_net_file.c_str()); - return false; - } - - // alloc net i/o buffers - int feat_cnt = char_net_->in_cnt(); - int class_cnt = char_set_->ClassCount(); - - if (char_net_->out_cnt() != class_cnt) { - fprintf(stderr, "Cube ERROR (ConvNetCharClassifier::LoadNets): " - "output count (%d) and class count (%d) are not equal\n", - char_net_->out_cnt(), class_cnt); - return false; - } - - // allocate i/p and o/p buffers if 
needed - if (net_input_ == NULL) { - net_input_ = new float[feat_cnt]; - if (net_input_ == NULL) { - return false; - } - - net_output_ = new float[class_cnt]; - if (net_output_ == NULL) { - return false; - } - } - - return true; -} -} // tesseract diff --git a/cube/conv_net_classifier.h b/cube/conv_net_classifier.h deleted file mode 100644 index b9e7692c..00000000 --- a/cube/conv_net_classifier.h +++ /dev/null @@ -1,94 +0,0 @@ -/********************************************************************** - * File: conv_net_classifier.h - * Description: Declaration of Convolutional-NeuralNet Character Classifier - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The ConvNetCharClassifier inherits from the base classifier class: -// "CharClassifierBase". It implements a Convolutional Neural Net classifier -// instance of the base classifier. It uses the Tesseract Neural Net library -// The Neural Net takes a scaled version of a bitmap and feeds it to a -// Convolutional Neural Net as input and performs a FeedForward. Each output -// of the net corresponds to class_id in the CharSet passed at construction -// time. 
-// Afterwards, the outputs of the Net are "folded" using the folding set -// (if any) -#ifndef CONV_NET_CLASSIFIER_H -#define CONV_NET_CLASSIFIER_H - -#include -#include "char_samp.h" -#include "char_altlist.h" -#include "char_set.h" -#include "feature_base.h" -#include "classifier_base.h" -#include "neural_net.h" -#include "lang_model.h" -#include "tuning_params.h" - -namespace tesseract { - -// Folding Ratio is the ratio of the max-activation of members of a folding -// set that is used to compute the min-activation of the rest of the set -static const float kFoldingRatio = 0.75; - -class ConvNetCharClassifier : public CharClassifier { - public: - ConvNetCharClassifier(CharSet *char_set, TuningParams *params, - FeatureBase *feat_extract); - virtual ~ConvNetCharClassifier(); - // The main training function. Given a sample and a class ID the classifier - // updates its parameters according to its learning algorithm. This function - // is currently not implemented. TODO(ahmadab): implement end-2-end training - virtual bool Train(CharSamp *char_samp, int ClassID); - // A secondary function needed for training. Allows the trainer to set the - // value of any train-time parameter. This function is currently not - // implemented. TODO(ahmadab): implement end-2-end training - virtual bool SetLearnParam(char *var_name, float val); - // Externally sets the Neural Net used by the classifier. 
Used for training - void SetNet(tesseract::NeuralNet *net); - - // Classifies an input charsamp and return a CharAltList object containing - // the possible candidates and corresponding scores - virtual CharAltList * Classify(CharSamp *char_samp); - // Computes the cost of a specific charsamp being a character (versus a - // non-character: part-of-a-character OR more-than-one-character) - virtual int CharCost(CharSamp *char_samp); - - - private: - // Neural Net object used for classification - tesseract::NeuralNet *char_net_; - // data buffers used to hold Neural Net inputs and outputs - float *net_input_; - float *net_output_; - - // Init the classifier provided a data-path and a language string - virtual bool Init(const string &data_file_path, const string &lang, - LangModel *lang_mod); - // Loads the NeuralNets needed for the classifier - bool LoadNets(const string &data_file_path, const string &lang); - // Loads the folding sets provided a data-path and a language string - virtual bool LoadFoldingSets(const string &data_file_path, - const string &lang, - LangModel *lang_mod); - // Folds the output of the NeuralNet using the loaded folding sets - virtual void Fold(); - // Scales the input char_samp and feeds it to the NeuralNet as input - bool RunNets(CharSamp *char_samp); -}; -} -#endif // CONV_NET_CLASSIFIER_H diff --git a/cube/cube_const.h b/cube/cube_const.h deleted file mode 100644 index 36d40bbc..00000000 --- a/cube/cube_const.h +++ /dev/null @@ -1,41 +0,0 @@ -/********************************************************************** - * File: const.h - * Description: Defintions of constants used by Cube - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#ifndef CUBE_CONST_H -#define CUBE_CONST_H - -// Scale used to normalize a log-prob to a cost -#define PROB2COST_SCALE 4096.0 -// Maximum possible cost (-log prob of MIN_PROB) -#define MIN_PROB_COST 65536 -// Probability corresponding to the max cost MIN_PROB_COST -#define MIN_PROB 0.000000113 -// Worst possible cost (returned on failure) -#define WORST_COST 0x40000 -// Oversegmentation hysteresis thresholds -#define HIST_WND_RATIO 0.1f -#define SEG_PT_WND_RATIO 0.1f - -#ifdef _WIN32 -#ifdef __GNUC__ -#include -#endif -#endif - -#endif // CUBE_CONST_H diff --git a/cube/cube_line_object.cpp b/cube/cube_line_object.cpp deleted file mode 100644 index 03254537..00000000 --- a/cube/cube_line_object.cpp +++ /dev/null @@ -1,255 +0,0 @@ -/********************************************************************** - * File: cube_line_object.cpp - * Description: Implementation of the Cube Line Object Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include -#include "cube_line_object.h" - -namespace tesseract { -CubeLineObject::CubeLineObject(CubeRecoContext *cntxt, Pix *pix) { - line_pix_ = pix; - own_pix_ = false; - processed_ = false; - cntxt_ = cntxt; - phrase_cnt_ = 0; - phrases_ = NULL; -} - -CubeLineObject::~CubeLineObject() { - if (line_pix_ != NULL && own_pix_ == true) { - pixDestroy(&line_pix_); - line_pix_ = NULL; - } - - if (phrases_ != NULL) { - for (int phrase_idx = 0; phrase_idx < phrase_cnt_; phrase_idx++) { - if (phrases_[phrase_idx] != NULL) { - delete phrases_[phrase_idx]; - } - } - - delete []phrases_; - phrases_ = NULL; - } -} - -// Recognize the specified pix as one line returning the recognized -bool CubeLineObject::Process() { - // do nothing if pix had already been processed - if (processed_) { - return true; - } - - // validate data - if (line_pix_ == NULL || cntxt_ == NULL) { - return false; - } - - // create a CharSamp - CharSamp *char_samp = CubeUtils::CharSampleFromPix(line_pix_, 0, 0, - line_pix_->w, - line_pix_->h); - if (char_samp == NULL) { - return false; - } - - // compute connected components. - int con_comp_cnt = 0; - ConComp **con_comps = char_samp->FindConComps(&con_comp_cnt, - cntxt_->Params()->MinConCompSize()); - // no longer need char_samp, delete it - delete char_samp; - // no connected components, bail out - if (con_comp_cnt <= 0 || con_comps == NULL) { - return false; - } - - // sort connected components based on reading order - bool rtl = (cntxt_->ReadingOrder() == tesseract::CubeRecoContext::R2L); - qsort(con_comps, con_comp_cnt, sizeof(*con_comps), rtl ? 
- ConComp::Right2LeftComparer : ConComp::Left2RightComparer); - - // compute work breaking threshold as a ratio of line height - bool ret_val = false; - int word_break_threshold = ComputeWordBreakThreshold(con_comp_cnt, con_comps, - rtl); - if (word_break_threshold > 0) { - // over-allocate phrases object buffer - phrases_ = new CubeObject *[con_comp_cnt]; - if (phrases_ != NULL) { - // create a phrase if the horizontal distance between two consecutive - // concomps is higher than threshold - int start_con_idx = 0; - int current_phrase_limit = rtl ? con_comps[0]->Left() : - con_comps[0]->Right(); - - for (int con_idx = 1; con_idx <= con_comp_cnt; con_idx++) { - bool create_new_phrase = true; - // if not at the end, compute the distance between two consecutive - // concomps - if (con_idx < con_comp_cnt) { - int dist = 0; - if (cntxt_->ReadingOrder() == tesseract::CubeRecoContext::R2L) { - dist = current_phrase_limit - con_comps[con_idx]->Right(); - } else { - dist = con_comps[con_idx]->Left() - current_phrase_limit; - } - create_new_phrase = (dist > word_break_threshold); - } - - // create a new phrase - if (create_new_phrase) { - // create a phrase corresponding to a range on components - bool left_most; - bool right_most; - CharSamp *phrase_char_samp = - CharSamp::FromConComps(con_comps, start_con_idx, - con_idx - start_con_idx, NULL, - &left_most, &right_most, - line_pix_->h); - if (phrase_char_samp == NULL) { - break; - } - phrases_[phrase_cnt_] = new CubeObject(cntxt_, phrase_char_samp); - if (phrases_[phrase_cnt_] == NULL) { - delete phrase_char_samp; - break; - } - // set the ownership of the charsamp to the cube object - phrases_[phrase_cnt_]->SetCharSampOwnership(true); - phrase_cnt_++; - // advance the starting index to the current index - start_con_idx = con_idx; - // set the limit of the newly starting phrase (if any) - if (con_idx < con_comp_cnt) { - current_phrase_limit = rtl ? 
con_comps[con_idx]->Left() : - con_comps[con_idx]->Right(); - } - } else { - // update the limit of the current phrase - if (cntxt_->ReadingOrder() == tesseract::CubeRecoContext::R2L) { - current_phrase_limit = MIN(current_phrase_limit, - con_comps[con_idx]->Left()); - } else { - current_phrase_limit = MAX(current_phrase_limit, - con_comps[con_idx]->Right()); - } - } - } - ret_val = true; - } - } - - // clean-up connected comps - for (int con_idx = 0; con_idx < con_comp_cnt; con_idx++) { - delete con_comps[con_idx]; - } - delete []con_comps; - - // success - processed_ = true; - return ret_val; -} - -// Compute the least word breaking threshold that is required to produce a -// valid set of phrases. Phrases are validated using the Aspect ratio -// constraints specified in the language specific Params object -int CubeLineObject::ComputeWordBreakThreshold(int con_comp_cnt, - ConComp **con_comps, bool rtl) { - // initial estimate of word breaking threshold - int word_break_threshold = - static_cast(line_pix_->h * cntxt_->Params()->MaxSpaceHeightRatio()); - bool valid = false; - - // compute the resulting words and validate each's aspect ratio - do { - // group connected components into words based on breaking threshold - int start_con_idx = 0; - int current_phrase_limit = (rtl ? 
con_comps[0]->Left() : - con_comps[0]->Right()); - int min_x = con_comps[0]->Left(); - int max_x = con_comps[0]->Right(); - int min_y = con_comps[0]->Top(); - int max_y = con_comps[0]->Bottom(); - valid = true; - for (int con_idx = 1; con_idx <= con_comp_cnt; con_idx++) { - bool create_new_phrase = true; - // if not at the end, compute the distance between two consecutive - // concomps - if (con_idx < con_comp_cnt) { - int dist = 0; - if (rtl) { - dist = current_phrase_limit - con_comps[con_idx]->Right(); - } else { - dist = con_comps[con_idx]->Left() - current_phrase_limit; - } - create_new_phrase = (dist > word_break_threshold); - } - - // create a new phrase - if (create_new_phrase) { - // check aspect ratio. Break if invalid - if ((max_x - min_x + 1) > - (cntxt_->Params()->MaxWordAspectRatio() * (max_y - min_y + 1))) { - valid = false; - break; - } - // advance the starting index to the current index - start_con_idx = con_idx; - // set the limit of the newly starting phrase (if any) - if (con_idx < con_comp_cnt) { - current_phrase_limit = rtl ? 
con_comps[con_idx]->Left() : - con_comps[con_idx]->Right(); - // re-init bounding box - min_x = con_comps[con_idx]->Left(); - max_x = con_comps[con_idx]->Right(); - min_y = con_comps[con_idx]->Top(); - max_y = con_comps[con_idx]->Bottom(); - } - } else { - // update the limit of the current phrase - if (rtl) { - current_phrase_limit = MIN(current_phrase_limit, - con_comps[con_idx]->Left()); - } else { - current_phrase_limit = MAX(current_phrase_limit, - con_comps[con_idx]->Right()); - } - // update bounding box - UpdateRange(con_comps[con_idx]->Left(), - con_comps[con_idx]->Right(), &min_x, &max_x); - UpdateRange(con_comps[con_idx]->Top(), - con_comps[con_idx]->Bottom(), &min_y, &max_y); - } - } - - // return the breaking threshold if all broken word dimensions are valid - if (valid) { - return word_break_threshold; - } - - // decrease the threshold and try again - word_break_threshold--; - } while (!valid && word_break_threshold > 0); - - // failed to find a threshold that achieves the target aspect ratio. - // Just use the default threshold - return static_cast(line_pix_->h * - cntxt_->Params()->MaxSpaceHeightRatio()); -} -} diff --git a/cube/cube_line_object.h b/cube/cube_line_object.h deleted file mode 100644 index 037ae6b6..00000000 --- a/cube/cube_line_object.h +++ /dev/null @@ -1,67 +0,0 @@ -/********************************************************************** - * File: cube_line_object.h - * Description: Declaration of the Cube Line Object Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The CubeLineObject implements an objects that holds a line of text -// Each line is broken into phrases. Phrases are blocks within the line that -// are unambiguously separate collections of words - -#ifndef CUBE_LINE_OBJECT_H -#define CUBE_LINE_OBJECT_H - -#include "cube_reco_context.h" -#include "cube_object.h" -#include "allheaders.h" - -namespace tesseract { -class CubeLineObject { - public: - CubeLineObject(CubeRecoContext *cntxt, Pix *pix); - ~CubeLineObject(); - - // accessors - inline int PhraseCount() { - if (!processed_ && !Process()) { - return 0; - } - return phrase_cnt_; - } - inline CubeObject **Phrases() { - if (!processed_ && !Process()) { - return NULL; - } - return phrases_; - } - - private: - CubeRecoContext *cntxt_; - bool own_pix_; - bool processed_; - Pix *line_pix_; - CubeObject **phrases_; - int phrase_cnt_; - bool Process(); - // Compute the least word breaking threshold that is required to produce a - // valid set of phrases. 
Phrases are validated using the Aspect ratio - // constraints specified in the language specific Params object - int ComputeWordBreakThreshold(int con_comp_cnt, ConComp **con_comps, - bool rtl); -}; -} - -#endif // CUBE_LINE_OBJECT_H diff --git a/cube/cube_line_segmenter.cpp b/cube/cube_line_segmenter.cpp deleted file mode 100644 index 278011f0..00000000 --- a/cube/cube_line_segmenter.cpp +++ /dev/null @@ -1,955 +0,0 @@ -/********************************************************************** - * File: cube_page_segmenter.cpp - * Description: Implementation of the Cube Page Segmenter Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -#include "cube_line_segmenter.h" -#include "ndminx.h" - -namespace tesseract { -// constants that worked for Arabic page segmenter -const int CubeLineSegmenter::kLineSepMorphMinHgt = 20; -const int CubeLineSegmenter::kHgtBins = 20; -const double CubeLineSegmenter::kMaxValidLineRatio = 3.2; -const int CubeLineSegmenter::kMaxConnCompHgt = 150; -const int CubeLineSegmenter::kMaxConnCompWid = 500; -const int CubeLineSegmenter::kMaxHorzAspectRatio = 50; -const int CubeLineSegmenter::kMaxVertAspectRatio = 20; -const int CubeLineSegmenter::kMinWid = 2; -const int CubeLineSegmenter::kMinHgt = 2; -const float CubeLineSegmenter::kMinValidLineHgtRatio = 2.5; - -CubeLineSegmenter::CubeLineSegmenter(CubeRecoContext *cntxt, Pix *img) { - cntxt_ = cntxt; - orig_img_ = img; - img_ = NULL; - lines_pixa_ = NULL; - init_ = false; - line_cnt_ = 0; - columns_ = NULL; - con_comps_ = NULL; - est_alef_hgt_ = 0.0; - est_dot_hgt_ = 0.0; -} - -CubeLineSegmenter::~CubeLineSegmenter() { - if (img_ != NULL) { - pixDestroy(&img_); - img_ = NULL; - } - - if (lines_pixa_ != NULL) { - pixaDestroy(&lines_pixa_); - lines_pixa_ = NULL; - } - - if (con_comps_ != NULL) { - pixaDestroy(&con_comps_); - con_comps_ = NULL; - } - - if (columns_ != NULL) { - pixaaDestroy(&columns_); - columns_ = NULL; - } -} - -// compute validity ratio for a line -double CubeLineSegmenter::ValidityRatio(Pix *line_mask_pix, Box *line_box) { - return line_box->h / est_alef_hgt_; -} - -// validate line -bool CubeLineSegmenter::ValidLine(Pix *line_mask_pix, Box *line_box) { - double validity_ratio = ValidityRatio(line_mask_pix, line_box); - - return validity_ratio < kMaxValidLineRatio; -} - -// perform a vertical Closing with the specified threshold -// returning the resulting conn comps as a pixa -Pixa *CubeLineSegmenter::VerticalClosing(Pix *pix, - int threshold, Boxa **boxa) { - char sequence_str[16]; - - // do the morphology - 
sprintf(sequence_str, "c100.%d", threshold); - Pix *morphed_pix = pixMorphCompSequence(pix, sequence_str, 0); - if (morphed_pix == NULL) { - return NULL; - } - - // get the resulting lines by computing concomps - Pixa *pixac; - (*boxa) = pixConnComp(morphed_pix, &pixac, 8); - - pixDestroy(&morphed_pix); - - if ((*boxa) == NULL) { - return NULL; - } - - return pixac; -} - -// Helper cleans up after CrackLine. -static void CleanupCrackLine(int line_cnt, Pixa **lines_pixa, - Boxa **line_con_comps, - Pixa **line_con_comps_pix) { - for (int line = 0; line < line_cnt; line++) { - if (lines_pixa[line] != NULL) { - pixaDestroy(&lines_pixa[line]); - } - } - - delete []lines_pixa; - boxaDestroy(line_con_comps); - pixaDestroy(line_con_comps_pix); -} - -// do a desperate attempt at cracking lines -Pixa *CubeLineSegmenter::CrackLine(Pix *cracked_line_pix, - Box *cracked_line_box, int line_cnt) { - // create lines pixa array - Pixa **lines_pixa = new Pixa*[line_cnt]; - if (lines_pixa == NULL) { - return NULL; - } - - memset(lines_pixa, 0, line_cnt * sizeof(*lines_pixa)); - - // compute line conn comps - Pixa *line_con_comps_pix; - Boxa *line_con_comps = ComputeLineConComps(cracked_line_pix, - cracked_line_box, &line_con_comps_pix); - - if (line_con_comps == NULL) { - delete []lines_pixa; - return NULL; - } - - // assign each conn comp to the a line based on its centroid - for (int con = 0; con < line_con_comps->n; con++) { - Box *con_box = line_con_comps->box[con]; - Pix *con_pix = line_con_comps_pix->pix[con]; - int mid_y = (con_box->y - cracked_line_box->y) + (con_box->h / 2), - line_idx = MIN(line_cnt - 1, - (mid_y * line_cnt / cracked_line_box->h)); - - // create the line if it has not been created? 
- if (lines_pixa[line_idx] == NULL) { - lines_pixa[line_idx] = pixaCreate(line_con_comps->n); - if (lines_pixa[line_idx] == NULL) { - CleanupCrackLine(line_cnt, lines_pixa, &line_con_comps, - &line_con_comps_pix); - return NULL; - } - } - - // add the concomp to the line - if (pixaAddPix(lines_pixa[line_idx], con_pix, L_CLONE) != 0 || - pixaAddBox(lines_pixa[line_idx], con_box, L_CLONE)) { - CleanupCrackLine(line_cnt, lines_pixa, &line_con_comps, - &line_con_comps_pix); - return NULL; - } - } - - // create the lines pixa - Pixa *lines = pixaCreate(line_cnt); - bool success = true; - - // create and check the validity of the lines - for (int line = 0; line < line_cnt; line++) { - Pixa *line_pixa = lines_pixa[line]; - - // skip invalid lines - if (line_pixa == NULL) { - continue; - } - - // merge the pix, check the validity of the line - // and add it to the lines pixa - Box *line_box; - Pix *line_pix = Pixa2Pix(line_pixa, &line_box); - if (line_pix == NULL || - line_box == NULL || - ValidLine(line_pix, line_box) == false || - pixaAddPix(lines, line_pix, L_INSERT) != 0 || - pixaAddBox(lines, line_box, L_INSERT) != 0) { - if (line_pix != NULL) { - pixDestroy(&line_pix); - } - - if (line_box != NULL) { - boxDestroy(&line_box); - } - - success = false; - - break; - } - } - - // cleanup - CleanupCrackLine(line_cnt, lines_pixa, &line_con_comps, - &line_con_comps_pix); - - if (success == false) { - pixaDestroy(&lines); - lines = NULL; - } - - return lines; -} - -// do a desperate attempt at cracking lines -Pixa *CubeLineSegmenter::CrackLine(Pix *cracked_line_pix, - Box *cracked_line_box) { - // estimate max line count - int max_line_cnt = static_cast((cracked_line_box->h / - est_alef_hgt_) + 0.5); - if (max_line_cnt < 2) { - return NULL; - } - - for (int line_cnt = 2; line_cnt < max_line_cnt; line_cnt++) { - Pixa *lines = CrackLine(cracked_line_pix, cracked_line_box, line_cnt); - if (lines != NULL) { - return lines; - } - } - - return NULL; -} - -// split a line 
continuously until valid or fail -Pixa *CubeLineSegmenter::SplitLine(Pix *line_mask_pix, Box *line_box) { - // clone the line mask - Pix *line_pix = pixClone(line_mask_pix); - - if (line_pix == NULL) { - return NULL; - } - - // AND with the image to get the actual line - pixRasterop(line_pix, 0, 0, line_pix->w, line_pix->h, - PIX_SRC & PIX_DST, img_, line_box->x, line_box->y); - - // continue to do rasterop morphology on the line until - // it splits to valid lines or we fail - int morph_hgt = kLineSepMorphMinHgt - 1, - best_threshold = kLineSepMorphMinHgt - 1, - max_valid_portion = 0; - - Boxa *boxa; - Pixa *pixac; - - do { - pixac = VerticalClosing(line_pix, morph_hgt, &boxa); - - // add the box offset to all the lines - // and check for the validity of each - int line, - valid_line_cnt = 0, - valid_portion = 0; - - for (line = 0; line < pixac->n; line++) { - boxa->box[line]->x += line_box->x; - boxa->box[line]->y += line_box->y; - - if (ValidLine(pixac->pix[line], boxa->box[line]) == true) { - // count valid lines - valid_line_cnt++; - - // and the valid portions - valid_portion += boxa->box[line]->h; - } - } - - // all the lines are valid - if (valid_line_cnt == pixac->n) { - boxaDestroy(&boxa); - pixDestroy(&line_pix); - return pixac; - } - - // a larger valid portion - if (valid_portion > max_valid_portion) { - max_valid_portion = valid_portion; - best_threshold = morph_hgt; - } - - boxaDestroy(&boxa); - pixaDestroy(&pixac); - - morph_hgt--; - } - while (morph_hgt > 0); - - // failed to break into valid lines - // attempt to crack the line - pixac = CrackLine(line_pix, line_box); - if (pixac != NULL) { - pixDestroy(&line_pix); - return pixac; - } - - // try to leverage any of the lines - // did the best threshold yield a non zero valid portion - if (max_valid_portion > 0) { - // use this threshold to break lines - pixac = VerticalClosing(line_pix, best_threshold, &boxa); - - // add the box offset to all the lines - // and check for the validity of each - for 
(int line = 0; line < pixac->n; line++) { - boxa->box[line]->x += line_box->x; - boxa->box[line]->y += line_box->y; - - // remove invalid lines from the pixa - if (ValidLine(pixac->pix[line], boxa->box[line]) == false) { - pixaRemovePix(pixac, line); - line--; - } - } - - boxaDestroy(&boxa); - pixDestroy(&line_pix); - return pixac; - } - - // last resort: attempt to crack the line - pixDestroy(&line_pix); - - return NULL; -} - -// Checks of a line is too small -bool CubeLineSegmenter::SmallLine(Box *line_box) { - return line_box->h <= (kMinValidLineHgtRatio * est_dot_hgt_); -} - -// Compute the connected components in a line -Boxa * CubeLineSegmenter::ComputeLineConComps(Pix *line_mask_pix, - Box *line_box, - Pixa **con_comps_pixa) { - // clone the line mask - Pix *line_pix = pixClone(line_mask_pix); - - if (line_pix == NULL) { - return NULL; - } - - // AND with the image to get the actual line - pixRasterop(line_pix, 0, 0, line_pix->w, line_pix->h, - PIX_SRC & PIX_DST, img_, line_box->x, line_box->y); - - // compute the connected components of the line to be merged - Boxa *line_con_comps = pixConnComp(line_pix, con_comps_pixa, 8); - - pixDestroy(&line_pix); - - // offset boxes by the bbox of the line - for (int con = 0; con < line_con_comps->n; con++) { - line_con_comps->box[con]->x += line_box->x; - line_con_comps->box[con]->y += line_box->y; - } - - return line_con_comps; -} - -// create a union of two arbitrary pix -Pix *CubeLineSegmenter::PixUnion(Pix *dest_pix, Box *dest_box, - Pix *src_pix, Box *src_box) { - // compute dimensions of union rect - BOX *union_box = boxBoundingRegion(src_box, dest_box); - - // create the union pix - Pix *union_pix = pixCreate(union_box->w, union_box->h, src_pix->d); - if (union_pix == NULL) { - return NULL; - } - - // blt the src and dest pix - pixRasterop(union_pix, - src_box->x - union_box->x, src_box->y - union_box->y, - src_box->w, src_box->h, PIX_SRC | PIX_DST, src_pix, 0, 0); - - pixRasterop(union_pix, - dest_box->x - 
union_box->x, dest_box->y - union_box->y, - dest_box->w, dest_box->h, PIX_SRC | PIX_DST, dest_pix, 0, 0); - - // replace the dest_box - *dest_box = *union_box; - - boxDestroy(&union_box); - - return union_pix; -} - -// create a union of a number of arbitrary pix -Pix *CubeLineSegmenter::Pixa2Pix(Pixa *pixa, Box **dest_box, - int start_pix, int pix_cnt) { - // compute union_box - int min_x = INT_MAX, - max_x = INT_MIN, - min_y = INT_MAX, - max_y = INT_MIN; - - for (int pix_idx = start_pix; pix_idx < (start_pix + pix_cnt); pix_idx++) { - Box *pix_box = pixa->boxa->box[pix_idx]; - - UpdateRange(pix_box->x, pix_box->x + pix_box->w, &min_x, &max_x); - UpdateRange(pix_box->y, pix_box->y + pix_box->h, &min_y, &max_y); - } - - (*dest_box) = boxCreate(min_x, min_y, max_x - min_x, max_y - min_y); - if ((*dest_box) == NULL) { - return NULL; - } - - // create the union pix - Pix *union_pix = pixCreate((*dest_box)->w, (*dest_box)->h, img_->d); - if (union_pix == NULL) { - boxDestroy(dest_box); - return NULL; - } - - // create a pix corresponding to the union of all pixs - // blt the src and dest pix - for (int pix_idx = start_pix; pix_idx < (start_pix + pix_cnt); pix_idx++) { - Box *pix_box = pixa->boxa->box[pix_idx]; - Pix *con_pix = pixa->pix[pix_idx]; - - pixRasterop(union_pix, - pix_box->x - (*dest_box)->x, pix_box->y - (*dest_box)->y, - pix_box->w, pix_box->h, PIX_SRC | PIX_DST, con_pix, 0, 0); - } - - return union_pix; -} - -// create a union of a number of arbitrary pix -Pix *CubeLineSegmenter::Pixa2Pix(Pixa *pixa, Box **dest_box) { - return Pixa2Pix(pixa, dest_box, 0, pixa->n); -} - -// merges a number of lines into one line given a bounding box and a mask -bool CubeLineSegmenter::MergeLine(Pix *line_mask_pix, Box *line_box, - Pixa *lines, Boxaa *lines_con_comps) { - // compute the connected components of the lines to be merged - Pixa *small_con_comps_pix; - Boxa *small_line_con_comps = ComputeLineConComps(line_mask_pix, - line_box, &small_con_comps_pix); - - if 
(small_line_con_comps == NULL) { - return false; - } - - // for each connected component - for (int con = 0; con < small_line_con_comps->n; con++) { - Box *small_con_comp_box = small_line_con_comps->box[con]; - int best_line = -1, - best_dist = INT_MAX, - small_box_right = small_con_comp_box->x + small_con_comp_box->w, - small_box_bottom = small_con_comp_box->y + small_con_comp_box->h; - - // for each valid line - for (int line = 0; line < lines->n; line++) { - if (SmallLine(lines->boxa->box[line]) == true) { - continue; - } - - // for all the connected components in the line - Boxa *line_con_comps = lines_con_comps->boxa[line]; - - for (int lcon = 0; lcon < line_con_comps->n; lcon++) { - Box *con_comp_box = line_con_comps->box[lcon]; - int xdist, - ydist, - box_right = con_comp_box->x + con_comp_box->w, - box_bottom = con_comp_box->y + con_comp_box->h; - - xdist = MAX(small_con_comp_box->x, con_comp_box->x) - - MIN(small_box_right, box_right); - - ydist = MAX(small_con_comp_box->y, con_comp_box->y) - - MIN(small_box_bottom, box_bottom); - - // if there is an overlap in x-direction - if (xdist <= 0) { - if (best_line == -1 || ydist < best_dist) { - best_dist = ydist; - best_line = line; - } - } - } - } - - // if the distance is too big, do not merged - if (best_line != -1 && best_dist < est_alef_hgt_) { - // add the pix to the best line - Pix *new_line = PixUnion(lines->pix[best_line], - lines->boxa->box[best_line], - small_con_comps_pix->pix[con], small_con_comp_box); - - if (new_line == NULL) { - return false; - } - - pixDestroy(&lines->pix[best_line]); - lines->pix[best_line] = new_line; - } - } - - pixaDestroy(&small_con_comps_pix); - boxaDestroy(&small_line_con_comps); - - return true; -} - -// Creates new set of lines from the computed columns -bool CubeLineSegmenter::AddLines(Pixa *lines) { - // create an array that will hold the bounding boxes - // of the concomps belonging to each line - Boxaa *lines_con_comps = boxaaCreate(lines->n); - if (lines_con_comps 
== NULL) { - return false; - } - - for (int line = 0; line < lines->n; line++) { - // if the line is not valid - if (ValidLine(lines->pix[line], lines->boxa->box[line]) == false) { - // split it - Pixa *split_lines = SplitLine(lines->pix[line], - lines->boxa->box[line]); - - // remove the old line - if (pixaRemovePix(lines, line) != 0) { - return false; - } - - line--; - - if (split_lines == NULL) { - continue; - } - - // add the split lines instead and move the pointer - for (int s_line = 0; s_line < split_lines->n; s_line++) { - Pix *sp_line = pixaGetPix(split_lines, s_line, L_CLONE); - Box *sp_box = boxaGetBox(split_lines->boxa, s_line, L_CLONE); - - if (sp_line == NULL || sp_box == NULL) { - return false; - } - - // insert the new line - if (pixaInsertPix(lines, ++line, sp_line, sp_box) != 0) { - return false; - } - } - - // remove the split lines - pixaDestroy(&split_lines); - } - } - - // compute the concomps bboxes of each line - for (int line = 0; line < lines->n; line++) { - Boxa *line_con_comps = ComputeLineConComps(lines->pix[line], - lines->boxa->box[line], NULL); - - if (line_con_comps == NULL) { - return false; - } - - // insert it into the boxaa array - if (boxaaAddBoxa(lines_con_comps, line_con_comps, L_INSERT) != 0) { - return false; - } - } - - // post process the lines: - // merge the contents of "small" lines info legitimate lines - for (int line = 0; line < lines->n; line++) { - // a small line detected - if (SmallLine(lines->boxa->box[line]) == true) { - // merge its components to one of the valid lines - if (MergeLine(lines->pix[line], lines->boxa->box[line], - lines, lines_con_comps) == true) { - // remove the small line - if (pixaRemovePix(lines, line) != 0) { - return false; - } - - if (boxaaRemoveBoxa(lines_con_comps, line) != 0) { - return false; - } - - line--; - } - } - } - - boxaaDestroy(&lines_con_comps); - - // add the pix masks - if (pixaaAddPixa(columns_, lines, L_INSERT) != 0) { - return false; - } - - return true; -} - -// Index 
the specific pixa using RTL reading order -int *CubeLineSegmenter::IndexRTL(Pixa *pixa) { - int *pix_index = new int[pixa->n]; - if (pix_index == NULL) { - return NULL; - } - - for (int pix = 0; pix < pixa->n; pix++) { - pix_index[pix] = pix; - } - - for (int ipix = 0; ipix < pixa->n; ipix++) { - for (int jpix = ipix + 1; jpix < pixa->n; jpix++) { - Box *ipix_box = pixa->boxa->box[pix_index[ipix]], - *jpix_box = pixa->boxa->box[pix_index[jpix]]; - - // swap? - if ((ipix_box->x + ipix_box->w) < (jpix_box->x + jpix_box->w)) { - int temp = pix_index[ipix]; - pix_index[ipix] = pix_index[jpix]; - pix_index[jpix] = temp; - } - } - } - - return pix_index; -} - -// Performs line segmentation -bool CubeLineSegmenter::LineSegment() { - // Use full image morphology to find columns - // This only works for simple layouts where each column - // of text extends the full height of the input image. - Pix *pix_temp1 = pixMorphCompSequence(img_, "c5.500", 0); - if (pix_temp1 == NULL) { - return false; - } - - // Mask with a single component over each column - Pixa *pixam; - Boxa *boxa = pixConnComp(pix_temp1, &pixam, 8); - - if (boxa == NULL) { - return false; - } - - int init_morph_min_hgt = kLineSepMorphMinHgt; - char sequence_str[16]; - sprintf(sequence_str, "c100.%d", init_morph_min_hgt); - - // Use selective region-based morphology to get the textline mask. 
- Pixa *pixad = pixaMorphSequenceByRegion(img_, pixam, sequence_str, 0, 0); - if (pixad == NULL) { - return false; - } - - // for all columns - int col_cnt = boxaGetCount(boxa); - - // create columns - columns_ = pixaaCreate(col_cnt); - if (columns_ == NULL) { - return false; - } - - // index columns based on readind order (RTL) - int *col_order = IndexRTL(pixad); - if (col_order == NULL) { - return false; - } - - line_cnt_ = 0; - - for (int col_idx = 0; col_idx < col_cnt; col_idx++) { - int col = col_order[col_idx]; - - // get the pix and box corresponding to the column - Pix *pixt3 = pixaGetPix(pixad, col, L_CLONE); - if (pixt3 == NULL) { - delete []col_order; - return false; - } - - Box *col_box = pixad->boxa->box[col]; - - Pixa *pixac; - Boxa *boxa2 = pixConnComp(pixt3, &pixac, 8); - if (boxa2 == NULL) { - delete []col_order; - return false; - } - - // offset the boxes by the column box - for (int line = 0; line < pixac->n; line++) { - pixac->boxa->box[line]->x += col_box->x; - pixac->boxa->box[line]->y += col_box->y; - } - - // add the lines - if (AddLines(pixac) == true) { - if (pixaaAddBox(columns_, col_box, L_CLONE) != 0) { - delete []col_order; - return false; - } - } - - pixDestroy(&pixt3); - boxaDestroy(&boxa2); - - line_cnt_ += columns_->pixa[col_idx]->n; - } - - pixaDestroy(&pixam); - pixaDestroy(&pixad); - boxaDestroy(&boxa); - - delete []col_order; - pixDestroy(&pix_temp1); - - return true; -} - -// Estimate the parameters of the font(s) used in the page -bool CubeLineSegmenter::EstimateFontParams() { - int hgt_hist[kHgtBins]; - int max_hgt; - double mean_hgt; - - // init hgt histogram of concomps - memset(hgt_hist, 0, sizeof(hgt_hist)); - - // compute max hgt - max_hgt = 0; - - for (int con = 0; con < con_comps_->n; con++) { - // skip conn comps that are too long or too wide - if (con_comps_->boxa->box[con]->h > kMaxConnCompHgt || - con_comps_->boxa->box[con]->w > kMaxConnCompWid) { - continue; - } - - max_hgt = MAX(max_hgt, 
con_comps_->boxa->box[con]->h); - } - - if (max_hgt <= 0) { - return false; - } - - // init hgt histogram of concomps - memset(hgt_hist, 0, sizeof(hgt_hist)); - - // compute histogram - mean_hgt = 0.0; - for (int con = 0; con < con_comps_->n; con++) { - // skip conn comps that are too long or too wide - if (con_comps_->boxa->box[con]->h > kMaxConnCompHgt || - con_comps_->boxa->box[con]->w > kMaxConnCompWid) { - continue; - } - - int bin = static_cast(kHgtBins * con_comps_->boxa->box[con]->h / - max_hgt); - bin = MIN(bin, kHgtBins - 1); - hgt_hist[bin]++; - mean_hgt += con_comps_->boxa->box[con]->h; - } - - mean_hgt /= con_comps_->n; - - // find the top 2 bins - int idx[kHgtBins]; - - for (int bin = 0; bin < kHgtBins; bin++) { - idx[bin] = bin; - } - - for (int ibin = 0; ibin < 2; ibin++) { - for (int jbin = ibin + 1; jbin < kHgtBins; jbin++) { - if (hgt_hist[idx[ibin]] < hgt_hist[idx[jbin]]) { - int swap = idx[ibin]; - idx[ibin] = idx[jbin]; - idx[jbin] = swap; - } - } - } - - // emperically, we found out that the 2 highest freq bins correspond - // respectively to the dot and alef - est_dot_hgt_ = (1.0 * (idx[0] + 1) * max_hgt / kHgtBins); - est_alef_hgt_ = (1.0 * (idx[1] + 1) * max_hgt / kHgtBins); - - // as a sanity check the dot hgt must be significanly lower than alef - if (est_alef_hgt_ < (est_dot_hgt_ * 2)) { - // use max_hgt to estimate instead - est_alef_hgt_ = mean_hgt * 1.5; - est_dot_hgt_ = est_alef_hgt_ / 5.0; - } - - est_alef_hgt_ = MAX(est_alef_hgt_, est_dot_hgt_ * 4.0); - - return true; -} - -// clean up the image -Pix *CubeLineSegmenter::CleanUp(Pix *orig_img) { - // get rid of long horizontal lines - Pix *pix_temp0 = pixMorphCompSequence(orig_img, "o300.2", 0); - pixXor(pix_temp0, pix_temp0, orig_img); - - // get rid of long vertical lines - Pix *pix_temp1 = pixMorphCompSequence(pix_temp0, "o2.300", 0); - pixXor(pix_temp1, pix_temp1, pix_temp0); - - pixDestroy(&pix_temp0); - - // detect connected components - Pixa *con_comps; - Boxa *boxa = 
pixConnComp(pix_temp1, &con_comps, 8); - if (boxa == NULL) { - return NULL; - } - - // detect and remove suspicious conn comps - for (int con = 0; con < con_comps->n; con++) { - Box *box = boxa->box[con]; - - // remove if suspc. conn comp - if ((box->w > (box->h * kMaxHorzAspectRatio)) || - (box->h > (box->w * kMaxVertAspectRatio)) || - (box->w < kMinWid && box->h < kMinHgt)) { - pixRasterop(pix_temp1, box->x, box->y, box->w, box->h, - PIX_SRC ^ PIX_DST, con_comps->pix[con], 0, 0); - } - } - - pixaDestroy(&con_comps); - boxaDestroy(&boxa); - - return pix_temp1; -} - -// Init the page segmenter -bool CubeLineSegmenter::Init() { - if (init_ == true) { - return true; - } - - if (orig_img_ == NULL) { - return false; - } - - // call the internal line segmentation - return FindLines(); -} - -// return the pix mask and box of a specific line -Pix *CubeLineSegmenter::Line(int line, Box **line_box) { - if (init_ == false && Init() == false) { - return NULL; - } - - if (line < 0 || line >= line_cnt_) { - return NULL; - } - - (*line_box) = lines_pixa_->boxa->box[line]; - return lines_pixa_->pix[line]; -} - -// Implements a basic rudimentary layout analysis based on Leptonica -// works OK for Arabic. For other languages, the function TesseractPageAnalysis -// should be called instead. 
-bool CubeLineSegmenter::FindLines() { - // convert the image to gray scale if necessary - Pix *gray_scale_img = NULL; - if (orig_img_->d != 2 && orig_img_->d != 8) { - gray_scale_img = pixConvertTo8(orig_img_, false); - if (gray_scale_img == NULL) { - return false; - } - } else { - gray_scale_img = orig_img_; - } - - // threshold image - Pix *thresholded_img; - thresholded_img = pixThresholdToBinary(gray_scale_img, 128); - // free the gray scale image if necessary - if (gray_scale_img != orig_img_) { - pixDestroy(&gray_scale_img); - } - // bail-out if thresholding failed - if (thresholded_img == NULL) { - return false; - } - - // deskew - Pix *deskew_img = pixDeskew(thresholded_img, 2); - if (deskew_img == NULL) { - return false; - } - - pixDestroy(&thresholded_img); - - img_ = CleanUp(deskew_img); - pixDestroy(&deskew_img); - if (img_ == NULL) { - return false; - } - - pixDestroy(&deskew_img); - - // compute connected components - Boxa *boxa = pixConnComp(img_, &con_comps_, 8); - if (boxa == NULL) { - return false; - } - - boxaDestroy(&boxa); - - // estimate dot and alef hgts - if (EstimateFontParams() == false) { - return false; - } - - // perform line segmentation - if (LineSegment() == false) { - return false; - } - - // success - init_ = true; - return true; -} - -} diff --git a/cube/cube_line_segmenter.h b/cube/cube_line_segmenter.h deleted file mode 100644 index 43d19e57..00000000 --- a/cube/cube_line_segmenter.h +++ /dev/null @@ -1,156 +0,0 @@ -/********************************************************************** - * File: cube_page_segmenter.h - * Description: Declaration of the Cube Page Segmenter Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// TODO(ahmadab) -// This is really a makeshift line segmenter that works well for Arabic -// This should eventually be replaced by Ray Smith's Page segmenter -// There are lots of magic numbers below that were determined empirically -// but not thoroughly tested - -#ifndef CUBE_LINE_SEGMENTER_H -#define CUBE_LINE_SEGMENTER_H - -#include "cube_reco_context.h" -#include "allheaders.h" - -namespace tesseract { - -class CubeLineSegmenter { - public: - CubeLineSegmenter(CubeRecoContext *cntxt, Pix *img); - ~CubeLineSegmenter(); - - // Accessor functions - Pix *PostProcessedImage() { - if (init_ == false && Init() == false) { - return NULL; - } - return img_; - } - int ColumnCnt() { - if (init_ == false && Init() == false) { - return 0; - } - return columns_->n; - } - Box *Column(int col) { - if (init_ == false && Init() == false) { - return NULL; - } - - return columns_->boxa->box[col]; - } - int LineCnt() { - if (init_ == false && Init() == false) { - return 0; - } - - return line_cnt_; - } - Pixa *ConComps() { - if (init_ == false && Init() == false) { - return NULL; - } - - return con_comps_; - } - Pixaa *Columns() { - if (init_ == false && Init() == false) { - return NULL; - } - - return columns_; - } - inline double AlefHgtEst() { return est_alef_hgt_; } - inline double DotHgtEst() { return est_dot_hgt_; } - Pix *Line(int line, Box **line_box); - - private: - static const float kMinValidLineHgtRatio; - static const int kLineSepMorphMinHgt; - static const int kHgtBins; 
- static const int kMaxConnCompHgt; - static const int kMaxConnCompWid; - static const int kMaxHorzAspectRatio; - static const int kMaxVertAspectRatio; - static const int kMinWid; - static const int kMinHgt; - static const double kMaxValidLineRatio; - - // Cube Reco context - CubeRecoContext *cntxt_; - // Original image - Pix *orig_img_; - // Post processed image - Pix *img_; - // Init flag - bool init_; - // Output Line and column info - int line_cnt_; - Pixaa *columns_; - Pixa *con_comps_; - Pixa *lines_pixa_; - // Estimates for sizes of ALEF and DOT needed for Arabic analysis - double est_alef_hgt_; - double est_dot_hgt_; - - // Init the page analysis - bool Init(); - // Performs line segmentation - bool LineSegment(); - // Cleanup function - Pix *CleanUp(Pix *pix); - // compute validity ratio for a line - double ValidityRatio(Pix *line_mask_pix, Box *line_box); - // validate line - bool ValidLine(Pix *line_mask_pix, Box *line_box); - // split a line continuously until valid or fail - Pixa *SplitLine(Pix *line_mask_pix, Box *line_box); - // do a desperate attempt at cracking lines - Pixa *CrackLine(Pix *line_mask_pix, Box *line_box); - Pixa *CrackLine(Pix *line_mask_pix, Box *line_box, int line_cnt); - // Checks of a line is too small - bool SmallLine(Box *line_box); - // Compute the connected components in a line - Boxa * ComputeLineConComps(Pix *line_mask_pix, Box *line_box, - Pixa **con_comps_pixa); - // create a union of two arbitrary pix - Pix *PixUnion(Pix *dest_pix, Box *dest_box, Pix *src_pix, Box *src_box); - // create a union of a pixa subset - Pix *Pixa2Pix(Pixa *pixa, Box **dest_box, int start_pix, int pix_cnt); - // create a union of a pixa - Pix *Pixa2Pix(Pixa *pixa, Box **dest_box); - // merges a number of lines into one line given a bounding box and a mask - bool MergeLine(Pix *line_mask_pix, Box *line_box, - Pixa *lines, Boxaa *lines_con_comps); - // Creates new set of lines from the computed columns - bool AddLines(Pixa *lines); - // Estimate 
the parameters of the font(s) used in the page - bool EstimateFontParams(); - // perform a vertical Closing with the specified threshold - // returning the resulting conn comps as a pixa - Pixa *VerticalClosing(Pix *pix, int thresold, Boxa **boxa); - // Index the specific pixa using RTL reading order - int *IndexRTL(Pixa *pixa); - // Implements a rudimentary page & line segmenter - bool FindLines(); -}; -} - -#endif // CUBE_LINE_SEGMENTER_H diff --git a/cube/cube_object.cpp b/cube/cube_object.cpp deleted file mode 100644 index c7dec4d5..00000000 --- a/cube/cube_object.cpp +++ /dev/null @@ -1,297 +0,0 @@ -/********************************************************************** - * File: cube_object.cpp - * Description: Implementation of the Cube Object Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -#include -#include "cube_object.h" -#include "cube_utils.h" -#include "word_list_lang_model.h" - -namespace tesseract { -CubeObject::CubeObject(CubeRecoContext *cntxt, CharSamp *char_samp) { - Init(); - char_samp_ = char_samp; - cntxt_ = cntxt; -} - -CubeObject::CubeObject(CubeRecoContext *cntxt, Pix *pix, - int left, int top, int wid, int hgt) { - Init(); - char_samp_ = CubeUtils::CharSampleFromPix(pix, left, top, wid, hgt); - own_char_samp_ = true; - cntxt_ = cntxt; -} - -// Data member initialization function -void CubeObject::Init() { - char_samp_ = NULL; - own_char_samp_ = false; - alt_list_ = NULL; - srch_obj_ = NULL; - deslanted_alt_list_ = NULL; - deslanted_srch_obj_ = NULL; - deslanted_ = false; - deslanted_char_samp_ = NULL; - beam_obj_ = NULL; - deslanted_beam_obj_ = NULL; - cntxt_ = NULL; -} - -// Cleanup function -void CubeObject::Cleanup() { - if (alt_list_ != NULL) { - delete alt_list_; - alt_list_ = NULL; - } - - if (deslanted_alt_list_ != NULL) { - delete deslanted_alt_list_; - deslanted_alt_list_ = NULL; - } -} - -CubeObject::~CubeObject() { - if (char_samp_ != NULL && own_char_samp_ == true) { - delete char_samp_; - char_samp_ = NULL; - } - - if (srch_obj_ != NULL) { - delete srch_obj_; - srch_obj_ = NULL; - } - - if (deslanted_srch_obj_ != NULL) { - delete deslanted_srch_obj_; - deslanted_srch_obj_ = NULL; - } - - if (beam_obj_ != NULL) { - delete beam_obj_; - beam_obj_ = NULL; - } - - if (deslanted_beam_obj_ != NULL) { - delete deslanted_beam_obj_; - deslanted_beam_obj_ = NULL; - } - - if (deslanted_char_samp_ != NULL) { - delete deslanted_char_samp_; - deslanted_char_samp_ = NULL; - } - - Cleanup(); -} - -/** - * Actually do the recognition using the specified language mode. If none - * is specified, the default language model in the CubeRecoContext is used. 
- * @return the sorted list of alternate answers - * @param word_mode determines whether recognition is done as a word or a phrase - */ -WordAltList *CubeObject::Recognize(LangModel *lang_mod, bool word_mode) { - if (char_samp_ == NULL) { - return NULL; - } - - // clear alt lists - Cleanup(); - - // no specified language model, use the one in the reco context - if (lang_mod == NULL) { - lang_mod = cntxt_->LangMod(); - } - - // normalize if necessary - if (cntxt_->SizeNormalization()) { - Normalize(); - } - - // assume not de-slanted by default - deslanted_ = false; - - // create a beam search object - if (beam_obj_ == NULL) { - beam_obj_ = new BeamSearch(cntxt_, word_mode); - if (beam_obj_ == NULL) { - fprintf(stderr, "Cube ERROR (CubeObject::Recognize): could not construct " - "BeamSearch\n"); - return NULL; - } - } - - // create a cube search object - if (srch_obj_ == NULL) { - srch_obj_ = new CubeSearchObject(cntxt_, char_samp_); - if (srch_obj_ == NULL) { - fprintf(stderr, "Cube ERROR (CubeObject::Recognize): could not construct " - "CubeSearchObject\n"); - return NULL; - } - } - - // run a beam search against the tesslang model - alt_list_ = beam_obj_->Search(srch_obj_, lang_mod); - - // deslant (if supported by language) and re-reco if probability is low enough - if (cntxt_->HasItalics() == true && - (alt_list_ == NULL || alt_list_->AltCount() < 1 || - alt_list_->AltCost(0) > CubeUtils::Prob2Cost(kMinProbSkipDeslanted))) { - - if (deslanted_beam_obj_ == NULL) { - deslanted_beam_obj_ = new BeamSearch(cntxt_); - if (deslanted_beam_obj_ == NULL) { - fprintf(stderr, "Cube ERROR (CubeObject::Recognize): could not " - "construct deslanted BeamSearch\n"); - return NULL; - } - } - - if (deslanted_srch_obj_ == NULL) { - deslanted_char_samp_ = char_samp_->Clone(); - if (deslanted_char_samp_ == NULL) { - fprintf(stderr, "Cube ERROR (CubeObject::Recognize): could not " - "construct deslanted CharSamp\n"); - return NULL; - } - - if (deslanted_char_samp_->Deslant() == 
false) { - return NULL; - } - - deslanted_srch_obj_ = new CubeSearchObject(cntxt_, deslanted_char_samp_); - if (deslanted_srch_obj_ == NULL) { - fprintf(stderr, "Cube ERROR (CubeObject::Recognize): could not " - "construct deslanted CubeSearchObject\n"); - return NULL; - } - } - - // run a beam search against the tesslang model - deslanted_alt_list_ = deslanted_beam_obj_->Search(deslanted_srch_obj_, - lang_mod); - // should we use de-slanted altlist? - if (deslanted_alt_list_ != NULL && deslanted_alt_list_->AltCount() > 0) { - if (alt_list_ == NULL || alt_list_->AltCount() < 1 || - deslanted_alt_list_->AltCost(0) < alt_list_->AltCost(0)) { - deslanted_ = true; - return deslanted_alt_list_; - } - } - } - - return alt_list_; -} - -/** - * Recognize the member char sample as a word - */ -WordAltList *CubeObject::RecognizeWord(LangModel *lang_mod) { - return Recognize(lang_mod, true); -} - -/** - * Recognize the member char sample as a phrase - */ -WordAltList *CubeObject::RecognizePhrase(LangModel *lang_mod) { - return Recognize(lang_mod, false); -} - -/** - * Computes the cost of a specific string. This is done by performing - * recognition of a language model that allows only the specified word - */ -int CubeObject::WordCost(const char *str) { - WordListLangModel *lang_mod = new WordListLangModel(cntxt_); - if (lang_mod == NULL) { - return WORST_COST; - } - - if (lang_mod->AddString(str) == false) { - delete lang_mod; - return WORST_COST; - } - - // run a beam search against the single string wordlist model - WordAltList *alt_list = RecognizeWord(lang_mod); - delete lang_mod; - - int cost = WORST_COST; - if (alt_list != NULL) { - if (alt_list->AltCount() > 0) { - cost = alt_list->AltCost(0); - } - } - - return cost; -} - -// Recognizes a single character and returns the list of results. 
-CharAltList *CubeObject::RecognizeChar() { - if (char_samp_ == NULL) return NULL; - CharAltList* alt_list = NULL; - CharClassifier *char_classifier = cntxt_->Classifier(); - ASSERT_HOST(char_classifier != NULL); - alt_list = char_classifier->Classify(char_samp_); - return alt_list; -} - -// Normalize the input word bitmap to have a minimum aspect ratio -bool CubeObject::Normalize() { - // create a cube search object - CubeSearchObject *srch_obj = new CubeSearchObject(cntxt_, char_samp_); - if (srch_obj == NULL) { - return false; - } - // Perform over-segmentation - int seg_cnt = srch_obj->SegPtCnt(); - // Only perform normalization if segment count is large enough - if (seg_cnt < kMinNormalizationSegmentCnt) { - delete srch_obj; - return true; - } - // compute the mean AR of the segments - double ar_mean = 0.0; - for (int seg_idx = 0; seg_idx <= seg_cnt; seg_idx++) { - CharSamp *seg_samp = srch_obj->CharSample(seg_idx - 1, seg_idx); - if (seg_samp != NULL && seg_samp->Width() > 0) { - ar_mean += (1.0 * seg_samp->Height() / seg_samp->Width()); - } - } - ar_mean /= (seg_cnt + 1); - // perform normalization if segment AR is too high - if (ar_mean > kMinNormalizationAspectRatio) { - // scale down the image in the y-direction to attain AR - CharSamp *new_samp = char_samp_->Scale(char_samp_->Width(), - 2.0 * char_samp_->Height() / ar_mean, - false); - if (new_samp != NULL) { - // free existing char samp if owned - if (own_char_samp_) { - delete char_samp_; - } - // update with new scaled charsamp and set ownership flag - char_samp_ = new_samp; - own_char_samp_ = true; - } - } - delete srch_obj; - return true; -} -} diff --git a/cube/cube_object.h b/cube/cube_object.h deleted file mode 100644 index e1a85145..00000000 --- a/cube/cube_object.h +++ /dev/null @@ -1,171 +0,0 @@ -/********************************************************************** - * File: cube_object.h - * Description: Declaration of the Cube Object Class - * Author: Ahmad Abdulkader - * Created: 2007 - * 
- * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The CubeObject class is the main class used to perform recognition of -// a specific char_samp as a single word. -// To recognize a word, a CubeObject is constructed for this word. -// A Call to RecognizeWord is then issued specifying the language model that -// will be used during recognition. If none is specified, the default language -// model in the CubeRecoContext is used. The CubeRecoContext is passed at -// construction time -// -// The typical usage pattern for Cube is shown below: -// -// // Create and initialize Tesseract object and get its -// // CubeRecoContext object (note that Tesseract object owns it, -// // so it will be freed when the Tesseract object is freed). -// tesseract::Tesseract *tess_obj = new tesseract::Tesseract(); -// tess_obj->init_tesseract(data_path, lang, tesseract::OEM_CUBE_ONLY); -// CubeRecoContext *cntxt = tess_obj->GetCubeRecoContext(); -// CHECK(cntxt != NULL) << "Unable to create a Cube reco context"; -// . -// . -// . 
-// // Do this to recognize a word in pix whose co-ordinates are -// // (left,top,width,height) -// tesseract::CubeObject *cube_obj; -// cube_obj = new tesseract::CubeObject(cntxt, pix, -// left, top, width, height); -// -// // Get back Cube's list of answers -// tesseract::WordAltList *alt_list = cube_obj->RecognizeWord(); -// CHECK(alt_list != NULL && alt_list->AltCount() > 0); -// -// // Get the string and cost of every alternate -// for (int alt = 0; alt < alt_list->AltCount(); alt++) { -// // Return the result as a UTF-32 string -// string_32 res_str32 = alt_list->Alt(alt); -// // Convert to UTF8 if need-be -// string res_str; -// CubeUtils::UTF32ToUTF8(res_str32.c_str(), &res_str); -// // Get the string cost. This should get bigger as you go deeper -// // in the list -// int cost = alt_list->AltCost(alt); -// } -// -// // Call this once you are done recognizing this word -// delete cube_obj; -// -// // Call this once you are done recognizing all words with -// // for the current language -// delete tess_obj; -// -// Note that if the language supports "Italics" (see the CubeRecoContext), the -// RecognizeWord function attempts to de-slant the word. - -#ifndef CUBE_OBJECT_H -#define CUBE_OBJECT_H - -#include "char_samp.h" -#include "word_altlist.h" -#include "beam_search.h" -#include "cube_search_object.h" -#include "tess_lang_model.h" -#include "cube_reco_context.h" - -namespace tesseract { - -// minimum aspect ratio needed to normalize a char_samp before recognition -static const float kMinNormalizationAspectRatio = 3.5; -// minimum probability a top alt choice must meet before having -// deslanted processing applied to it -static const float kMinProbSkipDeslanted = 0.25; - -class CubeObject { - public: - // Different flavors of constructor. 
They just differ in the way the - // word image is specified - CubeObject(CubeRecoContext *cntxt, CharSamp *char_samp); - CubeObject(CubeRecoContext *cntxt, Pix *pix, - int left, int top, int wid, int hgt); - ~CubeObject(); - - // Perform the word recognition using the specified language mode. If none - // is specified, the default language model in the CubeRecoContext is used. - // Returns the sorted list of alternate word answers - WordAltList *RecognizeWord(LangModel *lang_mod = NULL); - // Same as RecognizeWord but recognizes as a phrase - WordAltList *RecognizePhrase(LangModel *lang_mod = NULL); - // Computes the cost of a specific string. This is done by performing - // recognition of a language model that allows only the specified word. - // The alternate list(s) will be permanently modified. - int WordCost(const char *str); - // Recognizes a single character and returns the list of results. - CharAltList *RecognizeChar(); - - // Returns the BeamSearch object that resulted from the last call to - // RecognizeWord - inline BeamSearch *BeamObj() const { - return (deslanted_ == true ? deslanted_beam_obj_ : beam_obj_); - } - // Returns the WordAltList object that resulted from the last call to - // RecognizeWord - inline WordAltList *AlternateList() const { - return (deslanted_ == true ? deslanted_alt_list_ : alt_list_); - } - // Returns the CubeSearchObject object that resulted from the last call to - // RecognizeWord - inline CubeSearchObject *SrchObj() const { - return (deslanted_ == true ? deslanted_srch_obj_ : srch_obj_); - } - // Returns the CharSamp object that resulted from the last call to - // RecognizeWord. Note that this object is not necessarily identical to the - // one passed at construction time as normalization might have occurred - inline CharSamp *CharSample() const { - return (deslanted_ == true ? 
deslanted_char_samp_ : char_samp_); - } - - // Set the ownership of the CharSamp - inline void SetCharSampOwnership(bool own_char_samp) { - own_char_samp_ = own_char_samp; - } - - protected: - // Normalize the CharSamp if its aspect ratio exceeds the below constant. - bool Normalize(); - - private: - // minimum segment count needed to normalize a char_samp before recognition - static const int kMinNormalizationSegmentCnt = 4; - - // Data member initialization function - void Init(); - // Free alternate lists. - void Cleanup(); - // Perform the actual recognition using the specified language mode. If none - // is specified, the default language model in the CubeRecoContext is used. - // Returns the sorted list of alternate answers. Called by both - // RecognizerWord (word_mode is true) or RecognizePhrase (word mode is false) - WordAltList *Recognize(LangModel *lang_mod, bool word_mode); - - CubeRecoContext *cntxt_; - BeamSearch *beam_obj_; - BeamSearch *deslanted_beam_obj_; - bool own_char_samp_; - bool deslanted_; - CharSamp *char_samp_; - CharSamp *deslanted_char_samp_; - CubeSearchObject *srch_obj_; - CubeSearchObject *deslanted_srch_obj_; - WordAltList *alt_list_; - WordAltList *deslanted_alt_list_; -}; -} - -#endif // CUBE_OBJECT_H diff --git a/cube/cube_search_object.cpp b/cube/cube_search_object.cpp deleted file mode 100644 index 61294f26..00000000 --- a/cube/cube_search_object.cpp +++ /dev/null @@ -1,455 +0,0 @@ -/********************************************************************** - * File: cube_search_object.cpp - * Description: Implementation of the Cube Search Object Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include "cube_search_object.h" -#include "cube_utils.h" -#include "ndminx.h" - -namespace tesseract { - -const bool CubeSearchObject::kUseCroppedChars = true; - -CubeSearchObject::CubeSearchObject(CubeRecoContext *cntxt, CharSamp *samp) - : SearchObject(cntxt) { - init_ = false; - reco_cache_ = NULL; - samp_cache_ = NULL; - segments_ = NULL; - segment_cnt_ = 0; - samp_ = samp; - left_ = 0; - itop_ = 0; - space_cost_ = NULL; - no_space_cost_ = NULL; - wid_ = samp_->Width(); - hgt_ = samp_->Height(); - max_seg_per_char_ = cntxt_->Params()->MaxSegPerChar(); - rtl_ = (cntxt_->ReadingOrder() == CubeRecoContext::R2L); - min_spc_gap_ = - static_cast(hgt_ * cntxt_->Params()->MinSpaceHeightRatio()); - max_spc_gap_ = - static_cast(hgt_ * cntxt_->Params()->MaxSpaceHeightRatio()); -} - -CubeSearchObject::~CubeSearchObject() { - Cleanup(); -} - -// Cleanup -void CubeSearchObject::Cleanup() { - // delete Recognition Cache - if (reco_cache_) { - for (int strt_seg = 0; strt_seg < segment_cnt_; strt_seg++) { - if (reco_cache_[strt_seg]) { - for (int end_seg = 0; end_seg < segment_cnt_; end_seg++) { - if (reco_cache_[strt_seg][end_seg]) { - delete reco_cache_[strt_seg][end_seg]; - } - } - delete []reco_cache_[strt_seg]; - } - } - delete []reco_cache_; - reco_cache_ = NULL; - } - - // delete CharSamp Cache - if (samp_cache_) { - for (int strt_seg = 0; strt_seg < segment_cnt_; strt_seg++) { - if (samp_cache_[strt_seg]) { - for (int end_seg = 0; end_seg < segment_cnt_; end_seg++) { - if 
(samp_cache_[strt_seg][end_seg]) { - delete samp_cache_[strt_seg][end_seg]; - } - } - delete []samp_cache_[strt_seg]; - } - } - delete []samp_cache_; - samp_cache_ = NULL; - } - - // delete segment list - if (segments_) { - for (int seg = 0; seg < segment_cnt_; seg++) { - if (segments_[seg]) { - delete segments_[seg]; - } - } - delete []segments_; - segments_ = NULL; - } - - if (space_cost_) { - delete []space_cost_; - space_cost_ = NULL; - } - - if (no_space_cost_) { - delete []no_space_cost_; - no_space_cost_ = NULL; - } - - segment_cnt_ = 0; - init_ = false; -} - -// # of segmentation points. One less than the count of segments -int CubeSearchObject::SegPtCnt() { - if (!init_ && !Init()) - return -1; - return segment_cnt_ - 1; -} - -// init and allocate variables, perform segmentation -bool CubeSearchObject::Init() { - if (init_) - return true; - if (!Segment()) { - return false; - } - - // init cache - reco_cache_ = new CharAltList **[segment_cnt_]; - if (reco_cache_ == NULL) { - fprintf(stderr, "Cube ERROR (CubeSearchObject::Init): could not " - "allocate CharAltList array\n"); - return false; - } - - samp_cache_ = new CharSamp **[segment_cnt_]; - if (samp_cache_ == NULL) { - fprintf(stderr, "Cube ERROR (CubeSearchObject::Init): could not " - "allocate CharSamp array\n"); - return false; - } - - for (int seg = 0; seg < segment_cnt_; seg++) { - reco_cache_[seg] = new CharAltList *[segment_cnt_]; - if (reco_cache_[seg] == NULL) { - fprintf(stderr, "Cube ERROR (CubeSearchObject::Init): could not " - "allocate a single segment's CharAltList array\n"); - return false; - } - - memset(reco_cache_[seg], 0, segment_cnt_ * sizeof(*reco_cache_[seg])); - - samp_cache_[seg] = new CharSamp *[segment_cnt_]; - if (samp_cache_[seg] == NULL) { - fprintf(stderr, "Cube ERROR (CubeSearchObject::Init): could not " - "allocate a single segment's CharSamp array\n"); - return false; - } - - memset(samp_cache_[seg], 0, segment_cnt_ * sizeof(*samp_cache_[seg])); - } - - init_ = true; - 
return true; -} - -// returns a char sample corresponding to the bitmap between 2 seg pts -CharSamp *CubeSearchObject::CharSample(int start_pt, int end_pt) { - // init if necessary - if (!init_ && !Init()) - return NULL; - // validate segment range - if (!IsValidSegmentRange(start_pt, end_pt)) - return NULL; - - // look for the samp in the cache - if (samp_cache_ && samp_cache_[start_pt + 1] && - samp_cache_[start_pt + 1][end_pt]) { - return samp_cache_[start_pt + 1][end_pt]; - } - // create a char samp object from the specified range of segments - bool left_most; - bool right_most; - CharSamp *samp = CharSamp::FromConComps(segments_, start_pt + 1, - end_pt - start_pt, NULL, - &left_most, &right_most, hgt_); - if (!samp) - return NULL; - - if (kUseCroppedChars) { - CharSamp *cropped_samp = samp->Crop(); - // we no longer need the orig sample - delete samp; - if (!cropped_samp) - return NULL; - samp = cropped_samp; - } - - // get the dimensions of the new cropped sample - int char_top = samp->Top(); - int char_wid = samp->Width(); - int char_hgt = samp->Height(); - - // for cursive languages, these features correspond to whether - // the charsamp is at the beginning or end of conncomp - if (cntxt_->Cursive() == true) { - // first and last char flags depend on reading order - bool first_char = rtl_ ? right_most : left_most; - bool last_char = rtl_ ? left_most : right_most; - - samp->SetFirstChar(first_char ? 255 : 0); - samp->SetLastChar(last_char ? 255 : 0); - } else { - // for non cursive languages, these features correspond - // to whether the charsamp is at the beginning or end of the word - samp->SetFirstChar((start_pt == -1) ? 255 : 0); - samp->SetLastChar((end_pt == (segment_cnt_ - 1)) ? 
255 : 0); - } - samp->SetNormTop(255 * char_top / hgt_); - samp->SetNormBottom(255 * (char_top + char_hgt) / hgt_); - samp->SetNormAspectRatio(255 * char_wid / (char_wid + char_hgt)); - - // add to cache & return - samp_cache_[start_pt + 1][end_pt] = samp; - return samp; -} - -Box *CubeSearchObject::CharBox(int start_pt, int end_pt) { - if (!init_ && !Init()) - return NULL; - if (!IsValidSegmentRange(start_pt, end_pt)) { - fprintf(stderr, "Cube ERROR (CubeSearchObject::CharBox): invalid " - "segment range (%d, %d)\n", start_pt, end_pt); - return NULL; - } - - // create a char samp object from the specified range of segments, - // extract its dimensions into a leptonica box, and delete it - bool left_most; - bool right_most; - CharSamp *samp = CharSamp::FromConComps(segments_, start_pt + 1, - end_pt - start_pt, NULL, - &left_most, &right_most, hgt_); - if (!samp) - return NULL; - if (kUseCroppedChars) { - CharSamp *cropped_samp = samp->Crop(); - delete samp; - if (!cropped_samp) { - return NULL; - } - samp = cropped_samp; - } - Box *box = boxCreate(samp->Left(), samp->Top(), - samp->Width(), samp->Height()); - delete samp; - return box; -} - -// call from Beam Search to return the alt list corresponding to -// recognizing the bitmap between two segmentation pts -CharAltList * CubeSearchObject::RecognizeSegment(int start_pt, int end_pt) { - // init if necessary - if (!init_ && !Init()) { - fprintf(stderr, "Cube ERROR (CubeSearchObject::RecognizeSegment): could " - "not initialize CubeSearchObject\n"); - return NULL; - } - - // validate segment range - if (!IsValidSegmentRange(start_pt, end_pt)) { - fprintf(stderr, "Cube ERROR (CubeSearchObject::RecognizeSegment): invalid " - "segment range (%d, %d)\n", start_pt, end_pt); - return NULL; - } - - // look for the recognition results in cache in the cache - if (reco_cache_ && reco_cache_[start_pt + 1] && - reco_cache_[start_pt + 1][end_pt]) { - return reco_cache_[start_pt + 1][end_pt]; - } - - // create the char sample 
corresponding to the blob - CharSamp *samp = CharSample(start_pt, end_pt); - if (!samp) { - fprintf(stderr, "Cube ERROR (CubeSearchObject::RecognizeSegment): could " - "not construct CharSamp\n"); - return NULL; - } - - // recognize the char sample - CharClassifier *char_classifier = cntxt_->Classifier(); - if (char_classifier) { - reco_cache_[start_pt + 1][end_pt] = char_classifier->Classify(samp); - } else { - // no classifer: all characters are equally probable; add a penalty - // that favors 2-segment characters and aspect ratios (w/h) > 1 - fprintf(stderr, "Cube WARNING (CubeSearchObject::RecognizeSegment): cube " - "context has no character classifier!! Inventing a probability " - "distribution.\n"); - int class_cnt = cntxt_->CharacterSet()->ClassCount(); - CharAltList *alt_list = new CharAltList(cntxt_->CharacterSet(), class_cnt); - int seg_cnt = end_pt - start_pt; - double prob_val = (1.0 / class_cnt) * - exp(-fabs(seg_cnt - 2.0)) * - exp(-samp->Width() / static_cast(samp->Height())); - - if (alt_list) { - for (int class_idx = 0; class_idx < class_cnt; class_idx++) { - alt_list->Insert(class_idx, CubeUtils::Prob2Cost(prob_val)); - } - reco_cache_[start_pt + 1][end_pt] = alt_list; - } - } - - return reco_cache_[start_pt + 1][end_pt]; -} - -// Perform segmentation of the bitmap by detecting connected components, -// segmenting each connected component using windowed vertical pixel density -// histogram and sorting the resulting segments in reading order -bool CubeSearchObject::Segment() { - if (!samp_) - return false; - segment_cnt_ = 0; - segments_ = samp_->Segment(&segment_cnt_, rtl_, - cntxt_->Params()->HistWindWid(), - cntxt_->Params()->MinConCompSize()); - if (!segments_ || segment_cnt_ <= 0) { - return false; - } - if (segment_cnt_ >= kMaxSegmentCnt) { - return false; - } - return true; -} - -// computes the space and no space costs at gaps between segments -bool CubeSearchObject::ComputeSpaceCosts() { - // init if necessary - if (!init_ && !Init()) - 
return false; - - // Already computed - if (space_cost_) - return true; - - // No segmentation points - if (segment_cnt_ < 2) - return false; - - // Compute the maximum x to the left of and minimum x to the right of each - // segmentation point - int *max_left_x = new int[segment_cnt_ - 1]; - int *min_right_x = new int[segment_cnt_ - 1]; - if (!max_left_x || !min_right_x) { - delete []min_right_x; - delete []max_left_x; - return false; - } - if (rtl_) { - min_right_x[0] = segments_[0]->Left(); - max_left_x[segment_cnt_ - 2] = segments_[segment_cnt_ - 1]->Right(); - for (int pt_idx = 1; pt_idx < (segment_cnt_ - 1); pt_idx++) { - min_right_x[pt_idx] = - MIN(min_right_x[pt_idx - 1], segments_[pt_idx]->Left()); - max_left_x[segment_cnt_ - pt_idx - 2] = - MAX(max_left_x[segment_cnt_ - pt_idx - 1], - segments_[segment_cnt_ - pt_idx - 1]->Right()); - } - } else { - min_right_x[segment_cnt_ - 2] = segments_[segment_cnt_ - 1]->Left(); - max_left_x[0] = segments_[0]->Right(); - for (int pt_idx = 1; pt_idx < (segment_cnt_ - 1); pt_idx++) { - min_right_x[segment_cnt_ - pt_idx - 2] = - MIN(min_right_x[segment_cnt_ - pt_idx - 1], - segments_[segment_cnt_ - pt_idx - 1]->Left()); - max_left_x[pt_idx] = - MAX(max_left_x[pt_idx - 1], segments_[pt_idx]->Right()); - } - } - - // Allocate memory for space and no space costs - // trivial cases - space_cost_ = new int[segment_cnt_ - 1]; - no_space_cost_ = new int[segment_cnt_ - 1]; - if (!space_cost_ || !no_space_cost_) { - delete []min_right_x; - delete []max_left_x; - return false; - } - - // go through all segmentation points determining the horizontal gap between - // the images on both sides of each break points. Use the gap to estimate - // the probability of a space. 
The probability is modeled a linear function - // of the gap width - for (int pt_idx = 0; pt_idx < (segment_cnt_ - 1); pt_idx++) { - // determine the gap at the segmentation point - int gap = min_right_x[pt_idx] - max_left_x[pt_idx]; - float prob = 0.0; - - // gap is too small => no space - if (gap < min_spc_gap_) { - prob = 0.0; - } else if (gap > max_spc_gap_) { - // gap is too big => definite space - prob = 1.0; - } else { - // gap is somewhere in between, compute probability - prob = (gap - min_spc_gap_) / - static_cast(max_spc_gap_ - min_spc_gap_); - } - - // compute cost of space and non-space - space_cost_[pt_idx] = CubeUtils::Prob2Cost(prob) + - CubeUtils::Prob2Cost(0.1); - no_space_cost_[pt_idx] = CubeUtils::Prob2Cost(1.0 - prob); - } - - delete []min_right_x; - delete []max_left_x; - - return true; -} - -// Returns the cost of having a space before the specified segmentation point -int CubeSearchObject::SpaceCost(int pt_idx) { - if (!space_cost_ && !ComputeSpaceCosts()) { - // Failed to compute costs return a zero prob - return CubeUtils::Prob2Cost(0.0); - } - return space_cost_[pt_idx]; -} - -// Returns the cost of not having a space before the specified -// segmentation point -int CubeSearchObject::NoSpaceCost(int pt_idx) { - // If failed to compute costs, return a 1.0 prob - if (!space_cost_ && !ComputeSpaceCosts()) - return CubeUtils::Prob2Cost(0.0); - return no_space_cost_[pt_idx]; -} - -// Returns the cost of not having any spaces within the specified range -// of segmentation points -int CubeSearchObject::NoSpaceCost(int st_pt, int end_pt) { - // If fail to compute costs, return a 1.0 prob - if (!space_cost_ && !ComputeSpaceCosts()) - return CubeUtils::Prob2Cost(1.0); - int no_spc_cost = 0; - for (int pt_idx = st_pt + 1; pt_idx < end_pt; pt_idx++) - no_spc_cost += NoSpaceCost(pt_idx); - return no_spc_cost; -} -} diff --git a/cube/cube_search_object.h b/cube/cube_search_object.h deleted file mode 100644 index 0a6c3ce2..00000000 --- 
a/cube/cube_search_object.h +++ /dev/null @@ -1,122 +0,0 @@ -/********************************************************************** - * File: cube_search_object.h - * Description: Declaration of the Cube Search Object Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The CubeSearchObject class represents a char_samp (a word bitmap) that is -// being searched for characters (or recognizeable entities). -// The Class detects the connected components and peforms an oversegmentation -// on each ConComp. The result of which is a list of segments that are ordered -// in reading order. 
-// The class provided methods that inquire about the number of segments, the -// CharSamp corresponding to any segment range and the recognition results -// of any segment range -// An object of Class CubeSearchObject is used by the BeamSearch algorithm -// to recognize a CharSamp into a list of word alternates - -#ifndef CUBE_SEARCH_OBJECT_H -#define CUBE_SEARCH_OBJECT_H - -#include "search_object.h" -#include "char_samp.h" -#include "conv_net_classifier.h" -#include "cube_reco_context.h" -#include "allheaders.h" - -namespace tesseract { -class CubeSearchObject : public SearchObject { - public: - CubeSearchObject(CubeRecoContext *cntxt, CharSamp *samp); - ~CubeSearchObject(); - - // returns the Segmentation Point count of the CharSamp owned by the class - int SegPtCnt(); - // Recognize the set of segments given by the specified range and return - // a list of possible alternate answers - CharAltList * RecognizeSegment(int start_pt, int end_pt); - // Returns the CharSamp corresponding to the specified segment range - CharSamp *CharSample(int start_pt, int end_pt); - // Returns a leptonica box corresponding to the specified segment range - Box *CharBox(int start_pt, int end_pt); - // Returns the cost of having a space before the specified segmentation pt - int SpaceCost(int seg_pt); - // Returns the cost of not having a space before the specified - // segmentation pt - int NoSpaceCost(int seg_pt); - // Returns the cost of not having any spaces within the specified range - // of segmentation points - int NoSpaceCost(int seg_pt, int end_pt); - - private: - // Maximum reasonable segment count - static const int kMaxSegmentCnt = 128; - // Use cropped samples - static const bool kUseCroppedChars; - - // reading order flag - bool rtl_; - // cached dimensions of char samp - int left_; - int itop_; - int wid_; - int hgt_; - // minimum and maximum and possible inter-segment gaps for spaces - int min_spc_gap_; - int max_spc_gap_; - // initialization flag - bool init_; - // 
maximum segments per character: Cached from tuning parameters object - int max_seg_per_char_; - // char sample to be processed - CharSamp *samp_; - // segment count - int segment_cnt_; - // segments of the processed char samp - ConComp **segments_; - // Cache data members: - // There are two caches kept; a CharSamp cache and a CharAltList cache - // Each is a 2-D array of CharSamp and CharAltList pointers respectively - // hence the triple pointer. - CharAltList ***reco_cache_; - CharSamp ***samp_cache_; - // Cached costs of space and no-space after every segment. Computed only - // in phrase mode - int *space_cost_; - int *no_space_cost_; - - // init and allocate variables, perform segmentation - bool Init(); - // Cleanup - void Cleanup(); - // Perform segmentation of the bitmap by detecting connected components, - // segmenting each connected component using windowed vertical pixel density - // histogram and sorting the resulting segments in reading order - // Returns true on success - bool Segment(); - // validate the segment ranges. - inline bool IsValidSegmentRange(int start_pt, int end_pt) { - return (end_pt > start_pt && start_pt >= -1 && start_pt < segment_cnt_ && - end_pt >= 0 && end_pt <= segment_cnt_ && - end_pt <= (start_pt + max_seg_per_char_)); - } - // computes the space and no space costs at gaps between segments - // return true on success - bool ComputeSpaceCosts(); -}; -} - -#endif // CUBE_SEARCH_OBJECT_H diff --git a/cube/cube_tuning_params.cpp b/cube/cube_tuning_params.cpp deleted file mode 100644 index ac16c9f5..00000000 --- a/cube/cube_tuning_params.cpp +++ /dev/null @@ -1,218 +0,0 @@ -/********************************************************************** - * File: cube_tuning_params.cpp - * Description: Implementation of the CubeTuningParameters Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. 
- ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include -#include -#include "cube_tuning_params.h" -#include "tuning_params.h" -#include "cube_utils.h" - -namespace tesseract { -CubeTuningParams::CubeTuningParams() { - reco_wgt_ = 1.0; - size_wgt_ = 1.0; - char_bigrams_wgt_ = 1.0; - word_unigrams_wgt_ = 0.0; - max_seg_per_char_ = 8; - beam_width_ = 32; - tp_classifier_ = NN; - tp_feat_ = BMP; - conv_grid_size_ = 32; - hist_wind_wid_ = 0; - max_word_aspect_ratio_ = 10.0; - min_space_height_ratio_ = 0.2; - max_space_height_ratio_ = 0.3; - min_con_comp_size_ = 0; - combiner_run_thresh_ = 1.0; - combiner_classifier_thresh_ = 0.5; - ood_wgt_ = 1.0; - num_wgt_ = 1.0; - -} - -CubeTuningParams::~CubeTuningParams() { -} - -// Create an Object given the data file path and the language by loading -// the approporiate file -CubeTuningParams *CubeTuningParams::Create(const string &data_file_path, - const string &lang) { - CubeTuningParams *obj = new CubeTuningParams(); - if (!obj) { - fprintf(stderr, "Cube ERROR (CubeTuningParams::Create): unable to " - "allocate new tuning params object\n"); - return NULL; - } - - string tuning_params_file; - tuning_params_file = data_file_path + lang; - tuning_params_file += ".cube.params"; - - if (!obj->Load(tuning_params_file)) { - fprintf(stderr, "Cube ERROR (CubeTuningParams::Create): unable to " - "load tuning parameters from %s\n", 
tuning_params_file.c_str()); - delete obj; - obj = NULL; - } - - return obj; -} - -// Loads the params file -bool CubeTuningParams::Load(string tuning_params_file) { - // load the string into memory - string param_str; - - if (CubeUtils::ReadFileToString(tuning_params_file, ¶m_str) == false) { - fprintf(stderr, "Cube ERROR (CubeTuningParams::Load): unable to read " - "file %s\n", tuning_params_file.c_str()); - return false; - } - - // split into lines - vector str_vec; - CubeUtils::SplitStringUsing(param_str, "\r\n", &str_vec); - if (str_vec.size() < 8) { - fprintf(stderr, "Cube ERROR (CubeTuningParams::Load): number of rows " - "in parameter file is too low\n"); - return false; - } - - // for all entries - for (int entry = 0; entry < str_vec.size(); entry++) { - // tokenize - vector str_tok; - - // should be only two tokens - CubeUtils::SplitStringUsing(str_vec[entry], "=", &str_tok); - if (str_tok.size() != 2) { - fprintf(stderr, "Cube ERROR (CubeTuningParams::Load): invalid format in " - "line: %s.\n", str_vec[entry].c_str()); - return false; - } - - double val = 0; - char peekchar = (str_tok[1].c_str())[0]; - if ((peekchar >= '0' && peekchar <= '9') || - peekchar == '-' || peekchar == '+' || - peekchar == '.') { - // read the value - if (sscanf(str_tok[1].c_str(), "%lf", &val) != 1) { - fprintf(stderr, "Cube ERROR (CubeTuningParams::Load): invalid format " - "in line: %s.\n", str_vec[entry].c_str()); - return false; - } - } - - // token type - if (str_tok[0] == "RecoWgt") { - reco_wgt_ = val; - } else if (str_tok[0] == "SizeWgt") { - size_wgt_ = val; - } else if (str_tok[0] == "CharBigramsWgt") { - char_bigrams_wgt_ = val; - } else if (str_tok[0] == "WordUnigramsWgt") { - word_unigrams_wgt_ = val; - } else if (str_tok[0] == "MaxSegPerChar") { - max_seg_per_char_ = static_cast(val); - } else if (str_tok[0] == "BeamWidth") { - beam_width_ = static_cast(val); - } else if (str_tok[0] == "Classifier") { - if (str_tok[1] == "NN") { - tp_classifier_ = 
TuningParams::NN; - } else if (str_tok[1] == "HYBRID_NN") { - tp_classifier_ = TuningParams::HYBRID_NN; - } else { - fprintf(stderr, "Cube ERROR (CubeTuningParams::Load): invalid " - "classifier type in line: %s.\n", str_vec[entry].c_str()); - return false; - } - } else if (str_tok[0] == "FeatureType") { - if (str_tok[1] == "BMP") { - tp_feat_ = TuningParams::BMP; - } else if (str_tok[1] == "CHEBYSHEV") { - tp_feat_ = TuningParams::CHEBYSHEV; - } else if (str_tok[1] == "HYBRID") { - tp_feat_ = TuningParams::HYBRID; - } else { - fprintf(stderr, "Cube ERROR (CubeTuningParams::Load): invalid feature " - "type in line: %s.\n", str_vec[entry].c_str()); - return false; - } - } else if (str_tok[0] == "ConvGridSize") { - conv_grid_size_ = static_cast(val); - } else if (str_tok[0] == "HistWindWid") { - hist_wind_wid_ = val; - } else if (str_tok[0] == "MinConCompSize") { - min_con_comp_size_ = val; - } else if (str_tok[0] == "MaxWordAspectRatio") { - max_word_aspect_ratio_ = val; - } else if (str_tok[0] == "MinSpaceHeightRatio") { - min_space_height_ratio_ = val; - } else if (str_tok[0] == "MaxSpaceHeightRatio") { - max_space_height_ratio_ = val; - } else if (str_tok[0] == "CombinerRunThresh") { - combiner_run_thresh_ = val; - } else if (str_tok[0] == "CombinerClassifierThresh") { - combiner_classifier_thresh_ = val; - } else if (str_tok[0] == "OODWgt") { - ood_wgt_ = val; - } else if (str_tok[0] == "NumWgt") { - num_wgt_ = val; - } else { - fprintf(stderr, "Cube ERROR (CubeTuningParams::Load): unknown parameter " - "in line: %s.\n", str_vec[entry].c_str()); - return false; - } - } - - return true; -} - -// Save the parameters to a file -bool CubeTuningParams::Save(string file_name) { - FILE *params_file = fopen(file_name.c_str(), "wb"); - if (params_file == NULL) { - fprintf(stderr, "Cube ERROR (CubeTuningParams::Save): error opening file " - "%s for write.\n", file_name.c_str()); - return false; - } - - fprintf(params_file, "RecoWgt=%.4f\n", reco_wgt_); - 
fprintf(params_file, "SizeWgt=%.4f\n", size_wgt_); - fprintf(params_file, "CharBigramsWgt=%.4f\n", char_bigrams_wgt_); - fprintf(params_file, "WordUnigramsWgt=%.4f\n", word_unigrams_wgt_); - fprintf(params_file, "MaxSegPerChar=%d\n", max_seg_per_char_); - fprintf(params_file, "BeamWidth=%d\n", beam_width_); - fprintf(params_file, "ConvGridSize=%d\n", conv_grid_size_); - fprintf(params_file, "HistWindWid=%d\n", hist_wind_wid_); - fprintf(params_file, "MinConCompSize=%d\n", min_con_comp_size_); - fprintf(params_file, "MaxWordAspectRatio=%.4f\n", max_word_aspect_ratio_); - fprintf(params_file, "MinSpaceHeightRatio=%.4f\n", min_space_height_ratio_); - fprintf(params_file, "MaxSpaceHeightRatio=%.4f\n", max_space_height_ratio_); - fprintf(params_file, "CombinerRunThresh=%.4f\n", combiner_run_thresh_); - fprintf(params_file, "CombinerClassifierThresh=%.4f\n", - combiner_classifier_thresh_); - fprintf(params_file, "OODWgt=%.4f\n", ood_wgt_); - fprintf(params_file, "NumWgt=%.4f\n", num_wgt_); - - fclose(params_file); - return true; -} -} diff --git a/cube/cube_tuning_params.h b/cube/cube_tuning_params.h deleted file mode 100644 index 8b125872..00000000 --- a/cube/cube_tuning_params.h +++ /dev/null @@ -1,57 +0,0 @@ -/********************************************************************** - * File: cube_tuning_params.h - * Description: Declaration of the CubeTuningParameters Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The CubeTuningParams class abstracts all the parameters that are used -// in Cube and are tuned/learned during the training process. Inherits -// from the TuningParams class. - -#ifndef CUBE_TUNING_PARAMS_H -#define CUBE_TUNING_PARAMS_H - -#include -#include "tuning_params.h" - -namespace tesseract { -class CubeTuningParams : public TuningParams { - public: - CubeTuningParams(); - ~CubeTuningParams(); - - // Accessor functions - inline double OODWgt() { return ood_wgt_; } - inline double NumWgt() { return num_wgt_; } - - inline void SetOODWgt(double wgt) { ood_wgt_ = wgt; } - inline void SetNumWgt(double wgt) { num_wgt_ = wgt; } - - // Create an object given the data file path and the language by loading - // the approporiate file - static CubeTuningParams * Create(const string &data_file, - const string &lang); - // Save and load the tuning parameters to a specified file - bool Save(string file_name); - bool Load(string file_name); - - private: - double ood_wgt_; - double num_wgt_; -}; -} - -#endif // CUBE_TUNING_PARAMS_H diff --git a/cube/cube_utils.cpp b/cube/cube_utils.cpp deleted file mode 100644 index 13c9c236..00000000 --- a/cube/cube_utils.cpp +++ /dev/null @@ -1,413 +0,0 @@ -/********************************************************************** - * File: cube_utils.cpp - * Description: Implementation of the Cube Utilities Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include -#include -#include -#include "cube_utils.h" -#include "char_set.h" -#include "unichar.h" - -namespace tesseract { -CubeUtils::CubeUtils() { -} - -CubeUtils::~CubeUtils() { -} - -/** - * convert a prob to a cost (-ve log prob) - */ -int CubeUtils::Prob2Cost(double prob_val) { - if (prob_val < MIN_PROB) { - return MIN_PROB_COST; - } - return static_cast(-log(prob_val) * PROB2COST_SCALE); -} - -/** - * converts a cost to probability - */ -double CubeUtils::Cost2Prob(int cost) { - return exp(-cost / PROB2COST_SCALE); -} - -/** - * computes the length of a NULL terminated char_32 string - */ -int CubeUtils::StrLen(const char_32 *char_32_ptr) { - if (char_32_ptr == NULL) { - return 0; - } - int len = -1; - while (char_32_ptr[++len]); - return len; -} - -/** - * compares two char_32 strings - */ -int CubeUtils::StrCmp(const char_32 *str1, const char_32 *str2) { - const char_32 *pch1 = str1; - const char_32 *pch2 = str2; - - for (; (*pch1) != 0 && (*pch2) != 0; pch1++, pch2++) { - if ((*pch1) != (*pch2)) { - return (*pch1) - (*pch2); - } - } - - if ((*pch1) == 0) { - if ((*pch2) == 0) { - return 0; - } else { - return -1; - } - } else { - return 1; - } -} - -/** - * Duplicates a 32-bit char buffer - */ -char_32 *CubeUtils::StrDup(const char_32 *str32) { - int len = StrLen(str32); - char_32 *new_str = new char_32[len + 1]; - if (new_str == NULL) { - return NULL; - } - memcpy(new_str, str32, len * sizeof(*str32)); - new_str[len] = 0; - return new_str; -} - -/** - * 
creates a char samp from a specified portion of the image - */ -CharSamp *CubeUtils::CharSampleFromPix(Pix *pix, int left, int top, - int wid, int hgt) { - // get the raw img data from the image - unsigned char *temp_buff = GetImageData(pix, left, top, wid, hgt); - if (temp_buff == NULL) { - return NULL; - } - - // create a char samp from temp buffer - CharSamp *char_samp = CharSamp::FromRawData(left, top, wid, hgt, temp_buff); - - // clean up temp buffer - delete []temp_buff; - return char_samp; -} - -/** - * create a B/W image from a char_sample - */ -Pix *CubeUtils::PixFromCharSample(CharSamp *char_samp) { - // parameter check - if (char_samp == NULL) { - return NULL; - } - - // get the raw data - int stride = char_samp->Stride(); - int wid = char_samp->Width(); - int hgt = char_samp->Height(); - - Pix *pix = pixCreate(wid, hgt, 1); - if (pix == NULL) { - return NULL; - } - - // copy the contents - unsigned char *line = char_samp->RawData(); - for (int y = 0; y < hgt ; y++, line += stride) { - for (int x = 0; x < wid; x++) { - if (line[x] != 0) { - pixSetPixel(pix, x, y, 0); - } else { - pixSetPixel(pix, x, y, 255); - } - } - } - - return pix; -} - -/** - * creates a raw buffer from the specified location of the pix - */ -unsigned char *CubeUtils::GetImageData(Pix *pix, int left, int top, - int wid, int hgt) { - // skip invalid dimensions - if (left < 0 || top < 0 || wid < 0 || hgt < 0 || - (left + wid) > pix->w || (top + hgt) > pix->h || - pix->d != 1) { - return NULL; - } - - // copy the char img to a temp buffer - unsigned char *temp_buff = new unsigned char[wid * hgt]; - if (temp_buff == NULL) { - return NULL; - } - l_int32 w; - l_int32 h; - l_int32 d; - l_int32 wpl; - l_uint32 *line; - l_uint32 *data; - - pixGetDimensions(pix, &w, &h, &d); - wpl = pixGetWpl(pix); - data = pixGetData(pix); - line = data + (top * wpl); - - for (int y = 0, off = 0; y < hgt ; y++) { - for (int x = 0; x < wid; x++, off++) { - temp_buff[off] = GET_DATA_BIT(line, x + left) ? 
0 : 255; - } - line += wpl; - } - return temp_buff; -} - -/** - * read file contents to a string - */ -bool CubeUtils::ReadFileToString(const string &file_name, string *str) { - str->clear(); - FILE *fp = fopen(file_name.c_str(), "rb"); - if (fp == NULL) { - return false; - } - - // get the size of the size - fseek(fp, 0, SEEK_END); - int file_size = ftell(fp); - if (file_size < 1) { - fclose(fp); - return false; - } - // adjust string size - str->reserve(file_size); - // read the contents - rewind(fp); - char *buff = new char[file_size]; - if (buff == NULL) { - fclose(fp); - return false; - } - int read_bytes = fread(buff, 1, static_cast(file_size), fp); - if (read_bytes == file_size) { - str->append(buff, file_size); - } - delete []buff; - fclose(fp); - return (read_bytes == file_size); -} - -/** - * splits a string into vectors based on specified delimiters - */ -void CubeUtils::SplitStringUsing(const string &str, - const string &delims, - vector *str_vec) { - // Optimize the common case where delims is a single character. 
- if (delims[0] != '\0' && delims[1] == '\0') { - char c = delims[0]; - const char* p = str.data(); - const char* end = p + str.size(); - while (p != end) { - if (*p == c) { - ++p; - } else { - const char* start = p; - while (++p != end && *p != c); - str_vec->push_back(string(start, p - start)); - } - } - return; - } - - string::size_type begin_index, end_index; - begin_index = str.find_first_not_of(delims); - while (begin_index != string::npos) { - end_index = str.find_first_of(delims, begin_index); - if (end_index == string::npos) { - str_vec->push_back(str.substr(begin_index)); - return; - } - str_vec->push_back(str.substr(begin_index, (end_index - begin_index))); - begin_index = str.find_first_not_of(delims, end_index); - } -} - -/** - * UTF-8 to UTF-32 conversion functions - */ -void CubeUtils::UTF8ToUTF32(const char *utf8_str, string_32 *str32) { - str32->clear(); - int len = strlen(utf8_str); - int step = 0; - for (int ch = 0; ch < len; ch += step) { - step = UNICHAR::utf8_step(utf8_str + ch); - if (step > 0) { - UNICHAR uni_ch(utf8_str + ch, step); - (*str32) += uni_ch.first_uni(); - } - } -} - -/** - * UTF-32 to UTF-8 conversion functions - */ -void CubeUtils::UTF32ToUTF8(const char_32 *utf32_str, string *str) { - str->clear(); - for (const char_32 *ch_32 = utf32_str; (*ch_32) != 0; ch_32++) { - UNICHAR uni_ch((*ch_32)); - char *utf8 = uni_ch.utf8_str(); - if (utf8 != NULL) { - (*str) += utf8; - delete []utf8; - } - } -} - -bool CubeUtils::IsCaseInvariant(const char_32 *str32, CharSet *char_set) { - bool all_one_case = true; - bool capitalized; - bool prev_upper; - bool prev_lower; - bool first_upper; - bool first_lower; - bool cur_upper; - bool cur_lower; - - string str8; - if (!char_set) { - // If cube char_set is missing, use C-locale-dependent functions - // on UTF8 characters to determine case properties. 
- first_upper = isupper(str32[0]); - first_lower = islower(str32[0]); - if (first_upper) - capitalized = true; - prev_upper = first_upper; - prev_lower = first_lower; - for (int c = 1; str32[c] != 0; ++c) { - cur_upper = isupper(str32[c]); - cur_lower = islower(str32[c]); - if ((prev_upper && cur_lower) || (prev_lower && cur_upper)) - all_one_case = false; - if (cur_upper) - capitalized = false; - prev_upper = cur_upper; - prev_lower = cur_lower; - } - } else { - UNICHARSET *unicharset = char_set->InternalUnicharset(); - // Use UNICHARSET functions to determine case properties - first_upper = unicharset->get_isupper(char_set->ClassID(str32[0])); - first_lower = unicharset->get_islower(char_set->ClassID(str32[0])); - if (first_upper) - capitalized = true; - prev_upper = first_upper; - prev_lower = first_lower; - - for (int c = 1; c < StrLen(str32); ++c) { - cur_upper = unicharset->get_isupper(char_set->ClassID(str32[c])); - cur_lower = unicharset->get_islower(char_set->ClassID(str32[c])); - if ((prev_upper && cur_lower) || (prev_lower && cur_upper)) - all_one_case = false; - if (cur_upper) - capitalized = false; - prev_upper = cur_upper; - prev_lower = cur_lower; - } - } - return all_one_case || capitalized; -} - -char_32 *CubeUtils::ToLower(const char_32 *str32, CharSet *char_set) { - if (!char_set) { - return NULL; - } - UNICHARSET *unicharset = char_set->InternalUnicharset(); - int len = StrLen(str32); - char_32 *lower = new char_32[len + 1]; - if (!lower) - return NULL; - for (int i = 0; i < len; ++i) { - char_32 ch = str32[i]; - if (ch == INVALID_UNICHAR_ID) { - delete [] lower; - return NULL; - } - // convert upper-case characters to lower-case - if (unicharset->get_isupper(char_set->ClassID(ch))) { - UNICHAR_ID uid_lower = unicharset->get_other_case(char_set->ClassID(ch)); - const char_32 *str32_lower = char_set->ClassString(uid_lower); - // expect lower-case version of character to be a single character - if (!str32_lower || StrLen(str32_lower) != 1) { - 
delete [] lower; - return NULL; - } - lower[i] = str32_lower[0]; - } else { - lower[i] = ch; - } - } - lower[len] = 0; - return lower; -} - -char_32 *CubeUtils::ToUpper(const char_32 *str32, CharSet *char_set) { - if (!char_set) { - return NULL; - } - UNICHARSET *unicharset = char_set->InternalUnicharset(); - int len = StrLen(str32); - char_32 *upper = new char_32[len + 1]; - if (!upper) - return NULL; - for (int i = 0; i < len; ++i) { - char_32 ch = str32[i]; - if (ch == INVALID_UNICHAR_ID) { - delete [] upper; - return NULL; - } - // convert lower-case characters to upper-case - if (unicharset->get_islower(char_set->ClassID(ch))) { - UNICHAR_ID uid_upper = unicharset->get_other_case(char_set->ClassID(ch)); - const char_32 *str32_upper = char_set->ClassString(uid_upper); - // expect upper-case version of character to be a single character - if (!str32_upper || StrLen(str32_upper) != 1) { - delete [] upper; - return NULL; - } - upper[i] = str32_upper[0]; - } else { - upper[i] = ch; - } - } - upper[len] = 0; - return upper; -} -} // namespace tesseract diff --git a/cube/cube_utils.h b/cube/cube_utils.h deleted file mode 100644 index 765012cd..00000000 --- a/cube/cube_utils.h +++ /dev/null @@ -1,83 +0,0 @@ -/********************************************************************** - * File: cube_utils.h - * Description: Declaration of the Cube Utilities Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - *(C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0(the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
- ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The CubeUtils class provides miscellaneous utility and helper functions -// to the rest of the Cube Engine - -#ifndef CUBE_UTILS_H -#define CUBE_UTILS_H - -#include -#include - -#include "allheaders.h" -#include "const.h" -#include "char_set.h" -#include "char_samp.h" - -namespace tesseract { -class CubeUtils { - public: - CubeUtils(); - ~CubeUtils(); - - // Converts a probability value to a cost by getting the -log() of the - // probability value to a known base - static int Prob2Cost(double prob_val); - // Converts a cost to probability by getting the exp(-normalized cost) - static double Cost2Prob(int cost); - // Computes the length of a 32-bit char buffer - static int StrLen(const char_32 *str); - // Compares two 32-bit char buffers - static int StrCmp(const char_32 *str1, const char_32 *str2); - // Duplicates a 32-bit char buffer - static char_32 *StrDup(const char_32 *str); - // Creates a CharSamp from an Pix and a bounding box - static CharSamp *CharSampleFromPix(Pix *pix, - int left, int top, int wid, int hgt); - // Creates a Pix from a CharSamp - static Pix *PixFromCharSample(CharSamp *char_samp); - // read the contents of a file to a string - static bool ReadFileToString(const string &file_name, string *str); - // split a string into vectors using any of the specified delimiters - static void SplitStringUsing(const string &str, const string &delims, - vector *str_vec); - // UTF-8 to UTF-32 convesion functions - static void UTF8ToUTF32(const char *utf8_str, string_32 *str32); - static void UTF32ToUTF8(const char_32 *utf32_str, string *str); - // Returns true if input word has either 1) all-one-case, or 2) - // first character upper-case, and remaining characters lower-case. 
- // If char_set is not NULL, uses tesseract's unicharset functions - // to determine case properties. Otherwise, uses C-locale-dependent - // functions, which may be unreliable on non-ASCII characters. - static bool IsCaseInvariant(const char_32 *str32, CharSet *char_set); - // Returns char_32 pointer to the lower-case-transformed version of - // the input string or NULL on error. If char_set is NULL returns NULL. - // Return array must be freed by caller. - static char_32 *ToLower(const char_32 *str32, CharSet *char_set); - // Returns char_32 pointer to the upper-case-transformed version of - // the input string or NULL on error. If char_set is NULL returns NULL. - // Return array must be freed by caller. - static char_32 *ToUpper(const char_32 *str32, CharSet *char_set); - private: - static unsigned char *GetImageData(Pix *pix, - int left, int top, int wid, int hgt); -}; -} // namespace tesseract -#endif // CUBE_UTILS_H diff --git a/cube/feature_base.h b/cube/feature_base.h deleted file mode 100644 index 032bc73c..00000000 --- a/cube/feature_base.h +++ /dev/null @@ -1,55 +0,0 @@ -/********************************************************************** - * File: feature_base.h - * Description: Declaration of the Feature Base Class - * Author: Ping Ping (xiupingping), Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -// The FeatureBase class is the base class for any Feature Extraction class -// It provided 3 pure virtual functions (to inherit): -// 1- FeatureCnt: A method to returns the count of features -// 2- ComputeFeatures: A method to compute the features for a given CharSamp -// 3- ComputeFeatureBitmap: A method to render a visualization of the features -// to a CharSamp. This is mainly used by visual-debuggers - -#ifndef FEATURE_BASE_H -#define FEATURE_BASE_H - -#include "char_samp.h" -#include "tuning_params.h" - -namespace tesseract { -class FeatureBase { - public: - explicit FeatureBase(TuningParams *params) - : params_(params) { - } - virtual ~FeatureBase() {} - - // Compute the features for a given CharSamp - virtual bool ComputeFeatures(CharSamp *char_samp, float *features) = 0; - // Render a visualization of the features to a CharSamp. - // This is mainly used by visual-debuggers - virtual CharSamp *ComputeFeatureBitmap(CharSamp *char_samp) = 0; - // Returns the count of features - virtual int FeatureCnt() = 0; - - protected: - TuningParams *params_; -}; -} - -#endif // FEATURE_BASE_H - diff --git a/cube/feature_bmp.cpp b/cube/feature_bmp.cpp deleted file mode 100644 index 06e18798..00000000 --- a/cube/feature_bmp.cpp +++ /dev/null @@ -1,50 +0,0 @@ -/********************************************************************** - * File: feature_bmp.cpp - * Description: Implementation of the Bitmap Feature Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include -#include -#include -#include "feature_base.h" -#include "feature_bmp.h" -#include "cube_utils.h" -#include "const.h" -#include "char_samp.h" - -namespace tesseract { - -FeatureBmp::FeatureBmp(TuningParams *params) - :FeatureBase(params) { - conv_grid_size_ = params->ConvGridSize(); -} - -FeatureBmp::~FeatureBmp() { -} - -// Render a visualization of the features to a CharSamp. -// This is mainly used by visual-debuggers -CharSamp *FeatureBmp::ComputeFeatureBitmap(CharSamp *char_samp) { - return char_samp->Scale(conv_grid_size_, conv_grid_size_); -} - -// Compute the features for a given CharSamp -bool FeatureBmp::ComputeFeatures(CharSamp *char_samp, float *features) { - return char_samp->ComputeFeatures(conv_grid_size_, features); -} -} - diff --git a/cube/feature_bmp.h b/cube/feature_bmp.h deleted file mode 100644 index 2a84941b..00000000 --- a/cube/feature_bmp.h +++ /dev/null @@ -1,53 +0,0 @@ -/********************************************************************** - * File: feature_bmp.h - * Description: Declaration of the Bitmap Feature Class - * Author: PingPing xiu (xiupingping) & Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The FeatureBmp class implements a Bitmap feature extractor class. It -// inherits from the FeatureBase class -// The Bitmap feature vectors is the the bitmap of the specified CharSamp -// scaled to a fixed grid size and then augmented by a 5 aux features that -// describe the size, aspect ration and placement within a word - -#ifndef FEATURE_BMP_H -#define FEATURE_BMP_H - -#include "char_samp.h" -#include "feature_base.h" - -namespace tesseract { -class FeatureBmp : public FeatureBase { - public: - explicit FeatureBmp(TuningParams *params); - virtual ~FeatureBmp(); - // Render a visualization of the features to a CharSamp. 
- // This is mainly used by visual-debuggers - virtual CharSamp *ComputeFeatureBitmap(CharSamp *samp); - // Compute the features for a given CharSamp - virtual bool ComputeFeatures(CharSamp *samp, float *features); - // Returns the count of features - virtual int FeatureCnt() { - return 5 + (conv_grid_size_ * conv_grid_size_); - } - - protected: - // grid size, cached from the TuningParams object - int conv_grid_size_; -}; -} - -#endif // FEATURE_BMP_H diff --git a/cube/feature_chebyshev.cpp b/cube/feature_chebyshev.cpp deleted file mode 100644 index c00bbdf8..00000000 --- a/cube/feature_chebyshev.cpp +++ /dev/null @@ -1,138 +0,0 @@ -/********************************************************************** - * File: feature_chebyshev.cpp - * Description: Implementation of the Chebyshev coefficients Feature Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include -#include -#include -#include -#include -#include -#include "feature_base.h" -#include "feature_chebyshev.h" -#include "cube_utils.h" -#include "const.h" -#include "char_samp.h" - -namespace tesseract { - -FeatureChebyshev::FeatureChebyshev(TuningParams *params) - : FeatureBase(params) { -} - -FeatureChebyshev::~FeatureChebyshev() { -} - -// Render a visualization of the features to a CharSamp. 
-// This is mainly used by visual-debuggers -CharSamp *FeatureChebyshev::ComputeFeatureBitmap(CharSamp *char_samp) { - return char_samp; -} - -// Compute Chebyshev coefficients for the specified vector -void FeatureChebyshev::ChebyshevCoefficients(const vector &input, - int coeff_cnt, float *coeff) { - // re-sample function - int input_range = (input.size() - 1); - vector resamp(coeff_cnt); - for (int samp_idx = 0; samp_idx < coeff_cnt; samp_idx++) { - // compute sampling position - float samp_pos = input_range * - (1 + cos(M_PI * (samp_idx + 0.5) / coeff_cnt)) / 2; - // interpolate - int samp_start = static_cast(samp_pos); - int samp_end = static_cast(samp_pos + 0.5); - float func_delta = input[samp_end] - input[samp_start]; - resamp[samp_idx] = input[samp_start] + - ((samp_pos - samp_start) * func_delta); - } - // compute the coefficients - float normalizer = 2.0 / coeff_cnt; - for (int coeff_idx = 0; coeff_idx < coeff_cnt; coeff_idx++, coeff++) { - double sum = 0.0; - for (int samp_idx = 0; samp_idx < coeff_cnt; samp_idx++) { - sum += resamp[samp_idx] * cos(M_PI * coeff_idx * (samp_idx + 0.5) / - coeff_cnt); - } - (*coeff) = (normalizer * sum); - } -} - -// Compute the features of a given CharSamp -bool FeatureChebyshev::ComputeFeatures(CharSamp *char_samp, float *features) { - return ComputeChebyshevCoefficients(char_samp, features); -} - -// Compute the Chebyshev coefficients of a given CharSamp -bool FeatureChebyshev::ComputeChebyshevCoefficients(CharSamp *char_samp, - float *features) { - if (char_samp->NormBottom() <= 0) { - return false; - } - unsigned char *raw_data = char_samp->RawData(); - int stride = char_samp->Stride(); - // compute the height of the word - int word_hgt = (255 * (char_samp->Top() + char_samp->Height()) / - char_samp->NormBottom()); - // compute left & right profiles - vector left_profile(word_hgt, 0.0); - vector right_profile(word_hgt, 0.0); - unsigned char *line_data = raw_data; - for (int y = 0; y < char_samp->Height(); y++, 
line_data += stride) { - int min_x = char_samp->Width(); - int max_x = -1; - for (int x = 0; x < char_samp->Width(); x++) { - if (line_data[x] == 0) { - UpdateRange(x, &min_x, &max_x); - } - } - left_profile[char_samp->Top() + y] = - 1.0 * (min_x == char_samp->Width() ? 0 : (min_x + 1)) / - char_samp->Width(); - right_profile[char_samp->Top() + y] = - 1.0 * (max_x == -1 ? 0 : char_samp->Width() - max_x) / - char_samp->Width(); - } - - // compute top and bottom profiles - vector top_profile(char_samp->Width(), 0); - vector bottom_profile(char_samp->Width(), 0); - for (int x = 0; x < char_samp->Width(); x++) { - int min_y = word_hgt; - int max_y = -1; - line_data = raw_data; - for (int y = 0; y < char_samp->Height(); y++, line_data += stride) { - if (line_data[x] == 0) { - UpdateRange(y + char_samp->Top(), &min_y, &max_y); - } - } - top_profile[x] = 1.0 * (min_y == word_hgt ? 0 : (min_y + 1)) / word_hgt; - bottom_profile[x] = 1.0 * (max_y == -1 ? 0 : (word_hgt - max_y)) / word_hgt; - } - - // compute the chebyshev coefficients of each profile - ChebyshevCoefficients(left_profile, kChebychevCoefficientCnt, features); - ChebyshevCoefficients(top_profile, kChebychevCoefficientCnt, - features + kChebychevCoefficientCnt); - ChebyshevCoefficients(right_profile, kChebychevCoefficientCnt, - features + (2 * kChebychevCoefficientCnt)); - ChebyshevCoefficients(bottom_profile, kChebychevCoefficientCnt, - features + (3 * kChebychevCoefficientCnt)); - return true; -} -} // namespace tesseract diff --git a/cube/feature_chebyshev.h b/cube/feature_chebyshev.h deleted file mode 100644 index 13c2d96d..00000000 --- a/cube/feature_chebyshev.h +++ /dev/null @@ -1,57 +0,0 @@ -/********************************************************************** - * File: feature_chebyshev.h - * Description: Declaration of the Chebyshev coefficients Feature Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. 
- ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The FeatureChebyshev class implements a Bitmap feature extractor class. It -// inherits from the FeatureBase class -// The feature vector is the composed of the chebyshev coefficients of 4 time -// sequences. The time sequences are the left, top, right & bottom -// bitmap profiles of the input samples - -#ifndef FEATURE_CHEBYSHEV_H -#define FEATURE_CHEBYSHEV_H - -#include "char_samp.h" -#include "feature_base.h" - -namespace tesseract { -class FeatureChebyshev : public FeatureBase { - public: - explicit FeatureChebyshev(TuningParams *params); - virtual ~FeatureChebyshev(); - // Render a visualization of the features to a CharSamp. 
- // This is mainly used by visual-debuggers - virtual CharSamp *ComputeFeatureBitmap(CharSamp *samp); - // Compute the features for a given CharSamp - virtual bool ComputeFeatures(CharSamp *samp, float *features); - // Returns the count of features - virtual int FeatureCnt() { - return (4 * kChebychevCoefficientCnt); - } - - protected: - static const int kChebychevCoefficientCnt = 40; - // Compute Chebychev coefficients for the specified vector - void ChebyshevCoefficients(const vector &input, - int coeff_cnt, float *coeff); - // Compute the features for a given CharSamp - bool ComputeChebyshevCoefficients(CharSamp *samp, float *features); -}; -} - -#endif // FEATURE_CHEBYSHEV_H diff --git a/cube/feature_hybrid.cpp b/cube/feature_hybrid.cpp deleted file mode 100644 index 35aeda0a..00000000 --- a/cube/feature_hybrid.cpp +++ /dev/null @@ -1,64 +0,0 @@ -/********************************************************************** - * File: feature_chebyshev.cpp - * Description: Implementation of the Chebyshev coefficients Feature Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -#include -#include -#include -#include -#include -#include -#include "feature_base.h" -#include "feature_hybrid.h" -#include "cube_utils.h" -#include "const.h" -#include "char_samp.h" - -namespace tesseract { - -FeatureHybrid::FeatureHybrid(TuningParams *params) - :FeatureBase(params) { - feature_bmp_ = new FeatureBmp(params); - feature_chebyshev_ = new FeatureChebyshev(params); -} - -FeatureHybrid::~FeatureHybrid() { - delete feature_bmp_; - delete feature_chebyshev_; -} - -// Render a visualization of the features to a CharSamp. -// This is mainly used by visual-debuggers -CharSamp *FeatureHybrid::ComputeFeatureBitmap(CharSamp *char_samp) { - return char_samp; -} - - -// Compute the features of a given CharSamp -bool FeatureHybrid::ComputeFeatures(CharSamp *char_samp, float *features) { - if (feature_bmp_ == NULL || feature_chebyshev_ == NULL) { - return false; - } - if (!feature_bmp_->ComputeFeatures(char_samp, features)) { - return false; - } - return feature_chebyshev_->ComputeFeatures(char_samp, - features + feature_bmp_->FeatureCnt()); -} - -} // namespace tesseract diff --git a/cube/feature_hybrid.h b/cube/feature_hybrid.h deleted file mode 100644 index dc94a526..00000000 --- a/cube/feature_hybrid.h +++ /dev/null @@ -1,56 +0,0 @@ -/********************************************************************** - * File: feature_chebyshev.h - * Description: Declaration of the Chebyshev coefficients Feature Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The FeatureHybrid class implements a Bitmap feature extractor class. It -// inherits from the FeatureBase class -// This class describes the a hybrid feature vector composed by combining -// the bitmap and the chebyshev feature vectors - -#ifndef FEATURE_HYBRID_H -#define FEATURE_HYBRID_H - -#include "char_samp.h" -#include "feature_bmp.h" -#include "feature_chebyshev.h" - -namespace tesseract { -class FeatureHybrid : public FeatureBase { - public: - explicit FeatureHybrid(TuningParams *params); - virtual ~FeatureHybrid(); - // Render a visualization of the features to a CharSamp. 
- // This is mainly used by visual-debuggers - virtual CharSamp *ComputeFeatureBitmap(CharSamp *samp); - // Compute the features for a given CharSamp - virtual bool ComputeFeatures(CharSamp *samp, float *features); - // Returns the count of features - virtual int FeatureCnt() { - if (feature_bmp_ == NULL || feature_chebyshev_ == NULL) { - return 0; - } - return feature_bmp_->FeatureCnt() + feature_chebyshev_->FeatureCnt(); - } - - protected: - FeatureBmp *feature_bmp_; - FeatureChebyshev *feature_chebyshev_; -}; -} - -#endif // FEATURE_HYBRID_H diff --git a/cube/hybrid_neural_net_classifier.cpp b/cube/hybrid_neural_net_classifier.cpp deleted file mode 100644 index 671a74ac..00000000 --- a/cube/hybrid_neural_net_classifier.cpp +++ /dev/null @@ -1,369 +0,0 @@ -/********************************************************************** - * File: charclassifier.cpp - * Description: Implementation of Convolutional-NeuralNet Character Classifier - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -#include -#include -#include -#include -#include -#include - -#include "classifier_base.h" -#include "char_set.h" -#include "const.h" -#include "conv_net_classifier.h" -#include "cube_utils.h" -#include "feature_base.h" -#include "feature_bmp.h" -#include "hybrid_neural_net_classifier.h" -#include "tess_lang_model.h" - -namespace tesseract { - -HybridNeuralNetCharClassifier::HybridNeuralNetCharClassifier( - CharSet *char_set, - TuningParams *params, - FeatureBase *feat_extract) - : CharClassifier(char_set, params, feat_extract) { - net_input_ = NULL; - net_output_ = NULL; -} - -HybridNeuralNetCharClassifier::~HybridNeuralNetCharClassifier() { - for (int net_idx = 0; net_idx < nets_.size(); net_idx++) { - if (nets_[net_idx] != NULL) { - delete nets_[net_idx]; - } - } - nets_.clear(); - - if (net_input_ != NULL) { - delete []net_input_; - net_input_ = NULL; - } - - if (net_output_ != NULL) { - delete []net_output_; - net_output_ = NULL; - } -} - -// The main training function. Given a sample and a class ID the classifier -// updates its parameters according to its learning algorithm. This function -// is currently not implemented. TODO(ahmadab): implement end-2-end training -bool HybridNeuralNetCharClassifier::Train(CharSamp *char_samp, int ClassID) { - return false; -} - -// A secondary function needed for training. Allows the trainer to set the -// value of any train-time parameter. This function is currently not -// implemented. TODO(ahmadab): implement end-2-end training -bool HybridNeuralNetCharClassifier::SetLearnParam(char *var_name, float val) { - // TODO(ahmadab): implementation of parameter initializing. 
- return false; -} - -// Folds the output of the NeuralNet using the loaded folding sets -void HybridNeuralNetCharClassifier::Fold() { - // in case insensitive mode - if (case_sensitive_ == false) { - int class_cnt = char_set_->ClassCount(); - // fold case - for (int class_id = 0; class_id < class_cnt; class_id++) { - // get class string - const char_32 *str32 = char_set_->ClassString(class_id); - // get the upper case form of the string - string_32 upper_form32 = str32; - for (int ch = 0; ch < upper_form32.length(); ch++) { - if (iswalpha(static_cast(upper_form32[ch])) != 0) { - upper_form32[ch] = towupper(upper_form32[ch]); - } - } - - // find out the upperform class-id if any - int upper_class_id = - char_set_->ClassID(reinterpret_cast( - upper_form32.c_str())); - if (upper_class_id != -1 && class_id != upper_class_id) { - float max_out = MAX(net_output_[class_id], net_output_[upper_class_id]); - net_output_[class_id] = max_out; - net_output_[upper_class_id] = max_out; - } - } - } - - // The folding sets specify how groups of classes should be folded - // Folding involved assigning a min-activation to all the members - // of the folding set. 
The min-activation is a fraction of the max-activation - // of the members of the folding set - for (int fold_set = 0; fold_set < fold_set_cnt_; fold_set++) { - float max_prob = net_output_[fold_sets_[fold_set][0]]; - - for (int ch = 1; ch < fold_set_len_[fold_set]; ch++) { - if (net_output_[fold_sets_[fold_set][ch]] > max_prob) { - max_prob = net_output_[fold_sets_[fold_set][ch]]; - } - } - for (int ch = 0; ch < fold_set_len_[fold_set]; ch++) { - net_output_[fold_sets_[fold_set][ch]] = MAX(max_prob * kFoldingRatio, - net_output_[fold_sets_[fold_set][ch]]); - } - } -} - -// compute the features of specified charsamp and -// feedforward the specified nets -bool HybridNeuralNetCharClassifier::RunNets(CharSamp *char_samp) { - int feat_cnt = feat_extract_->FeatureCnt(); - int class_cnt = char_set_->ClassCount(); - - // allocate i/p and o/p buffers if needed - if (net_input_ == NULL) { - net_input_ = new float[feat_cnt]; - if (net_input_ == NULL) { - return false; - } - - net_output_ = new float[class_cnt]; - if (net_output_ == NULL) { - return false; - } - } - - // compute input features - if (feat_extract_->ComputeFeatures(char_samp, net_input_) == false) { - return false; - } - - // go through all the nets - memset(net_output_, 0, class_cnt * sizeof(*net_output_)); - float *inputs = net_input_; - for (int net_idx = 0; net_idx < nets_.size(); net_idx++) { - // run each net - vector net_out(class_cnt, 0.0); - if (!nets_[net_idx]->FeedForward(inputs, &net_out[0])) { - return false; - } - // add the output values - for (int class_idx = 0; class_idx < class_cnt; class_idx++) { - net_output_[class_idx] += (net_out[class_idx] * net_wgts_[net_idx]); - } - // increment inputs pointer - inputs += nets_[net_idx]->in_cnt(); - } - - Fold(); - - return true; -} - -// return the cost of being a char -int HybridNeuralNetCharClassifier::CharCost(CharSamp *char_samp) { - // it is by design that a character cost is equal to zero - // when no nets are present. 
This is the case during training. - if (RunNets(char_samp) == false) { - return 0; - } - - return CubeUtils::Prob2Cost(1.0f - net_output_[0]); -} - -// classifies a charsamp and returns an alternate list -// of chars sorted by char costs -CharAltList *HybridNeuralNetCharClassifier::Classify(CharSamp *char_samp) { - // run the needed nets - if (RunNets(char_samp) == false) { - return NULL; - } - - int class_cnt = char_set_->ClassCount(); - - // create an altlist - CharAltList *alt_list = new CharAltList(char_set_, class_cnt); - if (alt_list == NULL) { - return NULL; - } - - for (int out = 1; out < class_cnt; out++) { - int cost = CubeUtils::Prob2Cost(net_output_[out]); - alt_list->Insert(out, cost); - } - - return alt_list; -} - -// set an external net (for training purposes) -void HybridNeuralNetCharClassifier::SetNet(tesseract::NeuralNet *char_net) { -} - -// Load folding sets -// This function returns true on success or if the file can't be read, -// returns false if an error is encountered. 
-bool HybridNeuralNetCharClassifier::LoadFoldingSets( - const string &data_file_path, const string &lang, LangModel *lang_mod) { - fold_set_cnt_ = 0; - string fold_file_name; - fold_file_name = data_file_path + lang; - fold_file_name += ".cube.fold"; - - // folding sets are optional - FILE *fp = fopen(fold_file_name.c_str(), "rb"); - if (fp == NULL) { - return true; - } - fclose(fp); - - string fold_sets_str; - if (!CubeUtils::ReadFileToString(fold_file_name, - &fold_sets_str)) { - return false; - } - - // split into lines - vector str_vec; - CubeUtils::SplitStringUsing(fold_sets_str, "\r\n", &str_vec); - fold_set_cnt_ = str_vec.size(); - fold_sets_ = new int *[fold_set_cnt_]; - if (fold_sets_ == NULL) { - return false; - } - fold_set_len_ = new int[fold_set_cnt_]; - if (fold_set_len_ == NULL) { - fold_set_cnt_ = 0; - return false; - } - - for (int fold_set = 0; fold_set < fold_set_cnt_; fold_set++) { - reinterpret_cast(lang_mod)->RemoveInvalidCharacters( - &str_vec[fold_set]); - - // if all or all but one character are invalid, invalidate this set - if (str_vec[fold_set].length() <= 1) { - fprintf(stderr, "Cube WARNING (ConvNetCharClassifier::LoadFoldingSets): " - "invalidating folding set %d\n", fold_set); - fold_set_len_[fold_set] = 0; - fold_sets_[fold_set] = NULL; - continue; - } - - string_32 str32; - CubeUtils::UTF8ToUTF32(str_vec[fold_set].c_str(), &str32); - fold_set_len_[fold_set] = str32.length(); - fold_sets_[fold_set] = new int[fold_set_len_[fold_set]]; - if (fold_sets_[fold_set] == NULL) { - fprintf(stderr, "Cube ERROR (ConvNetCharClassifier::LoadFoldingSets): " - "could not allocate folding set\n"); - fold_set_cnt_ = fold_set; - return false; - } - for (int ch = 0; ch < fold_set_len_[fold_set]; ch++) { - fold_sets_[fold_set][ch] = char_set_->ClassID(str32[ch]); - } - } - return true; -} - -// Init the classifier provided a data-path and a language string -bool HybridNeuralNetCharClassifier::Init(const string &data_file_path, - const string &lang, - 
LangModel *lang_mod) { - if (init_ == true) { - return true; - } - - // load the nets if any. This function will return true if the net file - // does not exist. But will fail if the net did not pass the sanity checks - if (!LoadNets(data_file_path, lang)) { - return false; - } - - // load the folding sets if any. This function will return true if the - // file does not exist. But will fail if the it did not pass the sanity checks - if (!LoadFoldingSets(data_file_path, lang, lang_mod)) { - return false; - } - - init_ = true; - return true; -} - -// Load the classifier's Neural Nets -// This function will return true if the net file does not exist. -// But will fail if the net did not pass the sanity checks -bool HybridNeuralNetCharClassifier::LoadNets(const string &data_file_path, - const string &lang) { - string hybrid_net_file; - string junk_net_file; - - // add the lang identifier - hybrid_net_file = data_file_path + lang; - hybrid_net_file += ".cube.hybrid"; - - // neural network is optional - FILE *fp = fopen(hybrid_net_file.c_str(), "rb"); - if (fp == NULL) { - return true; - } - fclose(fp); - - string str; - if (!CubeUtils::ReadFileToString(hybrid_net_file, &str)) { - return false; - } - - // split into lines - vector str_vec; - CubeUtils::SplitStringUsing(str, "\r\n", &str_vec); - if (str_vec.size() <= 0) { - return false; - } - - // create and add the nets - nets_.resize(str_vec.size(), NULL); - net_wgts_.resize(str_vec.size(), 0); - int total_input_size = 0; - for (int net_idx = 0; net_idx < str_vec.size(); net_idx++) { - // parse the string - vector tokens_vec; - CubeUtils::SplitStringUsing(str_vec[net_idx], " \t", &tokens_vec); - // has to be 2 tokens, net name and input size - if (tokens_vec.size() != 2) { - return false; - } - // load the net - string net_file_name = data_file_path + tokens_vec[0]; - nets_[net_idx] = tesseract::NeuralNet::FromFile(net_file_name); - if (nets_[net_idx] == NULL) { - return false; - } - // parse the input size and 
validate it - net_wgts_[net_idx] = atof(tokens_vec[1].c_str()); - if (net_wgts_[net_idx] < 0.0) { - return false; - } - total_input_size += nets_[net_idx]->in_cnt(); - } - // validate total input count - if (total_input_size != feat_extract_->FeatureCnt()) { - return false; - } - // success - return true; -} -} // tesseract diff --git a/cube/hybrid_neural_net_classifier.h b/cube/hybrid_neural_net_classifier.h deleted file mode 100644 index 6ad6233f..00000000 --- a/cube/hybrid_neural_net_classifier.h +++ /dev/null @@ -1,90 +0,0 @@ -/********************************************************************** - * File: conv_net_classifier.h - * Description: Declaration of Convolutional-NeuralNet Character Classifier - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -#ifndef HYBRID_NEURAL_NET_CLASSIFIER_H -#define HYBRID_NEURAL_NET_CLASSIFIER_H - -#include -#include - -#include "char_samp.h" -#include "char_altlist.h" -#include "char_set.h" -#include "classifier_base.h" -#include "feature_base.h" -#include "lang_model.h" -#include "neural_net.h" -#include "tuning_params.h" - -namespace tesseract { - -// Folding Ratio is the ratio of the max-activation of members of a folding -// set that is used to compute the min-activation of the rest of the set -// static const float kFoldingRatio = 0.75; // see conv_net_classifier.h - -class HybridNeuralNetCharClassifier : public CharClassifier { - public: - HybridNeuralNetCharClassifier(CharSet *char_set, TuningParams *params, - FeatureBase *feat_extract); - virtual ~HybridNeuralNetCharClassifier(); - // The main training function. Given a sample and a class ID the classifier - // updates its parameters according to its learning algorithm. This function - // is currently not implemented. TODO(ahmadab): implement end-2-end training - virtual bool Train(CharSamp *char_samp, int ClassID); - // A secondary function needed for training. Allows the trainer to set the - // value of any train-time parameter. This function is currently not - // implemented. TODO(ahmadab): implement end-2-end training - virtual bool SetLearnParam(char *var_name, float val); - // Externally sets the Neural Net used by the classifier. 
Used for training - void SetNet(tesseract::NeuralNet *net); - - // Classifies an input charsamp and return a CharAltList object containing - // the possible candidates and corresponding scores - virtual CharAltList *Classify(CharSamp *char_samp); - // Computes the cost of a specific charsamp being a character (versus a - // non-character: part-of-a-character OR more-than-one-character) - virtual int CharCost(CharSamp *char_samp); - - private: - // Neural Net object used for classification - vector nets_; - vector net_wgts_; - - // data buffers used to hold Neural Net inputs and outputs - float *net_input_; - float *net_output_; - - // Init the classifier provided a data-path and a language string - virtual bool Init(const string &data_file_path, const string &lang, - LangModel *lang_mod); - // Loads the NeuralNets needed for the classifier - bool LoadNets(const string &data_file_path, const string &lang); - // Load folding sets - // This function returns true on success or if the file can't be read, - // returns false if an error is encountered. - virtual bool LoadFoldingSets(const string &data_file_path, - const string &lang, - LangModel *lang_mod); - // Folds the output of the NeuralNet using the loaded folding sets - virtual void Fold(); - // Scales the input char_samp and feeds it to the NeuralNet as input - bool RunNets(CharSamp *char_samp); -}; -} -#endif // HYBRID_NEURAL_NET_CLASSIFIER_H diff --git a/cube/lang_mod_edge.h b/cube/lang_mod_edge.h deleted file mode 100644 index 19897942..00000000 --- a/cube/lang_mod_edge.h +++ /dev/null @@ -1,73 +0,0 @@ -/********************************************************************** - * File: lang_mod_edge.h - * Description: Declaration of the Language Model Edge Base Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The LangModEdge abstracts an Edge in the language model trie -// This is an abstract class that any Language Model Edge should inherit from -// It provides methods for: -// 1- Returns the class ID corresponding to the edge -// 2- If the edge is a valid EndOfWord (EOW) -// 3- If the edge is coming from a OutOfDictionary (OOF) state machine -// 4- If the edge is a Terminal (has no children) -// 5- A Hash of the edge that will be used to retrieve the edge -// quickly from the BeamSearch lattice -// 6- If two edges are identcial -// 7- Returns a verbal description of the edge (use by debuggers) -// 8- the language model cost of the edge (if any) -// 9- The string corresponding to this edge -// 10- Getting and setting the "Root" status of the edge - -#ifndef LANG_MOD_EDGE_H -#define LANG_MOD_EDGE_H - -#include "cube_tuning_params.h" -#include "char_set.h" - -namespace tesseract { - -class LangModEdge { - public: - LangModEdge() {} - virtual ~LangModEdge() {} - - // The string corresponding to this edge - virtual const char_32 * EdgeString() const = 0; - // Returns the class ID corresponding to the edge - virtual int ClassID() const = 0; - // If the edge is the root edge - virtual bool IsRoot() const = 0; - // Set the Root flag - virtual void SetRoot(bool flag) = 0; - // If the edge is a valid EndOfWord (EOW) - virtual bool IsEOW() const = 0; - // is the edge is coming from a OutOfDictionary (OOF) state machine - virtual bool IsOOD() const = 0; - // Is the edge is a 
Terminal (has no children) - virtual bool IsTerminal() const = 0; - // Returns A hash of the edge that will be used to retrieve the edge - virtual unsigned int Hash() const = 0; - // Are the two edges identcial? - virtual bool IsIdentical(LangModEdge *edge) const = 0; - // a verbal description of the edge (use by debuggers) - virtual char *Description() const = 0; - // the language model cost of the edge (if any) - virtual int PathCost() const = 0; -}; -} - -#endif // LANG_MOD_EDGE_H diff --git a/cube/lang_model.h b/cube/lang_model.h deleted file mode 100644 index a29bc1e3..00000000 --- a/cube/lang_model.h +++ /dev/null @@ -1,78 +0,0 @@ -/********************************************************************** - * File: lang_model.h - * Description: Declaration of the Language Model Edge Base Class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The LanguageModel class abstracts a State machine that is modeled as a Trie -// structure. 
The state machine models the language being recognized by the OCR -// Engine -// This is an abstract class that is to be inherited by any language model - -#ifndef LANG_MODEL_H -#define LANG_MODEL_H - -#include "lang_mod_edge.h" -#include "char_altlist.h" -#include "char_set.h" -#include "tuning_params.h" - -namespace tesseract { -class LangModel { - public: - LangModel() { - ood_enabled_ = true; - numeric_enabled_ = true; - word_list_enabled_ = true; - punc_enabled_ = true; - } - virtual ~LangModel() {} - - // Returns an edge pointer to the Root - virtual LangModEdge *Root() = 0; - // Returns the edges that fan-out of the specified edge and their count - virtual LangModEdge **GetEdges(CharAltList *alt_list, - LangModEdge *parent_edge, - int *edge_cnt) = 0; - // Returns is a sequence of 32-bit characters are valid within this language - // model or net. And EndOfWord flag is specified. If true, the sequence has - // to end on a valid word. The function also optionally returns the list - // of language model edges traversed to parse the string - virtual bool IsValidSequence(const char_32 *str, bool eow_flag, - LangModEdge **edge_array = NULL) = 0; - virtual bool IsLeadingPunc(char_32 ch) = 0; - virtual bool IsTrailingPunc(char_32 ch) = 0; - virtual bool IsDigit(char_32 ch) = 0; - - // accessor functions - inline bool OOD() { return ood_enabled_; } - inline bool Numeric() { return numeric_enabled_; } - inline bool WordList() { return word_list_enabled_; } - inline bool Punc() { return punc_enabled_; } - inline void SetOOD(bool ood) { ood_enabled_ = ood; } - inline void SetNumeric(bool numeric) { numeric_enabled_ = numeric; } - inline void SetWordList(bool word_list) { word_list_enabled_ = word_list; } - inline void SetPunc(bool punc_enabled) { punc_enabled_ = punc_enabled; } - - protected: - bool ood_enabled_; - bool numeric_enabled_; - bool word_list_enabled_; - bool punc_enabled_; -}; -} - -#endif // LANG_MODEL_H diff --git a/cube/search_column.cpp 
b/cube/search_column.cpp deleted file mode 100644 index 9a042d01..00000000 --- a/cube/search_column.cpp +++ /dev/null @@ -1,229 +0,0 @@ -/********************************************************************** - * File: search_column.cpp - * Description: Implementation of the Beam Search Column Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include "search_column.h" -#include - -namespace tesseract { - -SearchColumn::SearchColumn(int col_idx, int max_node) { - col_idx_ = col_idx; - node_cnt_ = 0; - node_array_ = NULL; - max_node_cnt_ = max_node; - node_hash_table_ = NULL; - init_ = false; - min_cost_ = INT_MAX; - max_cost_ = 0; -} - -// Cleanup data -void SearchColumn::Cleanup() { - if (node_array_ != NULL) { - for (int node_idx = 0; node_idx < node_cnt_; node_idx++) { - if (node_array_[node_idx] != NULL) { - delete node_array_[node_idx]; - } - } - - delete []node_array_; - node_array_ = NULL; - } - FreeHashTable(); - init_ = false; -} - -SearchColumn::~SearchColumn() { - Cleanup(); -} - -// Initializations -bool SearchColumn::Init() { - if (init_ == true) { - return true; - } - - // create hash table - if (node_hash_table_ == NULL) { - node_hash_table_ = new SearchNodeHashTable(); - if (node_hash_table_ == NULL) { - return false; - } - } - - init_ = true; - - return true; -} - -// Prune 
the nodes if necessary. Pruning is done such that a max -// number of nodes is kept, i.e., the beam width -void SearchColumn::Prune() { - // no need to prune - if (node_cnt_ <= max_node_cnt_) { - return; - } - - // compute the cost histogram - memset(score_bins_, 0, sizeof(score_bins_)); - int cost_range = max_cost_ - min_cost_ + 1; - for (int node_idx = 0; node_idx < node_cnt_; node_idx++) { - int cost_bin = static_cast( - ((node_array_[node_idx]->BestCost() - min_cost_) * - kScoreBins) / static_cast(cost_range)); - if (cost_bin >= kScoreBins) { - cost_bin = kScoreBins - 1; - } - score_bins_[cost_bin]++; - } - - // determine the pruning cost by scanning the cost histogram from - // least to greatest cost bins and finding the cost at which the - // max number of nodes is exceeded - int pruning_cost = 0; - int new_node_cnt = 0; - for (int cost_bin = 0; cost_bin < kScoreBins; cost_bin++) { - if (new_node_cnt > 0 && - (new_node_cnt + score_bins_[cost_bin]) > max_node_cnt_) { - pruning_cost = min_cost_ + ((cost_bin * cost_range) / kScoreBins); - break; - } - new_node_cnt += score_bins_[cost_bin]; - } - - // prune out all the nodes above this cost - for (int node_idx = new_node_cnt = 0; node_idx < node_cnt_; node_idx++) { - // prune this node out - if (node_array_[node_idx]->BestCost() > pruning_cost || - new_node_cnt > max_node_cnt_) { - delete node_array_[node_idx]; - } else { - // keep it - node_array_[new_node_cnt++] = node_array_[node_idx]; - } - } - node_cnt_ = new_node_cnt; -} - -// sort all nodes -void SearchColumn::Sort() { - if (node_cnt_ > 0 && node_array_ != NULL) { - qsort(node_array_, node_cnt_, sizeof(*node_array_), - SearchNode::SearchNodeComparer); - } -} - -// add a new node -SearchNode *SearchColumn::AddNode(LangModEdge *edge, int reco_cost, - SearchNode *parent_node, - CubeRecoContext *cntxt) { - // init if necessary - if (init_ == false && Init() == false) { - return NULL; - } - - // find out if we have an node with the same edge - // look in the 
hash table - SearchNode *new_node = node_hash_table_->Lookup(edge, parent_node); - // node does not exist - if (new_node == NULL) { - new_node = new SearchNode(cntxt, parent_node, reco_cost, edge, col_idx_); - if (new_node == NULL) { - return NULL; - } - - // if the max node count has already been reached, check if the cost of - // the new node exceeds the max cost. This indicates that it will be pruned - // and so there is no point adding it - if (node_cnt_ >= max_node_cnt_ && new_node->BestCost() > max_cost_) { - delete new_node; - return NULL; - } - - // expand the node buffer if necc - if ((node_cnt_ % kNodeAllocChunk) == 0) { - // alloc a new buff - SearchNode **new_node_buff = - new SearchNode *[node_cnt_ + kNodeAllocChunk]; - if (new_node_buff == NULL) { - delete new_node; - return NULL; - } - - // free existing after copying contents - if (node_array_ != NULL) { - memcpy(new_node_buff, node_array_, node_cnt_ * sizeof(*new_node_buff)); - delete []node_array_; - } - - node_array_ = new_node_buff; - } - - // add the node to the hash table only if it is non-OOD edge - // because the langmod state is not unique - if (edge->IsOOD() == false) { - if (!node_hash_table_->Insert(edge, new_node)) { - tprintf("Hash table full!!!"); - delete new_node; - return NULL; - } - } - - node_array_[node_cnt_++] = new_node; - - } else { - // node exists before - // if no update occurred, return NULL - if (new_node->UpdateParent(parent_node, reco_cost, edge) == false) { - new_node = NULL; - } - - // free the edge - if (edge != NULL) { - delete edge; - } - } - - // update Min and Max Costs - if (new_node != NULL) { - if (min_cost_ > new_node->BestCost()) { - min_cost_ = new_node->BestCost(); - } - - if (max_cost_ < new_node->BestCost()) { - max_cost_ = new_node->BestCost(); - } - } - - return new_node; -} - -SearchNode *SearchColumn::BestNode() { - SearchNode *best_node = NULL; - - for (int node_idx = 0; node_idx < node_cnt_; node_idx++) { - if (best_node == NULL || - 
best_node->BestCost() > node_array_[node_idx]->BestCost()) { - best_node = node_array_[node_idx]; - } - } - - return best_node; -} -} // namespace tesseract diff --git a/cube/search_column.h b/cube/search_column.h deleted file mode 100644 index da077d68..00000000 --- a/cube/search_column.h +++ /dev/null @@ -1,84 +0,0 @@ -/********************************************************************** - * File: search_column.h - * Description: Declaration of the Beam Search Column Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The SearchColumn class abstracts a column in the lattice that is created -// by the BeamSearch during the recognition process -// The class holds the lattice nodes. New nodes are added by calls to AddNode -// made from the BeamSearch -// The class maintains a hash table of the nodes to be able to lookup nodes -// quickly using their lang_mod_edge. 
This is needed to merge similar paths -// in the lattice - -#ifndef SEARCH_COLUMN_H -#define SEARCH_COLUMN_H - -#include "search_node.h" -#include "lang_mod_edge.h" -#include "cube_reco_context.h" - -namespace tesseract { - -class SearchColumn { - public: - SearchColumn(int col_idx, int max_node_cnt); - ~SearchColumn(); - // Accessor functions - inline int ColIdx() const { return col_idx_; } - inline int NodeCount() const { return node_cnt_; } - inline SearchNode **Nodes() const { return node_array_; } - - // Prune the nodes if necessary. Pruning is done such that a max - // number of nodes is kept, i.e., the beam width - void Prune(); - SearchNode *AddNode(LangModEdge *edge, int score, - SearchNode *parent, CubeRecoContext *cntxt); - // Returns the node with the least cost - SearchNode *BestNode(); - // Sort the lattice nodes. Needed for visualization - void Sort(); - // Free up the Hash Table. Added to be called by the Beam Search after - // a column is pruned to reduce memory foot print - void FreeHashTable() { - if (node_hash_table_ != NULL) { - delete node_hash_table_; - node_hash_table_ = NULL; - } - } - - private: - static const int kNodeAllocChunk = 1024; - static const int kScoreBins = 1024; - bool init_; - int min_cost_; - int max_cost_; - int max_node_cnt_; - int node_cnt_; - int col_idx_; - int score_bins_[kScoreBins]; - SearchNode **node_array_; - SearchNodeHashTable *node_hash_table_; - - // Free node array and hash table - void Cleanup(); - // Create hash table - bool Init(); -}; -} - -#endif // SEARCH_COLUMN_H diff --git a/cube/search_node.cpp b/cube/search_node.cpp deleted file mode 100644 index ff5bfbd8..00000000 --- a/cube/search_node.cpp +++ /dev/null @@ -1,232 +0,0 @@ -/********************************************************************** - * File: search_node.cpp - * Description: Implementation of the Beam Search Node Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. 
- ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include "search_node.h" - -namespace tesseract { - -// The constructor updates the best paths and costs: -// mean_char_reco_cost_ (returned by BestRecoCost()) is the mean -// char_reco cost of the best_path, including this node. -// best_path_reco_cost is the total char_reco_cost of the best_path, -// but excludes the char_reco_cost of this node. -// best_cost is the mean mixed cost, i.e., mean_char_reco_cost_ + -// current language model cost, all weighted by the cube context's -// RecoWgt parameter -SearchNode::SearchNode(CubeRecoContext *cntxt, SearchNode *parent_node, - int char_reco_cost, LangModEdge *edge, int col_idx) { - // copy data members - cntxt_ = cntxt; - lang_mod_edge_ = edge; - col_idx_ = col_idx; - parent_node_ = parent_node; - char_reco_cost_ = char_reco_cost; - - // the string of this node is the same as that of the language model edge - str_ = (edge == NULL ? NULL : edge->EdgeString()); - - // compute best path total reco cost - best_path_reco_cost_ = (parent_node_ == NULL) ? 0 : - parent_node_->CharRecoCost() + parent_node_->BestPathRecoCost(); - - // update best path length - best_path_len_ = (parent_node_ == NULL) ? 
- 1 : parent_node_->BestPathLength() + 1; - if (edge != NULL && edge->IsRoot() && parent_node_ != NULL) { - best_path_len_++; - } - - // compute best reco cost mean cost - mean_char_reco_cost_ = static_cast( - (best_path_reco_cost_ + char_reco_cost_) / - static_cast(best_path_len_)); - - // get language model cost - int lm_cost = LangModCost(lang_mod_edge_, parent_node_); - - // compute aggregate best cost - best_cost_ = static_cast(cntxt_->Params()->RecoWgt() * - (best_path_reco_cost_ + char_reco_cost_) / - static_cast(best_path_len_) - ) + lm_cost; -} - -SearchNode::~SearchNode() { - if (lang_mod_edge_ != NULL) { - delete lang_mod_edge_; - } -} - -// update the parent_node node if provides a better (less) cost -bool SearchNode::UpdateParent(SearchNode *new_parent, int new_reco_cost, - LangModEdge *new_edge) { - if (lang_mod_edge_ == NULL) { - if (new_edge != NULL) { - return false; - } - } else { - // to update the parent_node, we have to have the same target - // state and char - if (new_edge == NULL || !lang_mod_edge_->IsIdentical(new_edge) || - !SearchNode::IdenticalPath(parent_node_, new_parent)) { - return false; - } - } - - // compute the path cost and combined cost of the new path - int new_best_path_reco_cost; - int new_cost; - int new_best_path_len; - - new_best_path_reco_cost = (new_parent == NULL) ? - 0 : new_parent->BestPathRecoCost() + new_parent->CharRecoCost(); - - new_best_path_len = - (new_parent == NULL) ? 
1 : new_parent->BestPathLength() + 1; - - // compute the new language model cost - int new_lm_cost = LangModCost(new_edge, new_parent); - - new_cost = static_cast(cntxt_->Params()->RecoWgt() * - (new_best_path_reco_cost + new_reco_cost) / - static_cast(new_best_path_len) - ) + new_lm_cost; - - // update if it is better (less) than the current one - if (best_cost_ > new_cost) { - parent_node_ = new_parent; - char_reco_cost_ = new_reco_cost; - best_path_reco_cost_ = new_best_path_reco_cost; - best_path_len_ = new_best_path_len; - mean_char_reco_cost_ = static_cast( - (best_path_reco_cost_ + char_reco_cost_) / - static_cast(best_path_len_)); - best_cost_ = static_cast(cntxt_->Params()->RecoWgt() * - (best_path_reco_cost_ + char_reco_cost_) / - static_cast(best_path_len_) - ) + new_lm_cost; - return true; - } - return false; -} - -char_32 *SearchNode::PathString() { - SearchNode *node = this; - - // compute string length - int len = 0; - - while (node != NULL) { - if (node->str_ != NULL) { - len += CubeUtils::StrLen(node->str_); - } - - // if the edge is a root and does not have a NULL parent, account for space - LangModEdge *lm_edge = node->LangModelEdge(); - if (lm_edge != NULL && lm_edge->IsRoot() && node->ParentNode() != NULL) { - len++; - } - - node = node->parent_node_; - } - - char_32 *char_ptr = new char_32[len + 1]; - if (char_ptr == NULL) { - return NULL; - } - - int ch_idx = len; - - node = this; - char_ptr[ch_idx--] = 0; - - while (node != NULL) { - int str_len = ((node->str_ == NULL) ? 
0 : CubeUtils::StrLen(node->str_)); - while (str_len > 0) { - char_ptr[ch_idx--] = node->str_[--str_len]; - } - - // if the edge is a root and does not have a NULL parent, insert a space - LangModEdge *lm_edge = node->LangModelEdge(); - if (lm_edge != NULL && lm_edge->IsRoot() && node->ParentNode() != NULL) { - char_ptr[ch_idx--] = (char_32)' '; - } - - node = node->parent_node_; - } - - return char_ptr; -} - -// compares the path of two nodes and checks if its identical -bool SearchNode::IdenticalPath(SearchNode *node1, SearchNode *node2) { - if (node1 != NULL && node2 != NULL && - node1->best_path_len_ != node2->best_path_len_) { - return false; - } - - // backtrack until either a root or a NULL edge is reached - while (node1 != NULL && node2 != NULL) { - if (node1->str_ != node2->str_) { - return false; - } - - // stop if either nodes is a root - if (node1->LangModelEdge()->IsRoot() || node2->LangModelEdge()->IsRoot()) { - break; - } - - node1 = node1->parent_node_; - node2 = node2->parent_node_; - } - - return ((node1 == NULL && node2 == NULL) || - (node1 != NULL && node1->LangModelEdge()->IsRoot() && - node2 != NULL && node2->LangModelEdge()->IsRoot())); -} - -// Computes the language model cost of a path -int SearchNode::LangModCost(LangModEdge *current_lm_edge, - SearchNode *parent_node) { - int lm_cost = 0; - int node_cnt = 0; - - do { - // check if root - bool is_root = ((current_lm_edge != NULL && current_lm_edge->IsRoot()) || - parent_node == NULL); - if (is_root) { - node_cnt++; - lm_cost += (current_lm_edge == NULL ? 
0 : current_lm_edge->PathCost()); - } - - // continue until we hit a null parent - if (parent_node == NULL) { - break; - } - - // get the previous language model edge - current_lm_edge = parent_node->LangModelEdge(); - // back track - parent_node = parent_node->ParentNode(); - } while (true); - - return static_cast(lm_cost / static_cast(node_cnt)); -} -} // namespace tesseract diff --git a/cube/search_node.h b/cube/search_node.h deleted file mode 100644 index b4b69b8c..00000000 --- a/cube/search_node.h +++ /dev/null @@ -1,168 +0,0 @@ -/********************************************************************** - * File: search_node.h - * Description: Declaration of the Beam Search Node Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The SearchNode class abstracts the search lattice node in the lattice -// generated by the BeamSearch class -// The SearchNode class holds the lang_mod_edge associated with the lattice -// node. 
It also holds a pointer to the parent SearchNode in the search path -// In addition it holds the recognition and the language model costs of the -// node and the path leading to this node - -#ifndef SEARCH_NODE_H -#define SEARCH_NODE_H - -#include "lang_mod_edge.h" -#include "cube_reco_context.h" - -namespace tesseract { - -class SearchNode { - public: - SearchNode(CubeRecoContext *cntxt, SearchNode *parent_node, - int char_reco_cost, LangModEdge *edge, int col_idx); - - ~SearchNode(); - - // Updates the parent of the current node if the specified path yields - // a better path cost - bool UpdateParent(SearchNode *new_parent, int new_reco_cost, - LangModEdge *new_edge); - // returns the 32-bit string corresponding to the path leading to this node - char_32 *PathString(); - // True if the two input nodes correspond to the same path - static bool IdenticalPath(SearchNode *node1, SearchNode *node2); - - inline const char_32 *NodeString() { return str_; } - inline void SetString(char_32 *str) { str_ = str; } - - // This node's character recognition cost. - inline int CharRecoCost() { return char_reco_cost_; } - // Total character recognition cost of the nodes in the best path, - // excluding this node. - inline int BestPathRecoCost() { return best_path_reco_cost_; } - // Number of nodes in best path. - inline int BestPathLength() { return best_path_len_; } - // Mean mixed cost, i.e., mean character recognition cost + - // current language model cost, all weighted by the RecoWgt parameter - inline int BestCost() { return best_cost_; } - // Mean character recognition cost of the nodes on the best path, - // including this node. 
- inline int BestRecoCost() { return mean_char_reco_cost_ ; } - - inline int ColIdx() { return col_idx_; } - inline SearchNode *ParentNode() { return parent_node_; } - inline LangModEdge *LangModelEdge() { return lang_mod_edge_;} - inline int LangModCost() { return LangModCost(lang_mod_edge_, parent_node_); } - - // A comparer function that allows the SearchColumn class to sort the - // nodes based on the path cost - inline static int SearchNodeComparer(const void *node1, const void *node2) { - return (*(reinterpret_cast(node1)))->best_cost_ - - (*(reinterpret_cast(node2)))->best_cost_; - } - - private: - CubeRecoContext *cntxt_; - // Character code - const char_32 *str_; - // Recognition cost of most recent character - int char_reco_cost_; - // Mean mixed cost, i.e., mean character recognition cost + - // current language model cost, all weighted by the RecoWgt parameter - int best_cost_; - // Mean character recognition cost of the nodes on the best path, - // including this node. - int mean_char_reco_cost_ ; - // Total character recognition cost of the nodes in the best path, - // excluding this node. - int best_path_reco_cost_; - // Number of nodes in best path. - int best_path_len_; - // Column index - int col_idx_; - // Parent Node - SearchNode *parent_node_; - // Language model edge - LangModEdge *lang_mod_edge_; - static int LangModCost(LangModEdge *lang_mod_edge, SearchNode *parent_node); -}; - -// Implments a SearchNode hash table used to detect if a Search Node exists -// or not. 
This is needed to make sure that identical paths in the BeamSearch -// converge -class SearchNodeHashTable { - public: - SearchNodeHashTable() { - memset(bin_size_array_, 0, sizeof(bin_size_array_)); - } - - ~SearchNodeHashTable() { - } - - // inserts an entry in the hash table - inline bool Insert(LangModEdge *lang_mod_edge, SearchNode *srch_node) { - // compute hash based on the edge and its parent node edge - unsigned int edge_hash = lang_mod_edge->Hash(); - unsigned int parent_hash = (srch_node->ParentNode() == NULL ? - 0 : srch_node->ParentNode()->LangModelEdge()->Hash()); - unsigned int hash_bin = (edge_hash + parent_hash) % kSearchNodeHashBins; - - // already maxed out, just fail - if (bin_size_array_[hash_bin] >= kMaxSearchNodePerBin) { - return false; - } - - bin_array_[hash_bin][bin_size_array_[hash_bin]++] = srch_node; - - return true; - } - - // Looks up an entry in the hash table - inline SearchNode *Lookup(LangModEdge *lang_mod_edge, - SearchNode *parent_node) { - // compute hash based on the edge and its parent node edge - unsigned int edge_hash = lang_mod_edge->Hash(); - unsigned int parent_hash = (parent_node == NULL ? - 0 : parent_node->LangModelEdge()->Hash()); - unsigned int hash_bin = (edge_hash + parent_hash) % kSearchNodeHashBins; - - // lookup the entries in the hash bin - for (int node_idx = 0; node_idx < bin_size_array_[hash_bin]; node_idx++) { - if (lang_mod_edge->IsIdentical( - bin_array_[hash_bin][node_idx]->LangModelEdge()) == true && - SearchNode::IdenticalPath( - bin_array_[hash_bin][node_idx]->ParentNode(), parent_node) == true) { - return bin_array_[hash_bin][node_idx]; - } - } - - return NULL; - } - - private: - // Hash bin size parameters. These were determined emperically. 
These affect - // the speed of the beam search but have no impact on accuracy - static const int kSearchNodeHashBins = 4096; - static const int kMaxSearchNodePerBin = 512; - int bin_size_array_[kSearchNodeHashBins]; - SearchNode *bin_array_[kSearchNodeHashBins][kMaxSearchNodePerBin]; -}; -} - -#endif // SEARCH_NODE_H diff --git a/cube/search_object.h b/cube/search_object.h deleted file mode 100644 index 84b866e6..00000000 --- a/cube/search_object.h +++ /dev/null @@ -1,55 +0,0 @@ -/********************************************************************** - * File: search_object.h - * Description: Declaration of the Beam Search Object Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The SearchObject class represents a char_samp (a word bitmap) that is -// being searched for characters (or recognizeable entities). 
-// This is an abstract class that all SearchObjects should inherit from -// A SearchObject class provides methods to: -// 1- Returns the count of segments -// 2- Recognize a segment range -// 3- Creates a CharSamp for a segment range - -#ifndef SEARCH_OBJECT_H -#define SEARCH_OBJECT_H - -#include "char_altlist.h" -#include "char_samp.h" -#include "cube_reco_context.h" - -namespace tesseract { -class SearchObject { - public: - explicit SearchObject(CubeRecoContext *cntxt) { cntxt_ = cntxt; } - virtual ~SearchObject() {} - - virtual int SegPtCnt() = 0; - virtual CharAltList *RecognizeSegment(int start_pt, int end_pt) = 0; - virtual CharSamp *CharSample(int start_pt, int end_pt) = 0; - virtual Box* CharBox(int start_pt, int end_pt) = 0; - - virtual int SpaceCost(int seg_pt) = 0; - virtual int NoSpaceCost(int seg_pt) = 0; - virtual int NoSpaceCost(int start_pt, int end_pt) = 0; - - protected: - CubeRecoContext *cntxt_; -}; -} - -#endif // SEARCH_OBJECT_H diff --git a/cube/string_32.h b/cube/string_32.h deleted file mode 100644 index 0ae0ceec..00000000 --- a/cube/string_32.h +++ /dev/null @@ -1,44 +0,0 @@ -/********************************************************************** - * File: string_32.h - * Description: Declaration of a 32 Bit string class - * Author: Ahmad Abdulkader - * Created: 2007 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -// the string_32 class provides the functionality needed -// for a 32-bit string class - -#ifndef STRING_32_H -#define STRING_32_H - -#include -#include -#include -#include - -#ifdef USE_STD_NAMESPACE -using std::basic_string; -using std::string; -using std::vector; -#endif - -namespace tesseract { - -// basic definitions -typedef signed int char_32; -typedef basic_string string_32; -} - -#endif // STRING_32_H diff --git a/cube/tess_lang_mod_edge.cpp b/cube/tess_lang_mod_edge.cpp deleted file mode 100644 index 4d16f3ac..00000000 --- a/cube/tess_lang_mod_edge.cpp +++ /dev/null @@ -1,124 +0,0 @@ -/********************************************************************** - * File: tess_lang_mod_edge.cpp - * Description: Implementation of the Tesseract Language Model Edge Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -#include "tess_lang_mod_edge.h" -#include "const.h" -#include "unichar.h" - - - -namespace tesseract { -// OOD constructor -TessLangModEdge::TessLangModEdge(CubeRecoContext *cntxt, int class_id) { - root_ = false; - cntxt_ = cntxt; - dawg_ = NULL; - start_edge_ = 0; - end_edge_ = 0; - edge_mask_ = 0; - class_id_ = class_id; - str_ = cntxt_->CharacterSet()->ClassString(class_id); - path_cost_ = Cost(); -} - -/** - * leading, trailing punc constructor and single byte UTF char - */ -TessLangModEdge::TessLangModEdge(CubeRecoContext *cntxt, - const Dawg *dawg, EDGE_REF edge_idx, int class_id) { - root_ = false; - cntxt_ = cntxt; - dawg_ = dawg; - start_edge_ = edge_idx; - end_edge_ = edge_idx; - edge_mask_ = 0; - class_id_ = class_id; - str_ = cntxt_->CharacterSet()->ClassString(class_id); - path_cost_ = Cost(); -} - -/** - * dict constructor: multi byte UTF char -*/ -TessLangModEdge::TessLangModEdge(CubeRecoContext *cntxt, const Dawg *dawg, - EDGE_REF start_edge_idx, EDGE_REF end_edge_idx, - int class_id) { - root_ = false; - cntxt_ = cntxt; - dawg_ = dawg; - start_edge_ = start_edge_idx; - end_edge_ = end_edge_idx; - edge_mask_ = 0; - class_id_ = class_id; - str_ = cntxt_->CharacterSet()->ClassString(class_id); - path_cost_ = Cost(); -} - -char *TessLangModEdge::Description() const { - char *char_ptr = new char[256]; - if (!char_ptr) { - return NULL; - } - - char dawg_str[256]; - char edge_str[32]; - if (dawg_ == (Dawg *)DAWG_OOD) { - strcpy(dawg_str, "OOD"); - } else if (dawg_ == (Dawg *)DAWG_NUMBER) { - strcpy(dawg_str, "NUM"); - } else if (dawg_->permuter() == SYSTEM_DAWG_PERM) { - strcpy(dawg_str, "Main"); - } else if (dawg_->permuter() == USER_DAWG_PERM) { - strcpy(dawg_str, "User"); - } else if (dawg_->permuter() == DOC_DAWG_PERM) { - strcpy(dawg_str, "Doc"); - } else { - strcpy(dawg_str, "N/A"); - } - - sprintf(edge_str, "%d", static_cast(start_edge_)); - if 
(IsLeadingPuncEdge(edge_mask_)) { - strcat(edge_str, "-LP"); - } - if (IsTrailingPuncEdge(edge_mask_)) { - strcat(edge_str, "-TP"); - } - sprintf(char_ptr, "%s(%s)%s, Wtd Dawg Cost=%d", - dawg_str, edge_str, IsEOW() ? "-EOW-" : "", path_cost_); - - return char_ptr; -} - -int TessLangModEdge::CreateChildren(CubeRecoContext *cntxt, - const Dawg *dawg, - NODE_REF parent_node, - LangModEdge **edge_array) { - int edge_cnt = 0; - NodeChildVector vec; - dawg->unichar_ids_of(parent_node, &vec, false); // find all children - for (int i = 0; i < vec.size(); ++i) { - const NodeChild &child = vec[i]; - if (child.unichar_id == INVALID_UNICHAR_ID) continue; - edge_array[edge_cnt] = - new TessLangModEdge(cntxt, dawg, child.edge_ref, child.unichar_id); - if (edge_array[edge_cnt] != NULL) edge_cnt++; - } - return edge_cnt; -} -} diff --git a/cube/tess_lang_mod_edge.h b/cube/tess_lang_mod_edge.h deleted file mode 100644 index adad9518..00000000 --- a/cube/tess_lang_mod_edge.h +++ /dev/null @@ -1,233 +0,0 @@ -/********************************************************************** - * File: tess_lang_mod_edge.h - * Description: Declaration of the Tesseract Language Model Edge Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -// The TessLangModEdge models an edge in the Tesseract language models -// It inherits from the LangModEdge class - -#ifndef TESS_LANG_MOD_EDGE_H -#define TESS_LANG_MOD_EDGE_H - -#include "dawg.h" -#include "char_set.h" - -#include "lang_mod_edge.h" -#include "cube_reco_context.h" -#include "cube_utils.h" - -// Macros needed to identify punctuation in the langmodel state -#ifdef _HMSW32_H -#define LEAD_PUNC_EDGE_REF_MASK (inT64) 0x0000000100000000i64 -#define TRAIL_PUNC_EDGE_REF_MASK (inT64) 0x0000000200000000i64 -#define TRAIL_PUNC_REPEAT_MASK (inT64) 0xffff000000000000i64 -#else -#define LEAD_PUNC_EDGE_REF_MASK (inT64) 0x0000000100000000ll -#define TRAIL_PUNC_EDGE_REF_MASK (inT64) 0x0000000200000000ll -#define TRAIL_PUNC_REPEAT_MASK (inT64) 0xffff000000000000ll -#endif - -// Number state machine macros -#define NUMBER_STATE_SHIFT 0 -#define NUMBER_STATE_MASK 0x0000000fl -#define NUMBER_LITERAL_SHIFT 4 -#define NUMBER_LITERAL_MASK 0x000000f0l -#define NUMBER_REPEAT_SHIFT 8 -#define NUMBER_REPEAT_MASK 0x00000f00l -#define NUM_TRM -99 -#define TRAIL_PUNC_REPEAT_SHIFT 48 - -#define IsLeadingPuncEdge(edge_mask) \ - ((edge_mask & LEAD_PUNC_EDGE_REF_MASK) != 0) -#define IsTrailingPuncEdge(edge_mask) \ - ((edge_mask & TRAIL_PUNC_EDGE_REF_MASK) != 0) -#define TrailingPuncCount(edge_mask) \ - ((edge_mask & TRAIL_PUNC_REPEAT_MASK) >> TRAIL_PUNC_REPEAT_SHIFT) -#define TrailingPuncEdgeMask(Cnt) \ - (TRAIL_PUNC_EDGE_REF_MASK | ((Cnt) << TRAIL_PUNC_REPEAT_SHIFT)) - -// State machine IDs -#define DAWG_OOD 0 -#define DAWG_NUMBER 1 - -namespace tesseract { -class TessLangModEdge : public LangModEdge { - public: - // Different ways of constructing a TessLangModEdge - TessLangModEdge(CubeRecoContext *cntxt, const Dawg *edge_array, - EDGE_REF edge, int class_id); - TessLangModEdge(CubeRecoContext *cntxt, const Dawg *edge_array, - EDGE_REF start_edge_idx, EDGE_REF end_edge_idx, - int class_id); - 
TessLangModEdge(CubeRecoContext *cntxt, int class_id); - ~TessLangModEdge() {} - - // Accessors - inline bool IsRoot() const { - return root_; - } - inline void SetRoot(bool flag) { root_ = flag; } - - inline bool IsOOD() const { - return (dawg_ == (Dawg *)DAWG_OOD); - } - - inline bool IsNumber() const { - return (dawg_ == (Dawg *)DAWG_NUMBER); - } - - inline bool IsEOW() const { - return (IsTerminal() || (dawg_->end_of_word(end_edge_) != 0)); - } - - inline const Dawg *GetDawg() const { return dawg_; } - inline EDGE_REF StartEdge() const { return start_edge_; } - inline EDGE_REF EndEdge() const { return end_edge_; } - inline EDGE_REF EdgeMask() const { return edge_mask_; } - inline const char_32 * EdgeString() const { return str_; } - inline int ClassID () const { return class_id_; } - inline int PathCost() const { return path_cost_; } - inline void SetEdgeMask(EDGE_REF edge_mask) { edge_mask_ = edge_mask; } - inline void SetDawg(Dawg *dawg) { dawg_ = dawg; } - inline void SetStartEdge(EDGE_REF edge_idx) { start_edge_ = edge_idx; } - inline void SetEndEdge(EDGE_REF edge_idx) { end_edge_ = edge_idx; } - - // is this a terminal node: - // we can terminate at any OOD char, trailing punc or - // when the dawg terminates - inline bool IsTerminal() const { - return (IsOOD() || IsNumber() || IsTrailingPuncEdge(start_edge_) || - dawg_->next_node(end_edge_) == 0); - } - - // How many signals does the LM provide for tuning. These are flags like: - // OOD or not, Number of not that are used by the training to compute - // extra costs for each word. 
- inline int SignalCnt() const { - return 2; - } - - // returns the weight assigned to a specified signal - inline double SignalWgt(int signal) const { - CubeTuningParams *params = - reinterpret_cast(cntxt_->Params()); - if (params != NULL) { - switch (signal) { - case 0: - return params->OODWgt(); - break; - - case 1: - return params->NumWgt(); - break; - } - } - - return 0.0; - } - - // sets the weight assigned to a specified signal: Used in training - void SetSignalWgt(int signal, double wgt) { - CubeTuningParams *params = - reinterpret_cast(cntxt_->Params()); - if (params != NULL) { - switch (signal) { - case 0: - params->SetOODWgt(wgt); - break; - - case 1: - params->SetNumWgt(wgt); - break; - } - } - } - - // returns the actual value of a specified signal - int Signal(int signal) { - switch (signal) { - case 0: - return IsOOD() ? MIN_PROB_COST : 0; - break; - - case 1: - return IsNumber() ? MIN_PROB_COST : 0; - break; - - default: - return 0; - } - } - - // returns the Hash value of the edge. 
Used by the SearchNode hash table - // to quickly lookup exisiting edges to converge during search - inline unsigned int Hash() const { - return static_cast( - ((start_edge_ | end_edge_) ^ ((reinterpret_cast(dawg_)))) ^ - ((unsigned int)edge_mask_) ^ class_id_); - } - - // A verbal description of the edge: Used by visualizers - char *Description() const; - - // Is this edge identical to the specified edge - inline bool IsIdentical(LangModEdge *lang_mod_edge) const { - return (class_id_ == - reinterpret_cast(lang_mod_edge)->class_id_ && - str_ == reinterpret_cast(lang_mod_edge)->str_ && - dawg_ == reinterpret_cast(lang_mod_edge)->dawg_ && - start_edge_ == - reinterpret_cast(lang_mod_edge)->start_edge_ && - end_edge_ == - reinterpret_cast(lang_mod_edge)->end_edge_ && - edge_mask_ == - reinterpret_cast(lang_mod_edge)->edge_mask_); - } - - // Creates a set of fan-out edges for the specified edge - static int CreateChildren(CubeRecoContext *cntxt, - const Dawg *edges, - NODE_REF edge_reg, - LangModEdge **lm_edges); - - private: - bool root_; - CubeRecoContext *cntxt_; - const Dawg *dawg_; - EDGE_REF start_edge_; - EDGE_REF end_edge_; - EDGE_REF edge_mask_; - int path_cost_; - int class_id_; - const char_32 * str_; - // returns the cost of the lang_mod_edge - inline int Cost() const { - if (cntxt_ != NULL) { - CubeTuningParams *params = - reinterpret_cast(cntxt_->Params()); - if (dawg_ == (Dawg *)DAWG_OOD) { - return static_cast(params->OODWgt() * MIN_PROB_COST); - } else if (dawg_ == (Dawg *)DAWG_NUMBER) { - return static_cast(params->NumWgt() * MIN_PROB_COST); - } - } - return 0; - } -}; -} // namespace tesseract - -#endif // TESS_LANG_MOD_EDGE_H diff --git a/cube/tess_lang_model.cpp b/cube/tess_lang_model.cpp deleted file mode 100644 index 51132072..00000000 --- a/cube/tess_lang_model.cpp +++ /dev/null @@ -1,523 +0,0 @@ -/********************************************************************** - * File: tess_lang_model.cpp - * Description: Implementation of the 
Tesseract Language Model Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The TessLangModel class abstracts the Tesseract language model. It inherits -// from the LangModel class. The Tesseract language model encompasses several -// Dawgs (words from training data, punctuation, numbers, document words). -// On top of this Cube adds an OOD state machine -// The class provides methods to traverse the language model in a generative -// fashion. Given any node in the DAWG, the language model can generate a list -// of children (or fan-out) edges - -#include -#include - -#include "char_samp.h" -#include "cube_utils.h" -#include "dict.h" -#include "tesseractclass.h" -#include "tess_lang_model.h" -#include "tessdatamanager.h" -#include "unicharset.h" - -namespace tesseract { -// max fan-out (used for preallocation). 
Initialized here, but modified by -// constructor -int TessLangModel::max_edge_ = 4096; - -// Language model extra State machines -const Dawg *TessLangModel::ood_dawg_ = reinterpret_cast(DAWG_OOD); -const Dawg *TessLangModel::number_dawg_ = reinterpret_cast(DAWG_NUMBER); - -// number state machine -const int TessLangModel::num_state_machine_[kStateCnt][kNumLiteralCnt] = { - {0, 1, 1, NUM_TRM, NUM_TRM}, - {NUM_TRM, 1, 1, 3, 2}, - {NUM_TRM, NUM_TRM, 1, NUM_TRM, 2}, - {NUM_TRM, NUM_TRM, 3, NUM_TRM, 2}, -}; -const int TessLangModel::num_max_repeat_[kStateCnt] = {3, 32, 8, 3}; - -// thresholds and penalties -int TessLangModel::max_ood_shape_cost_ = CubeUtils::Prob2Cost(1e-4); - -TessLangModel::TessLangModel(const string &lm_params, - const string &data_file_path, - bool load_system_dawg, - TessdataManager *tessdata_manager, - CubeRecoContext *cntxt) { - cntxt_ = cntxt; - has_case_ = cntxt_->HasCase(); - // Load the rest of the language model elements from file - LoadLangModelElements(lm_params); - // Load word_dawgs_ if needed. - if (tessdata_manager->SeekToStart(TESSDATA_CUBE_UNICHARSET)) { - word_dawgs_ = new DawgVector(); - if (load_system_dawg && - tessdata_manager->SeekToStart(TESSDATA_CUBE_SYSTEM_DAWG)) { - // The last parameter to the Dawg constructor (the debug level) is set to - // false, until Cube has a way to express its preferred debug level. - *word_dawgs_ += new SquishedDawg(tessdata_manager->GetDataFilePtr(), - DAWG_TYPE_WORD, - cntxt_->Lang().c_str(), - SYSTEM_DAWG_PERM, false); - } - } else { - word_dawgs_ = NULL; - } -} - -// Cleanup an edge array -void TessLangModel::FreeEdges(int edge_cnt, LangModEdge **edge_array) { - if (edge_array != NULL) { - for (int edge_idx = 0; edge_idx < edge_cnt; edge_idx++) { - if (edge_array[edge_idx] != NULL) { - delete edge_array[edge_idx]; - } - } - delete []edge_array; - } -} - -// Determines if a sequence of 32-bit chars is valid in this language model -// starting from the specified edge. 
If the eow_flag is ON, also checks for -// a valid EndOfWord. If final_edge is not NULL, returns a pointer to the last -// edge -bool TessLangModel::IsValidSequence(LangModEdge *edge, - const char_32 *sequence, - bool eow_flag, - LangModEdge **final_edge) { - // get the edges emerging from this edge - int edge_cnt = 0; - LangModEdge **edge_array = GetEdges(NULL, edge, &edge_cnt); - - // find the 1st char in the sequence in the children - for (int edge_idx = 0; edge_idx < edge_cnt; edge_idx++) { - // found a match - if (sequence[0] == edge_array[edge_idx]->EdgeString()[0]) { - // if this is the last char - if (sequence[1] == 0) { - // succeed if we are in prefix mode or this is a terminal edge - if (eow_flag == false || edge_array[edge_idx]->IsEOW()) { - if (final_edge != NULL) { - (*final_edge) = edge_array[edge_idx]; - edge_array[edge_idx] = NULL; - } - - FreeEdges(edge_cnt, edge_array); - return true; - } - } else { - // not the last char continue checking - if (IsValidSequence(edge_array[edge_idx], sequence + 1, eow_flag, - final_edge) == true) { - FreeEdges(edge_cnt, edge_array); - return true; - } - } - } - } - - FreeEdges(edge_cnt, edge_array); - return false; -} - -// Determines if a sequence of 32-bit chars is valid in this language model -// starting from the root. If the eow_flag is ON, also checks for -// a valid EndOfWord. 
If final_edge is not NULL, returns a pointer to the last -// edge -bool TessLangModel::IsValidSequence(const char_32 *sequence, bool eow_flag, - LangModEdge **final_edge) { - if (final_edge != NULL) { - (*final_edge) = NULL; - } - - return IsValidSequence(NULL, sequence, eow_flag, final_edge); -} - -bool TessLangModel::IsLeadingPunc(const char_32 ch) { - return lead_punc_.find(ch) != string::npos; -} - -bool TessLangModel::IsTrailingPunc(const char_32 ch) { - return trail_punc_.find(ch) != string::npos; -} - -bool TessLangModel::IsDigit(const char_32 ch) { - return digits_.find(ch) != string::npos; -} - -// The general fan-out generation function. Returns the list of edges -// fanning-out of the specified edge and their count. If an AltList is -// specified, only the class-ids with a minimum cost are considered -LangModEdge ** TessLangModel::GetEdges(CharAltList *alt_list, - LangModEdge *lang_mod_edge, - int *edge_cnt) { - TessLangModEdge *tess_lm_edge = - reinterpret_cast(lang_mod_edge); - LangModEdge **edge_array = NULL; - (*edge_cnt) = 0; - - // if we are starting from the root, we'll instantiate every DAWG - // and get the all the edges that emerge from the root - if (tess_lm_edge == NULL) { - // get DAWG count from Tesseract - int dawg_cnt = NumDawgs(); - // preallocate the edge buffer - (*edge_cnt) = dawg_cnt * max_edge_; - edge_array = new LangModEdge *[(*edge_cnt)]; - if (edge_array == NULL) { - return NULL; - } - - for (int dawg_idx = (*edge_cnt) = 0; dawg_idx < dawg_cnt; dawg_idx++) { - const Dawg *curr_dawg = GetDawg(dawg_idx); - // Only look through word Dawgs (since there is a special way of - // handling numbers and punctuation). 
- if (curr_dawg->type() == DAWG_TYPE_WORD) { - (*edge_cnt) += FanOut(alt_list, curr_dawg, 0, 0, NULL, true, - edge_array + (*edge_cnt)); - } - } // dawg - - (*edge_cnt) += FanOut(alt_list, number_dawg_, 0, 0, NULL, true, - edge_array + (*edge_cnt)); - - // OOD: it is intentionally not added to the list to make sure it comes - // at the end - (*edge_cnt) += FanOut(alt_list, ood_dawg_, 0, 0, NULL, true, - edge_array + (*edge_cnt)); - - // set the root flag for all root edges - for (int edge_idx = 0; edge_idx < (*edge_cnt); edge_idx++) { - edge_array[edge_idx]->SetRoot(true); - } - } else { // not starting at the root - // preallocate the edge buffer - (*edge_cnt) = max_edge_; - // allocate memory for edges - edge_array = new LangModEdge *[(*edge_cnt)]; - if (edge_array == NULL) { - return NULL; - } - - // get the FanOut edges from the root of each dawg - (*edge_cnt) = FanOut(alt_list, - tess_lm_edge->GetDawg(), - tess_lm_edge->EndEdge(), tess_lm_edge->EdgeMask(), - tess_lm_edge->EdgeString(), false, edge_array); - } - return edge_array; -} - -// generate edges from an NULL terminated string -// (used for punctuation, operators and digits) -int TessLangModel::Edges(const char *strng, const Dawg *dawg, - EDGE_REF edge_ref, EDGE_REF edge_mask, - LangModEdge **edge_array) { - int edge_idx, - edge_cnt = 0; - - for (edge_idx = 0; strng[edge_idx] != 0; edge_idx++) { - int class_id = cntxt_->CharacterSet()->ClassID((char_32)strng[edge_idx]); - if (class_id != INVALID_UNICHAR_ID) { - // create an edge object - edge_array[edge_cnt] = new TessLangModEdge(cntxt_, dawg, edge_ref, - class_id); - if (edge_array[edge_cnt] == NULL) { - return 0; - } - - reinterpret_cast(edge_array[edge_cnt])-> - SetEdgeMask(edge_mask); - edge_cnt++; - } - } - - return edge_cnt; -} - -// generate OOD edges -int TessLangModel::OODEdges(CharAltList *alt_list, EDGE_REF edge_ref, - EDGE_REF edge_ref_mask, LangModEdge **edge_array) { - int class_cnt = cntxt_->CharacterSet()->ClassCount(); - int edge_cnt = 
0; - for (int class_id = 0; class_id < class_cnt; class_id++) { - // produce an OOD edge only if the cost of the char is low enough - if ((alt_list == NULL || - alt_list->ClassCost(class_id) <= max_ood_shape_cost_)) { - // create an edge object - edge_array[edge_cnt] = new TessLangModEdge(cntxt_, class_id); - if (edge_array[edge_cnt] == NULL) { - return 0; - } - - edge_cnt++; - } - } - - return edge_cnt; -} - -// computes and returns the edges that fan out of an edge ref -int TessLangModel::FanOut(CharAltList *alt_list, const Dawg *dawg, - EDGE_REF edge_ref, EDGE_REF edge_mask, - const char_32 *str, bool root_flag, - LangModEdge **edge_array) { - int edge_cnt = 0; - NODE_REF next_node = NO_EDGE; - - // OOD - if (dawg == reinterpret_cast(DAWG_OOD)) { - if (ood_enabled_ == true) { - return OODEdges(alt_list, edge_ref, edge_mask, edge_array); - } else { - return 0; - } - } else if (dawg == reinterpret_cast(DAWG_NUMBER)) { - // Number - if (numeric_enabled_ == true) { - return NumberEdges(edge_ref, edge_array); - } else { - return 0; - } - } else if (IsTrailingPuncEdge(edge_mask)) { - // a TRAILING PUNC MASK, generate more trailing punctuation and return - if (punc_enabled_ == true) { - EDGE_REF trail_cnt = TrailingPuncCount(edge_mask); - return Edges(trail_punc_.c_str(), dawg, edge_ref, - TrailingPuncEdgeMask(trail_cnt + 1), edge_array); - } else { - return 0; - } - } else if (root_flag == true || edge_ref == 0) { - // Root, generate leading punctuation and continue - if (root_flag) { - if (punc_enabled_ == true) { - edge_cnt += Edges(lead_punc_.c_str(), dawg, 0, LEAD_PUNC_EDGE_REF_MASK, - edge_array); - } - } - next_node = 0; - } else { - // a node in the main trie - bool eow_flag = (dawg->end_of_word(edge_ref) != 0); - - // for EOW - if (eow_flag == true) { - // generate trailing punctuation - if (punc_enabled_ == true) { - edge_cnt += Edges(trail_punc_.c_str(), dawg, edge_ref, - TrailingPuncEdgeMask((EDGE_REF)1), edge_array); - // generate a hyphen and go back to 
the root - edge_cnt += Edges("-/", dawg, 0, 0, edge_array + edge_cnt); - } - } - - // advance node - next_node = dawg->next_node(edge_ref); - if (next_node == 0 || next_node == NO_EDGE) { - return edge_cnt; - } - } - - // now get all the emerging edges if word list is enabled - if (word_list_enabled_ == true && next_node != NO_EDGE) { - // create child edges - int child_edge_cnt = - TessLangModEdge::CreateChildren(cntxt_, dawg, next_node, - edge_array + edge_cnt); - int strt_cnt = edge_cnt; - - // set the edge mask - for (int child = 0; child < child_edge_cnt; child++) { - reinterpret_cast(edge_array[edge_cnt++])-> - SetEdgeMask(edge_mask); - } - - // if we are at the root, create upper case forms of these edges if possible - if (root_flag == true) { - for (int child = 0; child < child_edge_cnt; child++) { - TessLangModEdge *child_edge = - reinterpret_cast(edge_array[strt_cnt + child]); - - if (has_case_ == true) { - const char_32 *edge_str = child_edge->EdgeString(); - if (edge_str != NULL && islower(edge_str[0]) != 0 && - edge_str[1] == 0) { - int class_id = - cntxt_->CharacterSet()->ClassID(toupper(edge_str[0])); - if (class_id != INVALID_UNICHAR_ID) { - // generate an upper case edge for lower case chars - edge_array[edge_cnt] = new TessLangModEdge(cntxt_, dawg, - child_edge->StartEdge(), child_edge->EndEdge(), class_id); - - if (edge_array[edge_cnt] != NULL) { - reinterpret_cast(edge_array[edge_cnt])-> - SetEdgeMask(edge_mask); - edge_cnt++; - } - } - } - } - } - } - } - return edge_cnt; -} - -// Generate the edges fanning-out from an edge in the number state machine -int TessLangModel::NumberEdges(EDGE_REF edge_ref, LangModEdge **edge_array) { - EDGE_REF new_state, - state; - - inT64 repeat_cnt, - new_repeat_cnt; - - state = ((edge_ref & NUMBER_STATE_MASK) >> NUMBER_STATE_SHIFT); - repeat_cnt = ((edge_ref & NUMBER_REPEAT_MASK) >> NUMBER_REPEAT_SHIFT); - - if (state < 0 || state >= kStateCnt) { - return 0; - } - - // go through all valid transitions from the 
state - int edge_cnt = 0; - - EDGE_REF new_edge_ref; - - for (int lit = 0; lit < kNumLiteralCnt; lit++) { - // move to the new state - new_state = num_state_machine_[state][lit]; - if (new_state == NUM_TRM) { - continue; - } - - if (new_state == state) { - new_repeat_cnt = repeat_cnt + 1; - } else { - new_repeat_cnt = 1; - } - - // not allowed to repeat beyond this - if (new_repeat_cnt > num_max_repeat_[state]) { - continue; - } - - new_edge_ref = (new_state << NUMBER_STATE_SHIFT) | - (lit << NUMBER_LITERAL_SHIFT) | - (new_repeat_cnt << NUMBER_REPEAT_SHIFT); - - edge_cnt += Edges(literal_str_[lit]->c_str(), number_dawg_, - new_edge_ref, 0, edge_array + edge_cnt); - } - - return edge_cnt; -} - -// Loads Language model elements from contents of the .cube.lm file -bool TessLangModel::LoadLangModelElements(const string &lm_params) { - bool success = true; - // split into lines, each corresponding to a token type below - vector str_vec; - CubeUtils::SplitStringUsing(lm_params, "\r\n", &str_vec); - for (int entry = 0; entry < str_vec.size(); entry++) { - vector tokens; - // should be only two tokens: type and value - CubeUtils::SplitStringUsing(str_vec[entry], "=", &tokens); - if (tokens.size() != 2) - success = false; - if (tokens[0] == "LeadPunc") { - lead_punc_ = tokens[1]; - } else if (tokens[0] == "TrailPunc") { - trail_punc_ = tokens[1]; - } else if (tokens[0] == "NumLeadPunc") { - num_lead_punc_ = tokens[1]; - } else if (tokens[0] == "NumTrailPunc") { - num_trail_punc_ = tokens[1]; - } else if (tokens[0] == "Operators") { - operators_ = tokens[1]; - } else if (tokens[0] == "Digits") { - digits_ = tokens[1]; - } else if (tokens[0] == "Alphas") { - alphas_ = tokens[1]; - } else { - success = false; - } - } - - RemoveInvalidCharacters(&num_lead_punc_); - RemoveInvalidCharacters(&num_trail_punc_); - RemoveInvalidCharacters(&digits_); - RemoveInvalidCharacters(&operators_); - RemoveInvalidCharacters(&alphas_); - - // form the array of literal strings needed for number 
state machine - // It is essential that the literal strings go in the order below - literal_str_[0] = &num_lead_punc_; - literal_str_[1] = &num_trail_punc_; - literal_str_[2] = &digits_; - literal_str_[3] = &operators_; - literal_str_[4] = &alphas_; - - return success; -} - -void TessLangModel::RemoveInvalidCharacters(string *lm_str) { - CharSet *char_set = cntxt_->CharacterSet(); - tesseract::string_32 lm_str32; - CubeUtils::UTF8ToUTF32(lm_str->c_str(), &lm_str32); - - int len = CubeUtils::StrLen(lm_str32.c_str()); - char_32 *clean_str32 = new char_32[len + 1]; - if (!clean_str32) - return; - int clean_len = 0; - for (int i = 0; i < len; ++i) { - int class_id = char_set->ClassID((char_32)lm_str32[i]); - if (class_id != INVALID_UNICHAR_ID) { - clean_str32[clean_len] = lm_str32[i]; - ++clean_len; - } - } - clean_str32[clean_len] = 0; - if (clean_len < len) { - lm_str->clear(); - CubeUtils::UTF32ToUTF8(clean_str32, lm_str); - } - delete [] clean_str32; -} - -int TessLangModel::NumDawgs() const { - return (word_dawgs_ != NULL) ? - word_dawgs_->size() : cntxt_->TesseractObject()->getDict().NumDawgs(); -} - -// Returns the dawgs with the given index from either the dawgs -// stored by the Tesseract object, or the word_dawgs_. -const Dawg *TessLangModel::GetDawg(int index) const { - if (word_dawgs_ != NULL) { - ASSERT_HOST(index < word_dawgs_->size()); - return (*word_dawgs_)[index]; - } else { - ASSERT_HOST(index < cntxt_->TesseractObject()->getDict().NumDawgs()); - return cntxt_->TesseractObject()->getDict().GetDawg(index); - } -} -} diff --git a/cube/tess_lang_model.h b/cube/tess_lang_model.h deleted file mode 100644 index 3e0c63a1..00000000 --- a/cube/tess_lang_model.h +++ /dev/null @@ -1,142 +0,0 @@ -/********************************************************************** - * File: tess_lang_model.h - * Description: Declaration of the Tesseract Language Model Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. 
- ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#ifndef TESS_LANG_MODEL_H -#define TESS_LANG_MODEL_H - -#include - -#include "char_altlist.h" -#include "cube_reco_context.h" -#include "cube_tuning_params.h" -#include "dict.h" -#include "lang_model.h" -#include "tessdatamanager.h" -#include "tess_lang_mod_edge.h" - -namespace tesseract { - -const int kStateCnt = 4; -const int kNumLiteralCnt = 5; - -class TessLangModel : public LangModel { - public: - TessLangModel(const string &lm_params, - const string &data_file_path, - bool load_system_dawg, - TessdataManager *tessdata_manager, - CubeRecoContext *cntxt); - ~TessLangModel() { - if (word_dawgs_ != NULL) { - word_dawgs_->delete_data_pointers(); - delete word_dawgs_; - } - } - - // returns a pointer to the root of the language model - inline TessLangModEdge *Root() { - return NULL; - } - - // The general fan-out generation function. Returns the list of edges - // fanning-out of the specified edge and their count. If an AltList is - // specified, only the class-ids with a minimum cost are considered - LangModEdge **GetEdges(CharAltList *alt_list, - LangModEdge *edge, - int *edge_cnt); - // Determines if a sequence of 32-bit chars is valid in this language model - // starting from the root. If the eow_flag is ON, also checks for - // a valid EndOfWord. 
If final_edge is not NULL, returns a pointer to the last - // edge - bool IsValidSequence(const char_32 *sequence, bool eow_flag, - LangModEdge **final_edge = NULL); - bool IsLeadingPunc(char_32 ch); - bool IsTrailingPunc(char_32 ch); - bool IsDigit(char_32 ch); - - void RemoveInvalidCharacters(string *lm_str); - private: - // static LM state machines - static const Dawg *ood_dawg_; - static const Dawg *number_dawg_; - static const int num_state_machine_[kStateCnt][kNumLiteralCnt]; - static const int num_max_repeat_[kStateCnt]; - // word_dawgs_ should only be loaded if cube has its own version of the - // unicharset (different from the one used by tesseract) and therefore - // can not use the dawgs loaded for tesseract (since the unichar ids - // encoded in the dawgs differ). - DawgVector *word_dawgs_; - - static int max_edge_; - static int max_ood_shape_cost_; - - // remaining language model elements needed by cube. These get loaded from - // the .lm file - string lead_punc_; - string trail_punc_; - string num_lead_punc_; - string num_trail_punc_; - string operators_; - string digits_; - string alphas_; - // String of characters in RHS of each line of .cube.lm - // Each element is hard-coded to correspond to a specific token type - // (see LoadLangModelElements) - string *literal_str_[kNumLiteralCnt]; - // Recognition context needed to access language properties - // (case, cursive,..) 
- CubeRecoContext *cntxt_; - bool has_case_; - - // computes and returns the edges that fan out of an edge ref - int FanOut(CharAltList *alt_list, - const Dawg *dawg, EDGE_REF edge_ref, EDGE_REF edge_ref_mask, - const char_32 *str, bool root_flag, LangModEdge **edge_array); - // generate edges from an NULL terminated string - // (used for punctuation, operators and digits) - int Edges(const char *strng, const Dawg *dawg, - EDGE_REF edge_ref, EDGE_REF edge_ref_mask, - LangModEdge **edge_array); - // Generate the edges fanning-out from an edge in the number state machine - int NumberEdges(EDGE_REF edge_ref, LangModEdge **edge_array); - // Generate OOD edges - int OODEdges(CharAltList *alt_list, EDGE_REF edge_ref, - EDGE_REF edge_ref_mask, LangModEdge **edge_array); - // Cleanup an edge array - void FreeEdges(int edge_cnt, LangModEdge **edge_array); - // Determines if a sequence of 32-bit chars is valid in this language model - // starting from the specified edge. If the eow_flag is ON, also checks for - // a valid EndOfWord. If final_edge is not NULL, returns a pointer to the last - // edge - bool IsValidSequence(LangModEdge *edge, const char_32 *sequence, - bool eow_flag, LangModEdge **final_edge); - // Parse language model elements from the given string, which should - // have been loaded from .cube.lm file, e.g. in CubeRecoContext - bool LoadLangModelElements(const string &lm_params); - - // Returns the number of word Dawgs in the language model. - int NumDawgs() const; - - // Returns the dawgs with the given index from either the dawgs - // stored by the Tesseract object, or the word_dawgs_. 
- const Dawg *GetDawg(int index) const; -}; -} // tesseract - -#endif // TESS_LANG_MODEL_H diff --git a/cube/tuning_params.h b/cube/tuning_params.h deleted file mode 100644 index 4d4943a2..00000000 --- a/cube/tuning_params.h +++ /dev/null @@ -1,129 +0,0 @@ -/********************************************************************** - * File: tuning_params.h - * Description: Declaration of the Tuning Parameters Base Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The TuningParams class abstracts all the parameters that can be learned or -// tuned during the training process. It is a base class that all TuningParams -// classes should inherit from. 
- -#ifndef TUNING_PARAMS_H -#define TUNING_PARAMS_H - -#include -#ifdef USE_STD_NAMESPACE -using std::string; -#endif - -namespace tesseract { -class TuningParams { - public: - enum type_classifer { - NN, - HYBRID_NN - }; - enum type_feature { - BMP, - CHEBYSHEV, - HYBRID - }; - - TuningParams() {} - virtual ~TuningParams() {} - // Accessor functions - inline double RecoWgt() const { return reco_wgt_; } - inline double SizeWgt() const { return size_wgt_; } - inline double CharBigramWgt() const { return char_bigrams_wgt_; } - inline double WordUnigramWgt() const { return word_unigrams_wgt_; } - inline int MaxSegPerChar() const { return max_seg_per_char_; } - inline int BeamWidth() const { return beam_width_; } - inline int TypeClassifier() const { return tp_classifier_; } - inline int TypeFeature() const { return tp_feat_; } - inline int ConvGridSize() const { return conv_grid_size_; } - inline int HistWindWid() const { return hist_wind_wid_; } - inline int MinConCompSize() const { return min_con_comp_size_; } - inline double MaxWordAspectRatio() const { return max_word_aspect_ratio_; } - inline double MinSpaceHeightRatio() const { return min_space_height_ratio_; } - inline double MaxSpaceHeightRatio() const { return max_space_height_ratio_; } - inline double CombinerRunThresh() const { return combiner_run_thresh_; } - inline double CombinerClassifierThresh() const { - return combiner_classifier_thresh_; } - - inline void SetRecoWgt(double wgt) { reco_wgt_ = wgt; } - inline void SetSizeWgt(double wgt) { size_wgt_ = wgt; } - inline void SetCharBigramWgt(double wgt) { char_bigrams_wgt_ = wgt; } - inline void SetWordUnigramWgt(double wgt) { word_unigrams_wgt_ = wgt; } - inline void SetMaxSegPerChar(int max_seg_per_char) { - max_seg_per_char_ = max_seg_per_char; - } - inline void SetBeamWidth(int beam_width) { beam_width_ = beam_width; } - inline void SetTypeClassifier(type_classifer tp_classifier) { - tp_classifier_ = tp_classifier; - } - inline void 
SetTypeFeature(type_feature tp_feat) {tp_feat_ = tp_feat;} - inline void SetHistWindWid(int hist_wind_wid) { - hist_wind_wid_ = hist_wind_wid; - } - - virtual bool Save(string file_name) = 0; - virtual bool Load(string file_name) = 0; - - protected: - // weight of recognition cost. This includes the language model cost - double reco_wgt_; - // weight of size cost - double size_wgt_; - // weight of character bigrams cost - double char_bigrams_wgt_; - // weight of word unigrams cost - double word_unigrams_wgt_; - // Maximum number of segments per character - int max_seg_per_char_; - // Beam width equal to the maximum number of nodes kept in the beam search - // trellis column after pruning - int beam_width_; - // Classifier type: See enum type_classifer for classifier types - type_classifer tp_classifier_; - // Feature types: See enum type_feature for feature types - type_feature tp_feat_; - // Grid size to scale a grapheme bitmap used by the BMP feature type - int conv_grid_size_; - // Histogram window size as a ratio of the word height used in computing - // the vertical pixel density histogram in the segmentation algorithm - int hist_wind_wid_; - // Minimum possible size of a connected component - int min_con_comp_size_; - // Maximum aspect ratio of a word (width / height) - double max_word_aspect_ratio_; - // Minimum ratio relative to the line height of a gap to be considered as - // a word break - double min_space_height_ratio_; - // Maximum ratio relative to the line height of a gap to be considered as - // a definite word break - double max_space_height_ratio_; - // When Cube and Tesseract are run in combined mode, only run - // combiner classifier when tesseract confidence is below this - // threshold. When Cube is run without Tesseract, this is ignored. - double combiner_run_thresh_; - // When Cube and tesseract are run in combined mode, threshold on - // output of combiner binary classifier (chosen from ROC during - // combiner training). 
When Cube is run without Tesseract, this is ignored. - double combiner_classifier_thresh_; -}; -} - -#endif // TUNING_PARAMS_H diff --git a/cube/word_altlist.cpp b/cube/word_altlist.cpp deleted file mode 100644 index d6775360..00000000 --- a/cube/word_altlist.cpp +++ /dev/null @@ -1,125 +0,0 @@ -/********************************************************************** - * File: word_altlist.cpp - * Description: Implementation of the Word Alternate List Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -#include "word_altlist.h" - -namespace tesseract { -WordAltList::WordAltList(int max_alt) - : AltList(max_alt) { - word_alt_ = NULL; -} - -WordAltList::~WordAltList() { - if (word_alt_ != NULL) { - for (int alt_idx = 0; alt_idx < alt_cnt_; alt_idx++) { - if (word_alt_[alt_idx] != NULL) { - delete []word_alt_[alt_idx]; - } - } - delete []word_alt_; - word_alt_ = NULL; - } -} - -/** - * insert an alternate word with the specified cost and tag - */ -bool WordAltList::Insert(char_32 *word_str, int cost, void *tag) { - if (word_alt_ == NULL || alt_cost_ == NULL) { - word_alt_ = new char_32*[max_alt_]; - alt_cost_ = new int[max_alt_]; - alt_tag_ = new void *[max_alt_]; - - if (word_alt_ == NULL || alt_cost_ == NULL || alt_tag_ == NULL) { - return false; - } - - memset(alt_tag_, 0, max_alt_ * sizeof(*alt_tag_)); - } else { - // check if alt already exists - for (int alt_idx = 0; alt_idx < alt_cnt_; alt_idx++) { - if (CubeUtils::StrCmp(word_str, word_alt_[alt_idx]) == 0) { - // update the cost if we have a lower one - if (cost < alt_cost_[alt_idx]) { - alt_cost_[alt_idx] = cost; - alt_tag_[alt_idx] = tag; - } - return true; - } - } - } - - // determine length of alternate - int len = CubeUtils::StrLen(word_str); - - word_alt_[alt_cnt_] = new char_32[len + 1]; - if (word_alt_[alt_cnt_] == NULL) { - return false; - } - - if (len > 0) { - memcpy(word_alt_[alt_cnt_], word_str, len * sizeof(*word_str)); - } - - word_alt_[alt_cnt_][len] = 0; - alt_cost_[alt_cnt_] = cost; - alt_tag_[alt_cnt_] = tag; - - alt_cnt_++; - - return true; -} - -/** - * sort the alternate in descending order based on the cost - */ -void WordAltList::Sort() { - for (int alt_idx = 0; alt_idx < alt_cnt_; alt_idx++) { - for (int alt = alt_idx + 1; alt < alt_cnt_; alt++) { - if (alt_cost_[alt_idx] > alt_cost_[alt]) { - char_32 *pchTemp = word_alt_[alt_idx]; - word_alt_[alt_idx] = word_alt_[alt]; - word_alt_[alt] = pchTemp; - - 
int temp = alt_cost_[alt_idx]; - alt_cost_[alt_idx] = alt_cost_[alt]; - alt_cost_[alt] = temp; - - void *tag = alt_tag_[alt_idx]; - alt_tag_[alt_idx] = alt_tag_[alt]; - alt_tag_[alt] = tag; - } - } - } -} - -void WordAltList::PrintDebug() { - for (int alt_idx = 0; alt_idx < alt_cnt_; alt_idx++) { - char_32 *word_32 = word_alt_[alt_idx]; - string word_str; - CubeUtils::UTF32ToUTF8(word_32, &word_str); - int num_unichars = CubeUtils::StrLen(word_32); - fprintf(stderr, "Alt[%d]=%s (cost=%d, num_unichars=%d); unichars=", alt_idx, - word_str.c_str(), alt_cost_[alt_idx], num_unichars); - for (int i = 0; i < num_unichars; ++i) - fprintf(stderr, "%d ", word_32[i]); - fprintf(stderr, "\n"); - } -} -} // namespace tesseract diff --git a/cube/word_altlist.h b/cube/word_altlist.h deleted file mode 100644 index 7b1620fe..00000000 --- a/cube/word_altlist.h +++ /dev/null @@ -1,50 +0,0 @@ -/********************************************************************** - * File: word_altlist.h - * Description: Declaration of the Word Alternate List Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The WordAltList abstracts a alternate list of words and their corresponding -// costs that result from the word recognition process. 
The class inherits -// from the AltList class -// It provides methods to add a new word alternate, its corresponding score and -// a tag. - -#ifndef WORD_ALT_LIST_H -#define WORD_ALT_LIST_H - -#include "altlist.h" - -namespace tesseract { -class WordAltList : public AltList { - public: - explicit WordAltList(int max_alt); - ~WordAltList(); - // Sort the list of alternates based on cost - void Sort(); - // insert an alternate word with the specified cost and tag - bool Insert(char_32 *char_ptr, int cost, void *tag = NULL); - // returns the alternate string at the specified position - inline char_32 * Alt(int alt_idx) { return word_alt_[alt_idx]; } - // print each entry of the altlist, both UTF8 and unichar ids, and - // their costs, to stderr - void PrintDebug(); - private: - char_32 **word_alt_; -}; -} // namespace tesseract - -#endif // WORD_ALT_LIST_H diff --git a/cube/word_list_lang_model.cpp b/cube/word_list_lang_model.cpp deleted file mode 100644 index 67a6a5a9..00000000 --- a/cube/word_list_lang_model.cpp +++ /dev/null @@ -1,205 +0,0 @@ -/********************************************************************** - * File: word_list_lang_model.cpp - * Description: Implementation of the Word List Language Model Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -#include -#include -#include "word_list_lang_model.h" -#include "cube_utils.h" - -#include "ratngs.h" -#include "trie.h" - -namespace tesseract { -WordListLangModel::WordListLangModel(CubeRecoContext *cntxt) { - cntxt_ = cntxt; - dawg_ = NULL; - init_ = false; -} - -WordListLangModel::~WordListLangModel() { - Cleanup(); -} - -// Cleanup -void WordListLangModel::Cleanup() { - if (dawg_ != NULL) { - delete dawg_; - dawg_ = NULL; - } - init_ = false; -} - -// Initialize the language model -bool WordListLangModel::Init() { - if (init_ == true) { - return true; - } - // The last parameter to the Trie constructor (the debug level) is set to - // false for now, until Cube has a way to express its preferred debug level. - dawg_ = new Trie(DAWG_TYPE_WORD, "", NO_PERM, - cntxt_->CharacterSet()->ClassCount(), false); - if (dawg_ == NULL) { - return false; - } - init_ = true; - return true; -} - -// return a pointer to the root -LangModEdge * WordListLangModel::Root() { - return NULL; -} - -// return the edges emerging from the current state -LangModEdge **WordListLangModel::GetEdges(CharAltList *alt_list, - LangModEdge *edge, - int *edge_cnt) { - // initialize if necessary - if (init_ == false) { - if (Init() == false) { - return NULL; - } - } - - (*edge_cnt) = 0; - - EDGE_REF edge_ref; - - TessLangModEdge *tess_lm_edge = reinterpret_cast(edge); - - if (tess_lm_edge == NULL) { - edge_ref = 0; - } else { - edge_ref = tess_lm_edge->EndEdge(); - - // advance node - edge_ref = dawg_->next_node(edge_ref); - if (edge_ref == 0) { - return NULL; - } - } - - // allocate memory for edges - LangModEdge **edge_array = new LangModEdge *[kMaxEdge]; - if (edge_array == NULL) { - return NULL; - } - - // now get all the emerging edges - (*edge_cnt) += TessLangModEdge::CreateChildren(cntxt_, dawg_, edge_ref, - edge_array + (*edge_cnt)); - - return edge_array; -} - -// returns true if the char_32 is supported by 
the language model -// TODO(ahmadab) currently not implemented -bool WordListLangModel::IsValidSequence(const char_32 *sequence, - bool terminal, LangModEdge **edges) { - return false; -} - -// Recursive helper function for WordVariants(). -void WordListLangModel::WordVariants(const CharSet &char_set, - string_32 prefix_str32, - WERD_CHOICE *word_so_far, - string_32 str32, - vector *word_variants) { - int str_len = str32.length(); - if (str_len == 0) { - if (word_so_far->length() > 0) { - word_variants->push_back(new WERD_CHOICE(*word_so_far)); - } - } else { - // Try out all the possible prefixes of the str32. - for (int len = 1; len <= str_len; len++) { - // Check if prefix is supported in character set. - string_32 str_pref32 = str32.substr(0, len); - int class_id = char_set.ClassID(reinterpret_cast( - str_pref32.c_str())); - if (class_id <= 0) { - continue; - } else { - string_32 new_prefix_str32 = prefix_str32 + str_pref32; - string_32 new_str32 = str32.substr(len); - word_so_far->append_unichar_id(class_id, 1, 0.0, 0.0); - WordVariants(char_set, new_prefix_str32, word_so_far, new_str32, - word_variants); - word_so_far->remove_last_unichar_id(); - } - } - } -} - -// Compute all the variants of a 32-bit string in terms of the class-ids -// This is needed for languages that have ligatures. 
A word can then have more -// than one spelling in terms of the class-ids -void WordListLangModel::WordVariants(const CharSet &char_set, - const UNICHARSET *uchset, string_32 str32, - vector *word_variants) { - for (int i = 0; i < word_variants->size(); i++) { - delete (*word_variants)[i]; - } - word_variants->clear(); - string_32 prefix_str32; - WERD_CHOICE word_so_far(uchset); - WordVariants(char_set, prefix_str32, &word_so_far, str32, word_variants); -} - -// add a new UTF-8 string to the lang model -bool WordListLangModel::AddString(const char *char_ptr) { - if (!init_ && !Init()) { // initialize if necessary - return false; - } - - string_32 str32; - CubeUtils::UTF8ToUTF32(char_ptr, &str32); - if (str32.length() < 1) { - return false; - } - return AddString32(str32.c_str()); -} - -// add a new UTF-32 string to the lang model -bool WordListLangModel::AddString32(const char_32 *char_32_ptr) { - if (char_32_ptr == NULL) { - return false; - } - // get all the word variants - vector word_variants; - WordVariants(*(cntxt_->CharacterSet()), cntxt_->TessUnicharset(), - char_32_ptr, &word_variants); - - if (word_variants.size() > 0) { - // find the shortest variant - int shortest_word = 0; - for (int word = 1; word < word_variants.size(); word++) { - if (word_variants[shortest_word]->length() > - word_variants[word]->length()) { - shortest_word = word; - } - } - // only add the shortest grapheme interpretation of string to the word list - dawg_->add_word_to_dawg(*word_variants[shortest_word]); - } - for (int i = 0; i < word_variants.size(); i++) { delete word_variants[i]; } - return true; -} - -} diff --git a/cube/word_list_lang_model.h b/cube/word_list_lang_model.h deleted file mode 100644 index 099d6294..00000000 --- a/cube/word_list_lang_model.h +++ /dev/null @@ -1,89 +0,0 @@ -/********************************************************************** - * File: word_list_lang_model.h - * Description: Declaration of the Word List Language Model Class - * Author: Ahmad 
Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The WordListLangModel class abstracts a language model that is based on -// a list of words. It inherits from the LangModel abstract class -// Besides providing the methods inherited from the LangModel abstract class, -// the class provided methods to add new strings to the Language Model: -// AddString & AddString32 - -#ifndef WORD_LIST_LANG_MODEL_H -#define WORD_LIST_LANG_MODEL_H - -#include - -#include "cube_reco_context.h" -#include "lang_model.h" -#include "tess_lang_mod_edge.h" - -namespace tesseract { - -class Trie; - -class WordListLangModel : public LangModel { - public: - explicit WordListLangModel(CubeRecoContext *cntxt); - ~WordListLangModel(); - // Returns an edge pointer to the Root - LangModEdge *Root(); - // Returns the edges that fan-out of the specified edge and their count - LangModEdge **GetEdges(CharAltList *alt_list, - LangModEdge *edge, - int *edge_cnt); - // Returns is a sequence of 32-bit characters are valid within this language - // model or net. And EndOfWord flag is specified. If true, the sequence has - // to end on a valid word. 
The function also optionally returns the list - // of language model edges traversed to parse the string - bool IsValidSequence(const char_32 *sequence, - bool eow_flag, - LangModEdge **edges); - bool IsLeadingPunc(char_32 ch) { return false; } // not yet implemented - bool IsTrailingPunc(char_32 ch) { return false; } // not yet implemented - bool IsDigit(char_32 ch) { return false; } // not yet implemented - // Adds a new UTF-8 string to the language model - bool AddString(const char *char_ptr); - // Adds a new UTF-32 string to the language model - bool AddString32(const char_32 *char_32_ptr); - // Compute all the variants of a 32-bit string in terms of the class-ids. - // This is needed for languages that have ligatures. A word can then have - // more than one spelling in terms of the class-ids. - static void WordVariants(const CharSet &char_set, const UNICHARSET *uchset, - string_32 str32, - vector *word_variants); - private: - // constants needed to configure the language model - static const int kMaxEdge = 512; - - CubeRecoContext *cntxt_; - Trie *dawg_; - bool init_; - // Initialize the language model - bool Init(); - // Cleanup - void Cleanup(); - // Recursive helper function for WordVariants(). - static void WordVariants( - const CharSet &char_set, - string_32 prefix_str32, WERD_CHOICE *word_so_far, - string_32 str32, - vector *word_variants); -}; -} // tesseract - -#endif // WORD_LIST_LANG_MODEL_H diff --git a/cube/word_size_model.cpp b/cube/word_size_model.cpp deleted file mode 100644 index 6b9a4530..00000000 --- a/cube/word_size_model.cpp +++ /dev/null @@ -1,301 +0,0 @@ -/********************************************************************** - * File: word_size_model.cpp - * Description: Implementation of the Word Size Model Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -#include -#include -#include -#include "word_size_model.h" -#include "cube_utils.h" - -namespace tesseract { - -WordSizeModel::WordSizeModel(CharSet * char_set, bool contextual) { - char_set_ = char_set; - contextual_ = contextual; -} - -WordSizeModel::~WordSizeModel() { - for (int fnt = 0; fnt < font_pair_size_models_.size(); fnt++) { - FontPairSizeInfo fnt_info = font_pair_size_models_[fnt]; - delete []fnt_info.pair_size_info[0]; - delete []fnt_info.pair_size_info; - } -} - -WordSizeModel *WordSizeModel::Create(const string &data_file_path, - const string &lang, - CharSet *char_set, - bool contextual) { - WordSizeModel *obj = new WordSizeModel(char_set, contextual); - if (!obj) { - fprintf(stderr, "Cube ERROR (WordSizeModel::Create): unable to allocate " - "new word size model object\n"); - return NULL; - } - - if (!obj->Init(data_file_path, lang)) { - delete obj; - return NULL; - } - return obj; -} - -bool WordSizeModel::Init(const string &data_file_path, const string &lang) { - string stats_file_name; - stats_file_name = data_file_path + lang; - stats_file_name += ".cube.size"; - - // read file to memory - string str_data; - - if (!CubeUtils::ReadFileToString(stats_file_name, &str_data)) { - return false; - } - - // split to words - vector tokens; - CubeUtils::SplitStringUsing(str_data, "\t\r\n", &tokens); - if (tokens.size() < 1) { - fprintf(stderr, "Cube ERROR (WordSizeModel::Init): invalid " - "file contents: %s\n", stats_file_name.c_str()); - return false; - } - 
- font_pair_size_models_.clear(); - - // token count per line depends on whether the language is contextual or not - int token_cnt = contextual_ ? - (kExpectedTokenCount + 4) : kExpectedTokenCount; - // the count of size classes depends on whether the language is contextual - // or not. For non contextual languages (Ex: Eng), it is equal to the class - // count. For contextual languages (Ex: Ara), it is equal to the class count - // multiplied by the position count (4: start, middle, final, isolated) - int size_class_cnt = contextual_ ? - (char_set_->ClassCount() * 4) : char_set_->ClassCount(); - string fnt_name = ""; - - for (int tok = 0; tok < tokens.size(); tok += token_cnt) { - // a new font, write the old font data and re-init - if (tok == 0 || fnt_name != tokens[tok]) { - FontPairSizeInfo fnt_info; - - fnt_info.pair_size_info = new PairSizeInfo *[size_class_cnt]; - if (!fnt_info.pair_size_info) { - fprintf(stderr, "Cube ERROR (WordSizeModel::Init): error allcoating " - "memory for font pair size info\n"); - return false; - } - - fnt_info.pair_size_info[0] = - new PairSizeInfo[size_class_cnt * size_class_cnt]; - if (!fnt_info.pair_size_info[0]) { - fprintf(stderr, "Cube ERROR (WordSizeModel::Init): error allocating " - "memory for font pair size info\n"); - return false; - } - - memset(fnt_info.pair_size_info[0], 0, size_class_cnt * size_class_cnt * - sizeof(PairSizeInfo)); - - for (int cls = 1; cls < size_class_cnt; cls++) { - fnt_info.pair_size_info[cls] = - fnt_info.pair_size_info[cls - 1] + size_class_cnt; - } - - // strip out path and extension - string stripped_font_name = tokens[tok].substr(0, tokens[tok].find('.')); - string::size_type strt_pos = stripped_font_name.find_last_of("/\\"); - if (strt_pos != string::npos) { - fnt_info.font_name = stripped_font_name.substr(strt_pos); - } else { - fnt_info.font_name = stripped_font_name; - } - font_pair_size_models_.push_back(fnt_info); - } - - // parse the data - int cls_0; - int cls_1; - double delta_top; - 
double wid_0; - double hgt_0; - double wid_1; - double hgt_1; - int size_code_0; - int size_code_1; - - // read and parse the tokens - if (contextual_) { - int start_0; - int end_0; - int start_1; - int end_1; - // The expected format for a character size bigram is as follows: - // ClassId0Start-flag0End-flag0String0(ignored) - // Width0Height0 - // ClassId1Start-flag1End-flag1String1(ignored) - // HeightDeltaWidth1Height0 - // In case of non-contextual languages, the Start and End flags are - // omitted - if (sscanf(tokens[tok + 1].c_str(), "%d", &cls_0) != 1 || - sscanf(tokens[tok + 2].c_str(), "%d", &start_0) != 1 || - sscanf(tokens[tok + 3].c_str(), "%d", &end_0) != 1 || - sscanf(tokens[tok + 5].c_str(), "%lf", &wid_0) != 1 || - sscanf(tokens[tok + 6].c_str(), "%lf", &hgt_0) != 1 || - sscanf(tokens[tok + 7].c_str(), "%d", &cls_1) != 1 || - sscanf(tokens[tok + 8].c_str(), "%d", &start_1) != 1 || - sscanf(tokens[tok + 9].c_str(), "%d", &end_1) != 1 || - sscanf(tokens[tok + 11].c_str(), "%lf", &delta_top) != 1 || - sscanf(tokens[tok + 12].c_str(), "%lf", &wid_1) != 1 || - sscanf(tokens[tok + 13].c_str(), "%lf", &hgt_1) != 1 || - (start_0 != 0 && start_0 != 1) || (end_0 != 0 && end_0 != 1) || - (start_1 != 0 && start_1 != 1) || (end_1 != 0 && end_1 != 1)) { - fprintf(stderr, "Cube ERROR (WordSizeModel::Init): bad format at " - "line %d\n", 1 + (tok / token_cnt)); - return false; - } - size_code_0 = SizeCode(cls_0, start_0, end_0); - size_code_1 = SizeCode(cls_1, start_1, end_1); - } else { - if (sscanf(tokens[tok + 1].c_str(), "%d", &cls_0) != 1 || - sscanf(tokens[tok + 3].c_str(), "%lf", &wid_0) != 1 || - sscanf(tokens[tok + 4].c_str(), "%lf", &hgt_0) != 1 || - sscanf(tokens[tok + 5].c_str(), "%d", &cls_1) != 1 || - sscanf(tokens[tok + 7].c_str(), "%lf", &delta_top) != 1 || - sscanf(tokens[tok + 8].c_str(), "%lf", &wid_1) != 1 || - sscanf(tokens[tok + 9].c_str(), "%lf", &hgt_1) != 1) { - fprintf(stderr, "Cube ERROR (WordSizeModel::Init): bad format at " - "line 
%d\n", 1 + (tok / token_cnt)); - return false; - } - size_code_0 = cls_0; - size_code_1 = cls_1; - } - - // copy the data to the size tables - FontPairSizeInfo fnt_info = font_pair_size_models_.back(); - fnt_info.pair_size_info[size_code_0][size_code_1].delta_top = - static_cast(delta_top * kShapeModelScale); - fnt_info.pair_size_info[size_code_0][size_code_1].wid_0 = - static_cast(wid_0 * kShapeModelScale); - fnt_info.pair_size_info[size_code_0][size_code_1].hgt_0 = - static_cast(hgt_0 * kShapeModelScale); - fnt_info.pair_size_info[size_code_0][size_code_1].wid_1 = - static_cast(wid_1 * kShapeModelScale); - fnt_info.pair_size_info[size_code_0][size_code_1].hgt_1 = - static_cast(hgt_1 * kShapeModelScale); - - fnt_name = tokens[tok]; - } - - return true; -} - -int WordSizeModel::Cost(CharSamp **samp_array, int samp_cnt) const { - if (samp_cnt < 2) { - return 0; - } - double best_dist = static_cast(WORST_COST); - int best_fnt = -1; - for (int fnt = 0; fnt < font_pair_size_models_.size(); fnt++) { - const FontPairSizeInfo *fnt_info = &font_pair_size_models_[fnt]; - double mean_dist = 0; - int pair_cnt = 0; - - for (int smp_0 = 0; smp_0 < samp_cnt; smp_0++) { - int cls_0 = char_set_->ClassID(samp_array[smp_0]->StrLabel()); - if (cls_0 < 1) { - continue; - } - // compute size code for samp 0 based on class id and position - int size_code_0; - if (contextual_) { - size_code_0 = SizeCode(cls_0, - samp_array[smp_0]->FirstChar() == 0 ? 0 : 1, - samp_array[smp_0]->LastChar() == 0 ? 
0 : 1); - } else { - size_code_0 = cls_0; - } - - int char0_height = samp_array[smp_0]->Height(); - int char0_width = samp_array[smp_0]->Width(); - int char0_top = samp_array[smp_0]->Top(); - - for (int smp_1 = smp_0 + 1; smp_1 < samp_cnt; smp_1++) { - int cls_1 = char_set_->ClassID(samp_array[smp_1]->StrLabel()); - if (cls_1 < 1) { - continue; - } - // compute size code for samp 0 based on class id and position - int size_code_1; - if (contextual_) { - size_code_1 = SizeCode(cls_1, - samp_array[smp_1]->FirstChar() == 0 ? 0 : 1, - samp_array[smp_1]->LastChar() == 0 ? 0 : 1); - } else { - size_code_1 = cls_1; - } - double dist = PairCost( - char0_width, char0_height, char0_top, samp_array[smp_1]->Width(), - samp_array[smp_1]->Height(), samp_array[smp_1]->Top(), - fnt_info->pair_size_info[size_code_0][size_code_1]); - if (dist > 0) { - mean_dist += dist; - pair_cnt++; - } - } // smp_1 - } // smp_0 - if (pair_cnt == 0) { - continue; - } - mean_dist /= pair_cnt; - if (best_fnt == -1 || mean_dist < best_dist) { - best_dist = mean_dist; - best_fnt = fnt; - } - } - if (best_fnt == -1) { - return static_cast(WORST_COST); - } else { - return static_cast(best_dist); - } -} - -double WordSizeModel::PairCost(int width_0, int height_0, int top_0, - int width_1, int height_1, int top_1, - const PairSizeInfo& pair_info) { - double scale_factor = static_cast(pair_info.hgt_0) / - static_cast(height_0); - double dist = 0.0; - if (scale_factor > 0) { - double norm_width_0 = width_0 * scale_factor; - double norm_width_1 = width_1 * scale_factor; - double norm_height_1 = height_1 * scale_factor; - double norm_delta_top = (top_1 - top_0) * scale_factor; - - // accumulate the distance between the model character and the - // predicted one on all dimensions of the pair - dist += fabs(pair_info.wid_0 - norm_width_0); - dist += fabs(pair_info.wid_1 - norm_width_1); - dist += fabs(pair_info.hgt_1 - norm_height_1); - dist += fabs(pair_info.delta_top - norm_delta_top); - } - return dist; -} -} 
// namespace tesseract diff --git a/cube/word_size_model.h b/cube/word_size_model.h deleted file mode 100644 index fade595b..00000000 --- a/cube/word_size_model.h +++ /dev/null @@ -1,100 +0,0 @@ -/********************************************************************** - * File: word_size_model.h - * Description: Declaration of the Word Size Model Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The WordSizeModel class abstracts the geometrical relationships -// between characters/shapes in the same word (presumeably of the same font) -// A non-parametric bigram model describes the three geometrical properties of a -// character pair: -// 1- Normalized Width -// 2- Normalized Top -// 3- Normalized Height -// These dimensions are computed for each character pair in a word. These are -// then compared to the same information for each of the fonts that the size -// model knows about. The WordSizeCost is the cost of the font that matches -// best. 
- -#ifndef WORD_SIZE_MODEL_H -#define WORD_SIZE_MODEL_H - -#include -#include "char_samp.h" -#include "char_set.h" - -namespace tesseract { -struct PairSizeInfo { - int delta_top; - int wid_0; - int hgt_0; - int wid_1; - int hgt_1; -}; - -struct FontPairSizeInfo { - string font_name; - PairSizeInfo **pair_size_info; -}; - -class WordSizeModel { - public: - WordSizeModel(CharSet *, bool contextual); - virtual ~WordSizeModel(); - static WordSizeModel *Create(const string &data_file_path, - const string &lang, - CharSet *char_set, - bool contextual); - // Given a word and number of unichars, return the size cost, - // minimized over all fonts in the size model. - int Cost(CharSamp **samp_array, int samp_cnt) const; - // Given dimensions of a pair of character samples and a font size - // model for that character pair, return the pair's size cost for - // the font. - static double PairCost(int width_0, int height_0, int top_0, - int width_1, int height_1, int top_1, - const PairSizeInfo& pair_info); - bool Save(string file_name); - // Number of fonts in size model. 
- inline int FontCount() const { - return font_pair_size_models_.size(); - } - inline const FontPairSizeInfo *FontInfo() const { - return &font_pair_size_models_[0]; - } - // Helper functions to convert between size codes, class id and position - // codes - static inline int SizeCode(int cls_id, int start, int end) { - return (cls_id << 2) + (end << 1) + start; - } - - private: - // Scaling constant used to convert floating point ratios in size table - // to fixed point - static const int kShapeModelScale = 1000; - static const int kExpectedTokenCount = 10; - - // Language properties - bool contextual_; - CharSet *char_set_; - // Size ratios table - vector font_pair_size_models_; - - // Initialize the word size model object - bool Init(const string &data_file_path, const string &lang); -}; -} -#endif // WORD_SIZE_MODEL_H diff --git a/cube/word_unigrams.cpp b/cube/word_unigrams.cpp deleted file mode 100644 index 70cc9ee1..00000000 --- a/cube/word_unigrams.cpp +++ /dev/null @@ -1,271 +0,0 @@ -/********************************************************************** - * File: word_unigrams.cpp - * Description: Implementation of the Word Unigrams Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. - ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. 
- * - **********************************************************************/ - -#include -#include -#include -#include - -#include "const.h" -#include "cube_utils.h" -#include "ndminx.h" -#include "word_unigrams.h" - -namespace tesseract { - -WordUnigrams::WordUnigrams() { - costs_ = NULL; - words_ = NULL; - word_cnt_ = 0; -} - -WordUnigrams::~WordUnigrams() { - if (words_ != NULL) { - if (words_[0] != NULL) { - delete []words_[0]; - } - - delete []words_; - words_ = NULL; - } - - if (costs_ != NULL) { - delete []costs_; - } -} - -/** - * Load the word-list and unigrams from file and create an object - * The word list is assumed to be sorted in lexicographic order. - */ -WordUnigrams *WordUnigrams::Create(const string &data_file_path, - const string &lang) { - string file_name; - string str; - - file_name = data_file_path + lang; - file_name += ".cube.word-freq"; - - // load the string into memory - if (CubeUtils::ReadFileToString(file_name, &str) == false) { - return NULL; - } - - // split into lines - vector str_vec; - CubeUtils::SplitStringUsing(str, "\r\n \t", &str_vec); - if (str_vec.size() < 2) { - return NULL; - } - - // allocate memory - WordUnigrams *word_unigrams_obj = new WordUnigrams(); - if (word_unigrams_obj == NULL) { - fprintf(stderr, "Cube ERROR (WordUnigrams::Create): could not create " - "word unigrams object.\n"); - return NULL; - } - - int full_len = str.length(); - int word_cnt = str_vec.size() / 2; - word_unigrams_obj->words_ = new char*[word_cnt]; - word_unigrams_obj->costs_ = new int[word_cnt]; - - if (word_unigrams_obj->words_ == NULL || - word_unigrams_obj->costs_ == NULL) { - fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error allocating " - "word unigram fields.\n"); - delete word_unigrams_obj; - return NULL; - } - - word_unigrams_obj->words_[0] = new char[full_len]; - if (word_unigrams_obj->words_[0] == NULL) { - fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error allocating " - "word unigram fields.\n"); - delete 
word_unigrams_obj; - return NULL; - } - - // construct sorted list of words and costs - word_unigrams_obj->word_cnt_ = 0; - char *char_buff = word_unigrams_obj->words_[0]; - word_cnt = 0; - int max_cost = 0; - - for (int wrd = 0; wrd < str_vec.size(); wrd += 2) { - word_unigrams_obj->words_[word_cnt] = char_buff; - - strcpy(char_buff, str_vec[wrd].c_str()); - char_buff += (str_vec[wrd].length() + 1); - - if (sscanf(str_vec[wrd + 1].c_str(), "%d", - word_unigrams_obj->costs_ + word_cnt) != 1) { - fprintf(stderr, "Cube ERROR (WordUnigrams::Create): error reading " - "word unigram data.\n"); - delete word_unigrams_obj; - return NULL; - } - // update max cost - max_cost = MAX(max_cost, word_unigrams_obj->costs_[word_cnt]); - word_cnt++; - } - word_unigrams_obj->word_cnt_ = word_cnt; - - // compute the not-in-list-cost by assuming that a word not in the list - // [ahmadab]: This can be computed as follows: - // - Given that the distribution of words follow Zipf's law: - // (F = K / (rank ^ S)), where s is slightly > 1.0 - // - Number of words in the list is N - // - The mean frequency of a word that did not appear in the list is the - // area under the rest of the Zipf's curve divided by 2 (the mean) - // - The area would be the bound integral from N to infinity = - // (K * S) / (N ^ (S + 1)) ~= K / (N ^ 2) - // - Given that cost = -LOG(prob), the cost of an unlisted word would be - // = max_cost + 2*LOG(N) - word_unigrams_obj->not_in_list_cost_ = max_cost + - (2 * CubeUtils::Prob2Cost(1.0 / word_cnt)); - // success - return word_unigrams_obj; -} - -/** - * Split input into space-separated tokens, strip trailing punctuation - * from each, determine case properties, call UTF-8 flavor of cost - * function on each word, and aggregate all into single mean word - * cost. 
- */ -int WordUnigrams::Cost(const char_32 *key_str32, - LangModel *lang_mod, - CharSet *char_set) const { - if (!key_str32) - return 0; - // convert string to UTF8 to split into space-separated words - string key_str; - CubeUtils::UTF32ToUTF8(key_str32, &key_str); - vector words; - CubeUtils::SplitStringUsing(key_str, " \t", &words); - - // no words => no cost - if (words.size() <= 0) { - return 0; - } - - // aggregate the costs of all the words - int cost = 0; - for (int word_idx = 0; word_idx < words.size(); word_idx++) { - // convert each word back to UTF32 for analyzing case and punctuation - string_32 str32; - CubeUtils::UTF8ToUTF32(words[word_idx].c_str(), &str32); - int len = CubeUtils::StrLen(str32.c_str()); - - // strip all trailing punctuation - string clean_str; - int clean_len = len; - bool trunc = false; - while (clean_len > 0 && - lang_mod->IsTrailingPunc(str32.c_str()[clean_len - 1])) { - --clean_len; - trunc = true; - } - - // If either the original string was not truncated (no trailing - // punctuation) or the entire string was removed (all characters - // are trailing punctuation), evaluate original word as is; - // otherwise, copy all but the trailing punctuation characters - char_32 *clean_str32 = NULL; - if (clean_len == 0 || !trunc) { - clean_str32 = CubeUtils::StrDup(str32.c_str()); - } else { - clean_str32 = new char_32[clean_len + 1]; - for (int i = 0; i < clean_len; ++i) { - clean_str32[i] = str32[i]; - } - clean_str32[clean_len] = '\0'; - } - ASSERT_HOST(clean_str32 != NULL); - - string str8; - CubeUtils::UTF32ToUTF8(clean_str32, &str8); - int word_cost = CostInternal(str8.c_str()); - - // if case invariant, get costs of all-upper-case and all-lower-case - // versions and return the min cost - if (clean_len >= kMinLengthNumOrCaseInvariant && - CubeUtils::IsCaseInvariant(clean_str32, char_set)) { - char_32 *lower_32 = CubeUtils::ToLower(clean_str32, char_set); - if (lower_32) { - string lower_8; - CubeUtils::UTF32ToUTF8(lower_32, 
&lower_8); - word_cost = MIN(word_cost, CostInternal(lower_8.c_str())); - delete [] lower_32; - } - char_32 *upper_32 = CubeUtils::ToUpper(clean_str32, char_set); - if (upper_32) { - string upper_8; - CubeUtils::UTF32ToUTF8(upper_32, &upper_8); - word_cost = MIN(word_cost, CostInternal(upper_8.c_str())); - delete [] upper_32; - } - } - - if (clean_len >= kMinLengthNumOrCaseInvariant) { - // if characters are all numeric, incur 0 word cost - bool is_numeric = true; - for (int i = 0; i < clean_len; ++i) { - if (!lang_mod->IsDigit(clean_str32[i])) - is_numeric = false; - } - if (is_numeric) - word_cost = 0; - } - delete [] clean_str32; - cost += word_cost; - } // word_idx - - // return the mean cost - return static_cast(cost / static_cast(words.size())); -} - -/** - * Search for UTF-8 string using binary search of sorted words_ array. - */ -int WordUnigrams::CostInternal(const char *key_str) const { - if (strlen(key_str) == 0) - return not_in_list_cost_; - int hi = word_cnt_ - 1; - int lo = 0; - while (lo <= hi) { - int current = (hi + lo) / 2; - int comp = strcmp(key_str, words_[current]); - // a match - if (comp == 0) { - return costs_[current]; - } - if (comp < 0) { - // go lower - hi = current - 1; - } else { - // go higher - lo = current + 1; - } - } - return not_in_list_cost_; -} -} // namespace tesseract diff --git a/cube/word_unigrams.h b/cube/word_unigrams.h deleted file mode 100644 index 4fcfd452..00000000 --- a/cube/word_unigrams.h +++ /dev/null @@ -1,69 +0,0 @@ - /********************************************************************** - * File: word_unigrams.h - * Description: Declaration of the Word Unigrams Class - * Author: Ahmad Abdulkader - * Created: 2008 - * - * (C) Copyright 2008, Google Inc. - ** Licensed under the Apache License, Version 2.0 (the "License"); - ** you may not use this file except in compliance with the License. 
- ** You may obtain a copy of the License at - ** http://www.apache.org/licenses/LICENSE-2.0 - ** Unless required by applicable law or agreed to in writing, software - ** distributed under the License is distributed on an "AS IS" BASIS, - ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. - ** See the License for the specific language governing permissions and - ** limitations under the License. - * - **********************************************************************/ - -// The WordUnigram class holds the unigrams of the most frequent set of words -// in a language. It is an optional component of the Cube OCR engine. If -// present, the unigram cost of a word is aggregated with the other costs -// (Recognition, Language Model, Size) to compute a cost for a word. -// The word list is assumed to be sorted in lexicographic order. - -#ifndef WORD_UNIGRAMS_H -#define WORD_UNIGRAMS_H - -#include -#include "char_set.h" -#include "lang_model.h" - -namespace tesseract { -class WordUnigrams { - public: - WordUnigrams(); - ~WordUnigrams(); - // Load the word-list and unigrams from file and create an object - // The word list is assumed to be sorted - static WordUnigrams *Create(const string &data_file_path, - const string &lang); - // Compute the unigram cost of a UTF-32 string. Splits into - // space-separated tokens, strips trailing punctuation from each - // token, evaluates case properties, and calls internal Cost() - // function on UTF-8 version. To avoid unnecessarily penalizing - // all-one-case words or capitalized words (first-letter - // upper-case and remaining letters lower-case) when not all - // versions of the word appear in the .cube.word-freq file, a - // case-invariant cost is computed in those cases, assuming the word - // meets a minimum length. 
- int Cost(const char_32 *str32, LangModel *lang_mod, - CharSet *char_set) const; - protected: - // Compute the word unigram cost of a UTF-8 string with binary - // search of sorted words_ array. - int CostInternal(const char *str) const; - private: - // Only words this length or greater qualify for all-numeric or - // case-invariant word unigram cost. - static const int kMinLengthNumOrCaseInvariant = 4; - - int word_cnt_; - char **words_; - int *costs_; - int not_in_list_cost_; -}; -} - -#endif // WORD_UNIGRAMS_H diff --git a/cutil/Makefile.am b/cutil/Makefile.am index 15b339c8..5b0ffc6e 100644 --- a/cutil/Makefile.am +++ b/cutil/Makefile.am @@ -7,7 +7,7 @@ endif noinst_HEADERS = \ bitvec.h callcpp.h const.h cutil.h cutil_class.h danerror.h efio.h \ - emalloc.h freelist.h globals.h listio.h \ + emalloc.h freelist.h globals.h \ oldlist.h structures.h if !USING_MULTIPLELIBS @@ -22,7 +22,7 @@ endif libtesseract_cutil_la_SOURCES = \ bitvec.cpp callcpp.cpp cutil.cpp cutil_class.cpp danerror.cpp efio.cpp \ - emalloc.cpp freelist.cpp listio.cpp \ + emalloc.cpp freelist.cpp \ oldlist.cpp structures.cpp diff --git a/cutil/bitvec.h b/cutil/bitvec.h index f70d748b..d2a364d2 100644 --- a/cutil/bitvec.h +++ b/cutil/bitvec.h @@ -30,29 +30,29 @@ typedef uinT32 *BIT_VECTOR; /*----------------------------------------------------------------------------- Public Function Prototypes -----------------------------------------------------------------------------*/ -#define zero_all_bits(array,length) \ -{\ - int index; /*temporary index*/\ -\ -for (index=0;index -#include -#include -#include "listio.h" - -/*--------------------------------------------------------------------------- - Public Function Code ----------------------------------------------------------------------------*/ -/************************************************************************* - * R E A D L I S T - * - * Read a list of strings from a file. Return the string list to the - * caller. 
- *************************************************************************/ -LIST read_list(const char *filename) { - FILE *infile; - char s[CHARS_PER_LINE]; - LIST list; - - if ((infile = open_file (filename, "r")) == NULL) - return (NIL_LIST); - - list = NIL_LIST; - while (fgets (s, CHARS_PER_LINE, infile) != NULL) { - s[CHARS_PER_LINE - 1] = '\0'; - if (strlen (s) > 0) { - if (s[strlen (s) - 1] == '\n') - s[strlen (s) - 1] = '\0'; - if (strlen (s) > 0) { - list = push (list, (LIST) strsave (s)); - } - } - } - - fclose(infile); - return (reverse_d (list)); -} diff --git a/cutil/listio.h b/cutil/listio.h deleted file mode 100644 index 7d9c19f7..00000000 --- a/cutil/listio.h +++ /dev/null @@ -1,43 +0,0 @@ -/* -*-C-*- -################################################################################ -# -# File: listio.h -# Description: List I/O processing procedures. -# Author: Mark Seaman, Software Productivity -# Created: Thu Jul 23 13:24:09 1987 -# Modified: Mon Oct 16 11:38:52 1989 (Mark Seaman) marks@hpgrlt -# Language: C -# Package: N/A -# Status: Reusable Software Component -# -# (c) Copyright 1987, Hewlett-Packard Company. -** Licensed under the Apache License, Version 2.0 (the "License"); -** you may not use this file except in compliance with the License. -** You may obtain a copy of the License at -** http://www.apache.org/licenses/LICENSE-2.0 -** Unless required by applicable law or agreed to in writing, software -** distributed under the License is distributed on an "AS IS" BASIS, -** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -** See the License for the specific language governing permissions and -** limitations under the License. -# -################################################################################ - * Revision 1.5 89/06/27 11:56:00 11:56:00 marks (Mark Seaman) - * Fixed MAC_OR_DOS bug - * - - This file contains the interface definitions to a set of general purpose - list I/O routines. 
- -***********************************************************************/ -#ifndef LISTIO_H -#define LISTIO_H - -#include -#include "oldlist.h" - -/*---------------------------------------------------------------------------- - Public Function Prototypes ---------------------------------------------------------------------------*/ -LIST read_list(const char *filename); -#endif diff --git a/cutil/oldlist.cpp b/cutil/oldlist.cpp index 52c0d868..9e3f6f4c 100644 --- a/cutil/oldlist.cpp +++ b/cutil/oldlist.cpp @@ -206,8 +206,8 @@ void destroy_nodes(LIST list, void_dest destructor) { destructor = memfree; while (list != NIL_LIST) { - (*destructor) (first_node (list)); - list = pop (list); + if (first_node(list) != NULL) (*destructor)(first_node(list)); + list = pop(list); } } @@ -401,7 +401,6 @@ LIST s_adjoin(LIST var_list, void *variable, int_compare compare) { return (push_last (var_list, variable)); } - /********************************************************************** * s e a r c h * diff --git a/dict/context.cpp b/dict/context.cpp index a9acb137..a41fda19 100644 --- a/dict/context.cpp +++ b/dict/context.cpp @@ -32,30 +32,24 @@ namespace tesseract { static const int kMinAbsoluteGarbageWordLength = 10; static const float kMinAbsoluteGarbageAlphanumFrac = 0.5f; -const int case_state_table[6][4] = { { - /* 0. Beginning of word */ - /* P U L D */ - /* -1. Error on case */ - 0, 1, 5, 4 - }, - { /* 1. After initial capital */ - 0, 3, 2, 4 - }, - { /* 2. After lower case */ - 0, -1, 2, -1 - }, - { /* 3. After upper case */ - 0, 3, -1, 4 - }, - { /* 4. After a digit */ - 0, -1, -1, 4 - }, - { /* 5. After initial lower case */ - 5, -1, 2, -1 - }, - }; +const int case_state_table[6][4] = { + {/* 0. Beginning of word */ + /* P U L D */ + /* -1. Error on case */ + 0, 1, 5, 4}, + {/* 1. After initial capital */ + 0, 3, 2, 4}, + {/* 2. After lower case */ + 0, -1, 2, -1}, + {/* 3. After upper case */ + 0, 3, -1, 4}, + {/* 4. After a digit */ + 0, -1, -1, 4}, + {/* 5. 
After initial lower case */ + 5, -1, 2, -1}, +}; -int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) { +int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) const { int state = 0; int x; for (x = 0; x < word.length(); ++x) { diff --git a/dict/dawg.cpp b/dict/dawg.cpp index 4bfb5464..6df22c53 100644 --- a/dict/dawg.cpp +++ b/dict/dawg.cpp @@ -174,11 +174,7 @@ bool Dawg::match_words(WERD_CHOICE *word, inT32 index, return false; } -void Dawg::init(DawgType type, const STRING &lang, - PermuterType perm, int unicharset_size, int debug_level) { - type_ = type; - lang_ = lang; - perm_ = perm; +void Dawg::init(int unicharset_size) { ASSERT_HOST(unicharset_size > 0); unicharset_size_ = unicharset_size; // Set bit masks. We will use the value unicharset_size_ as a null char, so @@ -188,8 +184,6 @@ void Dawg::init(DawgType type, const STRING &lang, letter_mask_ = ~(~0ull << flag_start_bit_); next_node_mask_ = ~0ull << (flag_start_bit_ + NUM_FLAG_BITS); flags_mask_ = ~(letter_mask_ | next_node_mask_); - - debug_level_ = debug_level; } @@ -315,44 +309,34 @@ void SquishedDawg::print_edge(EDGE_REF edge) const { } } -void SquishedDawg::read_squished_dawg(FILE *file, - DawgType type, - const STRING &lang, - PermuterType perm, - int debug_level) { - if (debug_level) tprintf("Reading squished dawg\n"); +bool SquishedDawg::read_squished_dawg(TFile *file) { + if (debug_level_) tprintf("Reading squished dawg\n"); // Read the magic number and if it does not match kDawgMagicNumber // set swap to true to indicate that we need to switch endianness. 
inT16 magic; - fread(&magic, sizeof(inT16), 1, file); + if (file->FRead(&magic, sizeof(inT16), 1) != 1) return false; bool swap = (magic != kDawgMagicNumber); - int unicharset_size; - fread(&unicharset_size, sizeof(inT32), 1, file); - fread(&num_edges_, sizeof(inT32), 1, file); - - if (swap) { - ReverseN(&unicharset_size, sizeof(unicharset_size)); - ReverseN(&num_edges_, sizeof(num_edges_)); - } + inT32 unicharset_size; + if (file->FReadEndian(&unicharset_size, sizeof(unicharset_size), 1, swap) != + 1) + return false; + if (file->FReadEndian(&num_edges_, sizeof(num_edges_), 1, swap) != 1) + return false; ASSERT_HOST(num_edges_ > 0); // DAWG should not be empty - Dawg::init(type, lang, perm, unicharset_size, debug_level); + Dawg::init(unicharset_size); edges_ = (EDGE_ARRAY) memalloc(sizeof(EDGE_RECORD) * num_edges_); - fread(&edges_[0], sizeof(EDGE_RECORD), num_edges_, file); - EDGE_REF edge; - if (swap) { - for (edge = 0; edge < num_edges_; ++edge) { - ReverseN(&edges_[edge], sizeof(edges_[edge])); - } - } - if (debug_level > 2) { + if (file->FReadEndian(&edges_[0], sizeof(edges_[0]), num_edges_, swap) != + num_edges_) + return false; + if (debug_level_ > 2) { tprintf("type: %d lang: %s perm: %d unicharset_size: %d num_edges: %d\n", type_, lang_.string(), perm_, unicharset_size_, num_edges_); - for (edge = 0; edge < num_edges_; ++edge) - print_edge(edge); + for (EDGE_REF edge = 0; edge < num_edges_; ++edge) print_edge(edge); } + return true; } NODE_MAP SquishedDawg::build_node_map(inT32 *num_nodes) const { diff --git a/dict/dawg.h b/dict/dawg.h index c0c39fa2..a0312a06 100644 --- a/dict/dawg.h +++ b/dict/dawg.h @@ -128,7 +128,7 @@ class Dawg { inline const STRING &lang() const { return lang_; } inline PermuterType permuter() const { return perm_; } - virtual ~Dawg() {}; + virtual ~Dawg() {} /// Returns true if the given word is in the Dawg. bool word_in_dawg(const WERD_CHOICE &word) const; @@ -183,18 +183,30 @@ class Dawg { /// of the given unichar_id. 
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, - GenericVector *vec) const {}; + GenericVector *vec) const { + (void)unichar_id; + (void)unicharset; + (void)vec; + } /// Returns the given EDGE_REF if the EDGE_RECORD that it points to has /// a self loop and the given unichar_id matches the unichar_id stored in the /// EDGE_RECORD, returns NO_EDGE otherwise. virtual EDGE_REF pattern_loop_edge( EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const { + (void)edge_ref; + (void)unichar_id; + (void)word_end; return false; } protected: - Dawg() {} + Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level) + : type_(type), + lang_(lang), + perm_(perm), + unicharset_size_(0), + debug_level_(debug_level) {} /// Returns the next node visited by following this edge. inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const { @@ -267,10 +279,9 @@ class Dawg { (!word_end || (word_end == other_word_end))); } - /// Sets type_, lang_, perm_, unicharset_size_. + /// Sets unicharset_size_. /// Initializes the values of various masks from unicharset_size_. - void init(DawgType type, const STRING &lang, - PermuterType perm, int unicharset_size, int debug_level); + void init(int unicharset_size); /// Matches all of the words that are represented by this string. /// If wilcard is set to something other than INVALID_UNICHAR_ID, @@ -368,14 +379,6 @@ struct DawgPosition { class DawgPositionVector : public GenericVector { public: - /// Overload destructor, since clear() does not delete data_[] any more. - ~DawgPositionVector() { - if (size_reserved_ > 0) { - delete[] data_; - size_used_ = 0; - size_reserved_ = 0; - } - } /// Overload clear() in order to avoid allocating/deallocating memory /// when clearing the vector and re-inserting entries into it later. 
void clear() { size_used_ = 0; } @@ -408,32 +411,36 @@ class DawgPositionVector : public GenericVector { // class SquishedDawg : public Dawg { public: - SquishedDawg(FILE *file, DawgType type, const STRING &lang, - PermuterType perm, int debug_level) { - read_squished_dawg(file, type, lang, perm, debug_level); + SquishedDawg(DawgType type, const STRING &lang, PermuterType perm, + int debug_level) + : Dawg(type, lang, perm, debug_level) {} + SquishedDawg(const char *filename, DawgType type, const STRING &lang, + PermuterType perm, int debug_level) + : Dawg(type, lang, perm, debug_level) { + TFile file; + ASSERT_HOST(file.Open(filename, nullptr)); + ASSERT_HOST(read_squished_dawg(&file)); num_forward_edges_in_node0 = num_forward_edges(0); } - SquishedDawg(const char* filename, DawgType type, - const STRING &lang, PermuterType perm, int debug_level) { - FILE *file = fopen(filename, "rb"); - if (file == NULL) { - tprintf("Failed to open dawg file %s\n", filename); - exit(1); - } - read_squished_dawg(file, type, lang, perm, debug_level); - num_forward_edges_in_node0 = num_forward_edges(0); - fclose(file); - } SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type, - const STRING &lang, PermuterType perm, - int unicharset_size, int debug_level) : - edges_(edges), num_edges_(num_edges) { - init(type, lang, perm, unicharset_size, debug_level); + const STRING &lang, PermuterType perm, int unicharset_size, + int debug_level) + : Dawg(type, lang, perm, debug_level), + edges_(edges), + num_edges_(num_edges) { + init(unicharset_size); num_forward_edges_in_node0 = num_forward_edges(0); if (debug_level > 3) print_all("SquishedDawg:"); } ~SquishedDawg(); + // Loads using the given TFile. Returns false on failure. + bool Load(TFile *fp) { + if (!read_squished_dawg(fp)) return false; + num_forward_edges_in_node0 = num_forward_edges(0); + return true; + } + int NumEdges() { return num_edges_; } /// Returns the edge that corresponds to the letter out of this node. 
@@ -530,8 +537,7 @@ class SquishedDawg : public Dawg { inT32 num_forward_edges(NODE_REF node) const; /// Reads SquishedDawg from a file. - void read_squished_dawg(FILE *file, DawgType type, const STRING &lang, - PermuterType perm, int debug_level); + bool read_squished_dawg(TFile *file); /// Prints the contents of an edge indicated by the given EDGE_REF. void print_edge(EDGE_REF edge) const; @@ -548,7 +554,7 @@ class SquishedDawg : public Dawg { // Member variables. EDGE_ARRAY edges_; - int num_edges_; + inT32 num_edges_; int num_forward_edges_in_node0; }; diff --git a/dict/dawg_cache.cpp b/dict/dawg_cache.cpp index 335b7c3b..f16ec26f 100644 --- a/dict/dawg_cache.cpp +++ b/dict/dawg_cache.cpp @@ -31,31 +31,27 @@ namespace tesseract { struct DawgLoader { - DawgLoader(const STRING &lang, - const char *data_file_name, - TessdataType tessdata_dawg_type, - int dawg_debug_level) + DawgLoader(const STRING &lang, TessdataType tessdata_dawg_type, + int dawg_debug_level, TessdataManager *data_file) : lang_(lang), - data_file_name_(data_file_name), + data_file_(data_file), tessdata_dawg_type_(tessdata_dawg_type), dawg_debug_level_(dawg_debug_level) {} Dawg *Load(); STRING lang_; - const char *data_file_name_; + TessdataManager *data_file_; TessdataType tessdata_dawg_type_; int dawg_debug_level_; }; -Dawg *DawgCache::GetSquishedDawg( - const STRING &lang, - const char *data_file_name, - TessdataType tessdata_dawg_type, - int debug_level) { - STRING data_id = data_file_name; +Dawg *DawgCache::GetSquishedDawg(const STRING &lang, + TessdataType tessdata_dawg_type, + int debug_level, TessdataManager *data_file) { + STRING data_id = data_file->GetDataFileName(); data_id += kTessdataFileSuffixes[tessdata_dawg_type]; - DawgLoader loader(lang, data_file_name, tessdata_dawg_type, debug_level); + DawgLoader loader(lang, tessdata_dawg_type, debug_level, data_file); return dawgs_.Get(data_id, NewTessCallback(&loader, &DawgLoader::Load)); } @@ -73,27 +69,23 @@ Dawg * 
DawgCache::GetHfstWordModel( #endif Dawg *DawgLoader::Load() { - TessdataManager data_loader; - if (!data_loader.Init(data_file_name_, dawg_debug_level_)) { - return NULL; - } - if (!data_loader.SeekToStart(tessdata_dawg_type_)) { - data_loader.End(); - return NULL; - } - FILE *fp = data_loader.GetDataFilePtr(); + TFile fp; + if (!data_file_->GetComponent(tessdata_dawg_type_, &fp)) return nullptr; DawgType dawg_type; PermuterType perm_type; switch (tessdata_dawg_type_) { case TESSDATA_PUNC_DAWG: + case TESSDATA_LSTM_PUNC_DAWG: dawg_type = DAWG_TYPE_PUNCTUATION; perm_type = PUNC_PERM; break; case TESSDATA_SYSTEM_DAWG: + case TESSDATA_LSTM_SYSTEM_DAWG: dawg_type = DAWG_TYPE_WORD; perm_type = SYSTEM_DAWG_PERM; break; case TESSDATA_NUMBER_DAWG: + case TESSDATA_LSTM_NUMBER_DAWG: dawg_type = DAWG_TYPE_NUMBER; perm_type = NUMBER_PERM; break; @@ -118,8 +110,7 @@ Dawg *DawgLoader::Load() { #endif default: - data_loader.End(); - return NULL; + return nullptr; } #ifdef WITH_HFST @@ -130,10 +121,11 @@ Dawg *DawgLoader::Load() { return retval; } else { #endif - SquishedDawg *retval = - new SquishedDawg(fp, dawg_type, lang_, perm_type, dawg_debug_level_); - data_loader.End(); - return retval; + SquishedDawg *retval = + new SquishedDawg(dawg_type, lang_, perm_type, dawg_debug_level_); + if (retval->Load(&fp)) return retval; + delete retval; + return nullptr; #ifdef WITH_HFST } #endif diff --git a/dict/dawg_cache.h b/dict/dawg_cache.h index 0ec2f024..35aa5c89 100644 --- a/dict/dawg_cache.h +++ b/dict/dawg_cache.h @@ -29,11 +29,8 @@ namespace tesseract { class DawgCache { public: - Dawg *GetSquishedDawg( - const STRING &lang, - const char *data_file_name, - TessdataType tessdata_dawg_type, - int debug_level); + Dawg *GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, + int debug_level, TessdataManager *data_file); #ifdef WITH_HFST Dawg *GetHfstWordModel( diff --git a/dict/dict.cpp b/dict/dict.cpp index a2ab42d1..195a0852 100644 --- a/dict/dict.cpp +++ 
b/dict/dict.cpp @@ -34,13 +34,12 @@ namespace tesseract { class Image; -Dict::Dict(CCUtil* ccutil) +Dict::Dict(CCUtil *ccutil) : letter_is_okay_(&tesseract::Dict::def_letter_is_okay), probability_in_context_(&tesseract::Dict::def_probability_in_context), params_model_classify_(NULL), ccutil_(ccutil), - STRING_MEMBER(user_words_file, "", - "A filename of user-provided words.", + STRING_MEMBER(user_words_file, "", "A filename of user-provided words.", getCCUtil()->params()), STRING_INIT_MEMBER(user_words_suffix, "", "A suffix of user-provided words located in tessdata.", @@ -58,12 +57,17 @@ Dict::Dict(CCUtil* ccutil) getCCUtil()->params()), BOOL_INIT_MEMBER(load_unambig_dawg, true, "Load unambiguous word dawg.", getCCUtil()->params()), - BOOL_INIT_MEMBER(load_punc_dawg, true, "Load dawg with punctuation" - " patterns.", getCCUtil()->params()), - BOOL_INIT_MEMBER(load_number_dawg, true, "Load dawg with number" - " patterns.", getCCUtil()->params()), - BOOL_INIT_MEMBER(load_bigram_dawg, true, "Load dawg with special word " - "bigrams.", getCCUtil()->params()), + BOOL_INIT_MEMBER(load_punc_dawg, true, + "Load dawg with punctuation" + " patterns.", + getCCUtil()->params()), + BOOL_INIT_MEMBER(load_number_dawg, true, + "Load dawg with number" + " patterns.", + getCCUtil()->params()), + BOOL_INIT_MEMBER(load_bigram_dawg, true, + "Load dawg with special word " + "bigrams.", BOOL_INIT_MEMBER(load_hfst_fsm, false, "Load hfst word model.", getCCUtil()->params()), double_MEMBER(xheight_penalty_subscripts, 0.125, @@ -72,21 +76,23 @@ Dict::Dict(CCUtil* ccutil) getCCUtil()->params()), double_MEMBER(xheight_penalty_inconsistent, 0.25, "Score penalty (0.1 = 10%) added if an xheight is " - "inconsistent.", getCCUtil()->params()), + "inconsistent.", + getCCUtil()->params()), double_MEMBER(segment_penalty_dict_frequent_word, 1.0, "Score multiplier for word matches which have good case and" "are frequent in the given language (lower is better).", getCCUtil()->params()), 
double_MEMBER(segment_penalty_dict_case_ok, 1.1, "Score multiplier for word matches that have good case " - "(lower is better).", getCCUtil()->params()), + "(lower is better).", + getCCUtil()->params()), double_MEMBER(segment_penalty_dict_case_bad, 1.3125, "Default score multiplier for word matches, which may have " "case issues (lower is better).", getCCUtil()->params()), double_MEMBER(segment_penalty_ngram_best_choice, 1.24, - "Multipler to for the best choice from the ngram model.", - getCCUtil()->params()), + "Multipler to for the best choice from the ngram model.", + getCCUtil()->params()), double_MEMBER(segment_penalty_dict_nonword, 1.25, "Score multiplier for glyph fragment segmentations which " "do not match a dictionary word (lower is better).", @@ -94,11 +100,13 @@ Dict::Dict(CCUtil* ccutil) double_MEMBER(segment_penalty_garbage, 1.50, "Score multiplier for poorly cased strings that are not in" " the dictionary and generally look like garbage (lower is" - " better).", getCCUtil()->params()), + " better).", + getCCUtil()->params()), STRING_MEMBER(output_ambig_words_file, "", "Output file for ambiguities found in the dictionary", getCCUtil()->params()), - INT_MEMBER(dawg_debug_level, 0, "Set to 1 for general debug info" + INT_MEMBER(dawg_debug_level, 0, + "Set to 1 for general debug info" ", to 2 for more details, to 3 to see all the debug messages", getCCUtil()->params()), INT_MEMBER(hyphen_debug_level, 0, "Debug level for hyphenated words.", @@ -115,12 +123,12 @@ Dict::Dict(CCUtil* ccutil) "Certainty threshold for non-dict words", getCCUtil()->params()), double_MEMBER(stopper_phase2_certainty_rejection_offset, 1.0, - "Reject certainty offset", - getCCUtil()->params()), + "Reject certainty offset", getCCUtil()->params()), INT_MEMBER(stopper_smallword_size, 2, "Size of dict word to be treated as non-dict word", getCCUtil()->params()), - double_MEMBER(stopper_certainty_per_char, -0.50, "Certainty to add" + double_MEMBER(stopper_certainty_per_char, -0.50, + 
"Certainty to add" " for each dict char above small word size.", getCCUtil()->params()), double_MEMBER(stopper_allowable_character_badness, 3.0, @@ -136,9 +144,9 @@ Dict::Dict(CCUtil* ccutil) "Deprecated- backward compatibility only", getCCUtil()->params()), INT_MEMBER(tessedit_truncate_wordchoice_log, 10, - "Max words to keep in list", - getCCUtil()->params()), - STRING_MEMBER(word_to_debug, "", "Word for which stopper debug" + "Max words to keep in list", getCCUtil()->params()), + STRING_MEMBER(word_to_debug, "", + "Word for which stopper debug" " information should be printed to stdout", getCCUtil()->params()), STRING_MEMBER(word_to_debug_lengths, "", @@ -147,10 +155,10 @@ Dict::Dict(CCUtil* ccutil) INT_MEMBER(fragments_debug, 0, "Debug character fragments", getCCUtil()->params()), BOOL_MEMBER(segment_nonalphabetic_script, false, - "Don't use any alphabetic-specific tricks." - "Set to true in the traineddata config file for" - " scripts that are cursive or inherently fixed-pitch", - getCCUtil()->params()), + "Don't use any alphabetic-specific tricks." + "Set to true in the traineddata config file for" + " scripts that are cursive or inherently fixed-pitch", + getCCUtil()->params()), BOOL_MEMBER(save_doc_words, 0, "Save Document Words", getCCUtil()->params()), double_MEMBER(doc_dict_pending_threshold, 0.0, @@ -158,8 +166,10 @@ Dict::Dict(CCUtil* ccutil) getCCUtil()->params()), double_MEMBER(doc_dict_certainty_threshold, -2.25, "Worst certainty for words that can be inserted into the" - "document dictionary", getCCUtil()->params()), - INT_MEMBER(max_permuter_attempts, 10000, "Maximum number of different" + "document dictionary", + getCCUtil()->params()), + INT_MEMBER(max_permuter_attempts, 10000, + "Maximum number of different" " character choices to consider during permutation." 
" This limit is especially useful when user patterns" " are specified, since overly generic patterns can result in" @@ -185,22 +195,20 @@ Dict::Dict(CCUtil* ccutil) } Dict::~Dict() { - if (hyphen_word_ != NULL) delete hyphen_word_; + End(); + delete hyphen_word_; if (output_ambig_words_file_ != NULL) fclose(output_ambig_words_file_); } DawgCache *Dict::GlobalDawgCache() { - // We dynamically allocate this global cache (a singleton) so it will outlive - // every Tesseract instance (even those that someone else might declare as - // global statics). - static DawgCache *cache = new DawgCache(); // evil global singleton - return cache; + // This global cache (a singleton) will outlive every Tesseract instance + // (even those that someone else might declare as global statics). + static DawgCache cache; + return &cache; } -void Dict::Load(DawgCache *dawg_cache) { - STRING name; - STRING &lang = getCCUtil()->lang; - +// Sets up ready for a Load or LoadLSTM. +void Dict::SetupForLoad(DawgCache *dawg_cache) { if (dawgs_.length() != 0) this->End(); apostrophe_unichar_id_ = getUnicharset().unichar_to_id(kApostropheSymbol); @@ -215,38 +223,38 @@ void Dict::Load(DawgCache *dawg_cache) { dawg_cache_ = new DawgCache(); dawg_cache_is_ours_ = true; } +} - TessdataManager &tessdata_manager = getCCUtil()->tessdata_manager; - const char *data_file_name = tessdata_manager.GetDataFileName().string(); - +// Loads the dawgs needed by Tesseract. Call FinishLoad() after. +void Dict::Load(const STRING &lang, TessdataManager *data_file) { // Load dawgs_. 
if (load_punc_dawg) { - punc_dawg_ = dawg_cache_->GetSquishedDawg( - lang, data_file_name, TESSDATA_PUNC_DAWG, dawg_debug_level); + punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, + dawg_debug_level, data_file); if (punc_dawg_) dawgs_ += punc_dawg_; } if (load_system_dawg) { Dawg *system_dawg = dawg_cache_->GetSquishedDawg( - lang, data_file_name, TESSDATA_SYSTEM_DAWG, dawg_debug_level); + lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file); if (system_dawg) dawgs_ += system_dawg; } if (load_number_dawg) { Dawg *number_dawg = dawg_cache_->GetSquishedDawg( - lang, data_file_name, TESSDATA_NUMBER_DAWG, dawg_debug_level); + lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file); if (number_dawg) dawgs_ += number_dawg; } if (load_bigram_dawg) { - bigram_dawg_ = dawg_cache_->GetSquishedDawg( - lang, data_file_name, TESSDATA_BIGRAM_DAWG, dawg_debug_level); + bigram_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_BIGRAM_DAWG, + dawg_debug_level, data_file); } if (load_freq_dawg) { - freq_dawg_ = dawg_cache_->GetSquishedDawg( - lang, data_file_name, TESSDATA_FREQ_DAWG, dawg_debug_level); + freq_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, + dawg_debug_level, data_file); if (freq_dawg_) { dawgs_ += freq_dawg_; } } if (load_unambig_dawg) { - unambig_dawg_ = dawg_cache_->GetSquishedDawg( - lang, data_file_name, TESSDATA_UNAMBIG_DAWG, dawg_debug_level); + unambig_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG, + dawg_debug_level, data_file); if (unambig_dawg_) dawgs_ += unambig_dawg_; } @@ -264,6 +272,7 @@ void Dict::Load(DawgCache *dawg_cache) { } #endif + STRING name; if (((STRING &)user_words_suffix).length() > 0 || ((STRING &)user_words_file).length() > 0) { Trie *trie_ptr = new Trie(DAWG_TYPE_WORD, lang, USER_DAWG_PERM, @@ -309,8 +318,33 @@ void Dict::Load(DawgCache *dawg_cache) { // This dawg is temporary and should not be searched by letter_is_ok. 
pending_words_ = new Trie(DAWG_TYPE_WORD, lang, NO_PERM, getUnicharset().size(), dawg_debug_level); +} - // Construct a list of corresponding successors for each dawg. Each entry i +// Loads the dawgs needed by the LSTM model. Call FinishLoad() after. +void Dict::LoadLSTM(const STRING &lang, TessdataManager *data_file) { + // Load dawgs_. + if (load_punc_dawg) { + punc_dawg_ = dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, + dawg_debug_level, data_file); + if (punc_dawg_) dawgs_ += punc_dawg_; + } + if (load_system_dawg) { + Dawg *system_dawg = dawg_cache_->GetSquishedDawg( + lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file); + if (system_dawg) dawgs_ += system_dawg; + } + if (load_number_dawg) { + Dawg *number_dawg = dawg_cache_->GetSquishedDawg( + lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file); + if (number_dawg) dawgs_ += number_dawg; + } +} + +// Completes the loading process after Load() and/or LoadLSTM(). +// Returns false if no dictionaries were loaded. +bool Dict::FinishLoad() { + if (dawgs_.empty()) return false; + // Construct a list of corresponding successors for each dawg. Each entry, i, // in the successors_ vector is a vector of integers that represent the // indices into the dawgs_ vector of the successors for dawg i. successors_.reserve(dawgs_.length()); @@ -325,6 +359,7 @@ void Dict::Load(DawgCache *dawg_cache) { } successors_ += lst; } + return true; } void Dict::End() { @@ -344,10 +379,8 @@ void Dict::End() { dawgs_.clear(); successors_.clear(); document_words_ = NULL; - if (pending_words_ != NULL) { - delete pending_words_; - pending_words_ = NULL; - } + delete pending_words_; + pending_words_ = NULL; } // Returns true if in light of the current state unichar_id is allowed @@ -377,6 +410,7 @@ int Dict::def_letter_is_okay(void* void_dawg_args, // Initialization. 
PermuterType curr_perm = NO_PERM; dawg_args->updated_dawgs->clear(); + dawg_args->valid_end = false; // Go over the active_dawgs vector and insert DawgPosition records // with the updated ref (an edge with the corresponding unichar id) into @@ -414,6 +448,9 @@ int Dict::def_letter_is_okay(void* void_dawg_args, dawg_debug_level > 0, "Append transition from punc dawg to current dawgs: "); if (sdawg->permuter() > curr_perm) curr_perm = sdawg->permuter(); + if (sdawg->end_of_word(dawg_edge) && + punc_dawg->end_of_word(punc_transition_edge)) + dawg_args->valid_end = true; } } } @@ -428,6 +465,7 @@ int Dict::def_letter_is_okay(void* void_dawg_args, dawg_debug_level > 0, "Extend punctuation dawg: "); if (PUNC_PERM > curr_perm) curr_perm = PUNC_PERM; + if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true; } continue; } @@ -445,6 +483,7 @@ int Dict::def_letter_is_okay(void* void_dawg_args, dawg_debug_level > 0, "Return to punctuation dawg: "); if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter(); + if (punc_dawg->end_of_word(punc_edge)) dawg_args->valid_end = true; } } @@ -454,8 +493,8 @@ int Dict::def_letter_is_okay(void* void_dawg_args, // possible edges, not only for the exact unichar_id, but also // for all its character classes (alpha, digit, etc). if (dawg->type() == DAWG_TYPE_PATTERN) { - ProcessPatternEdges(dawg, pos, unichar_id, word_end, - dawg_args->updated_dawgs, &curr_perm); + ProcessPatternEdges(dawg, pos, unichar_id, word_end, dawg_args, + &curr_perm); // There can't be any successors to dawg that is of type // DAWG_TYPE_PATTERN, so we are done examining this DawgPosition. 
continue; @@ -482,6 +521,9 @@ int Dict::def_letter_is_okay(void* void_dawg_args, continue; } if (dawg->permuter() > curr_perm) curr_perm = dawg->permuter(); + if (dawg->end_of_word(edge) && + (punc_dawg == NULL || punc_dawg->end_of_word(pos.punc_ref))) + dawg_args->valid_end = true; dawg_args->updated_dawgs->add_unique( DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, false), @@ -506,7 +548,7 @@ int Dict::def_letter_is_okay(void* void_dawg_args, void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, UNICHAR_ID unichar_id, bool word_end, - DawgPositionVector *updated_dawgs, + DawgArgs *dawg_args, PermuterType *curr_perm) const { NODE_REF node = GetStartingNode(dawg, pos.dawg_ref); // Try to find the edge corresponding to the exact unichar_id and to all the @@ -529,7 +571,8 @@ void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, tprintf("Letter found in pattern dawg %d\n", pos.dawg_index); } if (dawg->permuter() > *curr_perm) *curr_perm = dawg->permuter(); - updated_dawgs->add_unique( + if (dawg->end_of_word(edge)) dawg_args->valid_end = true; + dawg_args->updated_dawgs->add_unique( DawgPosition(pos.dawg_index, edge, pos.punc_index, pos.punc_ref, pos.back_to_punc), dawg_debug_level > 0, @@ -825,5 +868,13 @@ bool Dict::valid_punctuation(const WERD_CHOICE &word) { return false; } +/// Returns true if the language is space-delimited (not CJ, or T). 
+bool Dict::IsSpaceDelimitedLang() const { + const UNICHARSET &u_set = getUnicharset(); + if (u_set.han_sid() > 0) return false; + if (u_set.katakana_sid() > 0) return false; + if (u_set.thai_sid() > 0) return false; + return true; +} } // namespace tesseract diff --git a/dict/dict.h b/dict/dict.h index 8815e0b4..3ae0220b 100644 --- a/dict/dict.h +++ b/dict/dict.h @@ -23,7 +23,6 @@ #include "dawg.h" #include "dawg_cache.h" #include "host.h" -#include "oldlist.h" #include "ratngs.h" #include "stopper.h" #include "trie.h" @@ -76,11 +75,13 @@ enum XHeightConsistencyEnum {XH_GOOD, XH_SUBNORMAL, XH_INCONSISTENT}; struct DawgArgs { DawgArgs(DawgPositionVector *d, DawgPositionVector *up, PermuterType p) - : active_dawgs(d), updated_dawgs(up), permuter(p) {} + : active_dawgs(d), updated_dawgs(up), permuter(p), valid_end(false) {} DawgPositionVector *active_dawgs; DawgPositionVector *updated_dawgs; PermuterType permuter; + // True if the current position is a valid word end. + bool valid_end; }; class Dict { @@ -259,7 +260,7 @@ class Dict { MATRIX *ratings); /// Returns the length of the shortest alpha run in WordChoice. - int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice); + int LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const; /// Returns true if the certainty of the BestChoice word is within a /// reasonable range of the average certainties for the best choices for /// each character in the segmentation. This test is used to catch words @@ -274,7 +275,7 @@ class Dict { /// Returns false if the best choice for the current word is questionable /// and should be tried again on the second pass or should be flagged to /// the user. - bool AcceptableResult(WERD_RES* word); + bool AcceptableResult(WERD_RES *word) const; void EndDangerousAmbigs(); /// Prints the current choices for this word to stdout. 
void DebugWordChoices(); @@ -284,7 +285,7 @@ class Dict { void SettupStopperPass2(); /* context.cpp *************************************************************/ /// Check a string to see if it matches a set of lexical rules. - int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset); + int case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) const; /// Returns true if the word looks like an absolute garbage /// (e.g. image mistakenly recognized as text). bool absolute_garbage(const WERD_CHOICE &word, const UNICHARSET &unicharset); @@ -294,7 +295,15 @@ class Dict { /// Initialize Dict class - load dawgs from [lang].traineddata and /// user-specified wordlist and parttern list. static DawgCache *GlobalDawgCache(); - void Load(DawgCache *dawg_cache); + // Sets up ready for a Load or LoadLSTM. + void SetupForLoad(DawgCache *dawg_cache); + // Loads the dawgs needed by Tesseract. Call FinishLoad() after. + void Load(const STRING &lang, TessdataManager *data_file); + // Loads the dawgs needed by the LSTM model. Call FinishLoad() after. + void LoadLSTM(const STRING &lang, TessdataManager *data_file); + // Completes the loading process after Load() and/or LoadLSTM(). + // Returns false if no dictionaries were loaded. + bool FinishLoad(); void End(); // Resets the document dictionary analogous to ResetAdaptiveClassifier. 
@@ -374,10 +383,11 @@ class Dict { double def_probability_in_context( const char* lang, const char* context, int context_bytes, const char* character, int character_bytes) { - (void) context; - (void) context_bytes; - (void) character; - (void) character_bytes; + (void)lang; + (void)context; + (void)context_bytes; + (void)character; + (void)character_bytes; return 0.0; } double ngram_probability_in_context(const char* lang, @@ -397,9 +407,7 @@ class Dict { } inline void SetWildcardID(UNICHAR_ID id) { wildcard_unichar_id_ = id; } - inline UNICHAR_ID WildcardID() const { - return wildcard_unichar_id_; - } + inline UNICHAR_ID WildcardID() const { return wildcard_unichar_id_; } /// Return the number of dawgs in the dawgs_ vector. inline int NumDawgs() const { return dawgs_.size(); } /// Return i-th dawg pointer recorded in the dawgs_ vector. @@ -436,7 +444,7 @@ class Dict { /// edges were found. void ProcessPatternEdges(const Dawg *dawg, const DawgPosition &info, UNICHAR_ID unichar_id, bool word_end, - DawgPositionVector *updated_dawgs, + DawgArgs *dawg_args, PermuterType *current_permuter) const; /// Read/Write/Access special purpose dawgs which contain words @@ -483,6 +491,8 @@ class Dict { inline void SetWordsegRatingAdjustFactor(float f) { wordseg_rating_adjust_factor_ = f; } + /// Returns true if the language is space-delimited (not CJ, or T). + bool IsSpaceDelimitedLang() const; private: /** Private member variables. */ diff --git a/dict/stopper.cpp b/dict/stopper.cpp index 660b4c8c..67327823 100644 --- a/dict/stopper.cpp +++ b/dict/stopper.cpp @@ -1,10 +1,10 @@ /****************************************************************************** - ** Filename: stopper.c - ** Purpose: Stopping criteria for word classifier. - ** Author: Dan Johnson - ** History: Mon Apr 29 14:56:49 1991, DSJ, Created. + ** Filename: stopper.c + ** Purpose: Stopping criteria for word classifier. + ** Author: Dan Johnson + ** History: Mon Apr 29 14:56:49 1991, DSJ, Created. 
** - ** (c) Copyright Hewlett-Packard Company, 1988. + ** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. ** You may obtain a copy of the License at @@ -41,7 +41,6 @@ #pragma warning(disable:4800) // int/bool warnings #endif -using tesseract::ScriptPos; /*---------------------------------------------------------------------------- Private Code ----------------------------------------------------------------------------*/ @@ -108,7 +107,7 @@ bool Dict::AcceptableChoice(const WERD_CHOICE& best_choice, } } -bool Dict::AcceptableResult(WERD_RES* word) { +bool Dict::AcceptableResult(WERD_RES *word) const { if (word->best_choice == NULL) return false; float CertaintyThreshold = stopper_nondict_certainty_base - reject_offset_; int WordSize; @@ -449,7 +448,7 @@ void Dict::ReplaceAmbig(int wrong_ngram_begin_index, int wrong_ngram_size, } } -int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) { +int Dict::LengthOfShortestAlphaRun(const WERD_CHOICE &WordChoice) const { int shortest = MAX_INT32; int curr_len = 0; for (int w = 0; w < WordChoice.length(); ++w) { diff --git a/dict/stopper.h b/dict/stopper.h index b028b0ee..58d23734 100644 --- a/dict/stopper.h +++ b/dict/stopper.h @@ -1,10 +1,10 @@ /****************************************************************************** - ** Filename: stopper.h - ** Purpose: Stopping criteria for word classifier. - ** Author: Dan Johnson - ** History: Wed May 1 09:42:57 1991, DSJ, Created. + ** Filename: stopper.h + ** Purpose: Stopping criteria for word classifier. + ** Author: Dan Johnson + ** History: Wed May 1 09:42:57 1991, DSJ, Created. ** - ** (c) Copyright Hewlett-Packard Company, 1988. + ** (c) Copyright Hewlett-Packard Company, 1988. ** Licensed under the Apache License, Version 2.0 (the "License"); ** you may not use this file except in compliance with the License. 
** You may obtain a copy of the License at diff --git a/dict/trie.cpp b/dict/trie.cpp index 6a7a8d1e..ac7dd33c 100644 --- a/dict/trie.cpp +++ b/dict/trie.cpp @@ -276,7 +276,6 @@ bool Trie::add_word_to_dawg(const WERD_CHOICE &word, NODE_REF Trie::new_dawg_node() { TRIE_NODE_RECORD *node = new TRIE_NODE_RECORD(); - if (node == NULL) return 0; // failed to create new node nodes_.push_back(node); return nodes_.length() - 1; } diff --git a/dict/trie.h b/dict/trie.h index 48ec56e0..8428ebba 100644 --- a/dict/trie.h +++ b/dict/trie.h @@ -87,8 +87,9 @@ class Trie : public Dawg { // contain more edges than max_num_edges, all the edges are cleared // so that new inserts can proceed). Trie(DawgType type, const STRING &lang, PermuterType perm, - int unicharset_size, int debug_level) { - init(type, lang, perm, unicharset_size, debug_level); + int unicharset_size, int debug_level) + : Dawg(type, lang, perm, debug_level) { + init(unicharset_size); num_edges_ = 0; deref_node_index_mask_ = ~letter_mask_; new_dawg_node(); // need to allocate node 0 @@ -402,7 +403,7 @@ class Trie : public Dawg { EDGE_VECTOR* backward_edges, NODE_MARKER reduced_nodes); - /** + /** * Order num_edges of consequtive EDGE_RECORDS in the given EDGE_VECTOR in * increasing order of unichar ids. This function is normally called * for all edges in a single node, and since number of edges in each node diff --git a/doc/Doxyfile b/doc/Doxyfile index c4f496be..ef945802 100644 --- a/doc/Doxyfile +++ b/doc/Doxyfile @@ -2135,7 +2135,7 @@ DOT_NUM_THREADS = 0 # The default value is: Helvetica. # This tag requires that the tag HAVE_DOT is set to YES. -DOT_FONTNAME = FreeSans +DOT_FONTNAME = # The DOT_FONTSIZE tag can be used to set the size (in points) of the font of # dot graphs. diff --git a/doc/ambiguous_words.1.html b/doc/ambiguous_words.1.html index 3fd5f7f1..be74b62d 100644 --- a/doc/ambiguous_words.1.html +++ b/doc/ambiguous_words.1.html @@ -1,790 +1,790 @@ - - - - - -AMBIGUOUS_WORDS(1) - - - - - -
-
-

SYNOPSIS

-
-

ambiguous_words [-l lang] TESSDATADIR WORDLIST AMBIGUOUSFILE

-
-
-
-

DESCRIPTION

-
-

ambiguous_words(1) runs Tesseract in a special mode, and for each word -in word list, produces a set of words which Tesseract thinks might be -ambiguous with it. TESSDATADIR must be set to the absolute path of -a directory containing tessdata/lang.traineddata.

-
-
-
-

SEE ALSO

-
-

tesseract(1)

-
-
-
-

COPYING

-
-

Copyright (C) 2012 Google, Inc. -Licensed under the Apache License, Version 2.0

-
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - + + + + + +AMBIGUOUS_WORDS(1) + + + + + +
+
+

SYNOPSIS

+
+

ambiguous_words [-l lang] TESSDATADIR WORDLIST AMBIGUOUSFILE

+
+
+
+

DESCRIPTION

+
+

ambiguous_words(1) runs Tesseract in a special mode, and for each word +in word list, produces a set of words which Tesseract thinks might be +ambiguous with it. TESSDATADIR must be set to the absolute path of +a directory containing tessdata/lang.traineddata.

+
+
+
+

SEE ALSO

+
+

tesseract(1)

+
+
+
+

COPYING

+
+

Copyright (C) 2012 Google, Inc. +Licensed under the Apache License, Version 2.0

+
+
+
+

AUTHOR

+
+

The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present).

+
+
+
+

+ + + diff --git a/doc/ambiguous_words.1.xml b/doc/ambiguous_words.1.xml index 6293866c..4900c6eb 100644 --- a/doc/ambiguous_words.1.xml +++ b/doc/ambiguous_words.1.xml @@ -1,43 +1,43 @@ - - - - - - - AMBIGUOUS_WORDS(1) - - -ambiguous_words -1 -  -  - - - ambiguous_words - generate sets of words Tesseract is likely to find ambiguous - - -ambiguous_words [-l lang] TESSDATADIR WORDLIST AMBIGUOUSFILE - - -DESCRIPTION -ambiguous_words(1) runs Tesseract in a special mode, and for each word -in word list, produces a set of words which Tesseract thinks might be -ambiguous with it. TESSDATADIR must be set to the absolute path of -a directory containing tessdata/lang.traineddata. - - -SEE ALSO -tesseract(1) - - -COPYING -Copyright (C) 2012 Google, Inc. -Licensed under the Apache License, Version 2.0 - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). - - + + + + + + + AMBIGUOUS_WORDS(1) + + +ambiguous_words +1 +  +  + + + ambiguous_words + generate sets of words Tesseract is likely to find ambiguous + + +ambiguous_words [-l lang] TESSDATADIR WORDLIST AMBIGUOUSFILE + + +DESCRIPTION +ambiguous_words(1) runs Tesseract in a special mode, and for each word +in word list, produces a set of words which Tesseract thinks might be +ambiguous with it. TESSDATADIR must be set to the absolute path of +a directory containing tessdata/lang.traineddata. + + +SEE ALSO +tesseract(1) + + +COPYING +Copyright (C) 2012 Google, Inc. +Licensed under the Apache License, Version 2.0 + + +AUTHOR +The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present). + + diff --git a/doc/cntraining.1.html b/doc/cntraining.1.html index 706d3bd0..7653061e 100644 --- a/doc/cntraining.1.html +++ b/doc/cntraining.1.html @@ -1,805 +1,805 @@ - - - - - -CNTRAINING(1) - - - - - -
-
-

SYNOPSIS

-
-

cntraining [-D dir] FILE

-
-
-
-

DESCRIPTION

-
-

cntraining takes a list of .tr files, from which it generates the -normproto data file (the character normalization sensitivity -prototypes).

-
-
-
-

OPTIONS

-
-
-
--D dir -
-
-

- Directory to write output files to. -

-
-
-
-
-
-

SEE ALSO

-
-

tesseract(1), shapeclustering(1), mftraining(1)

- -
-
-
-

COPYING

-
-

Copyright (c) Hewlett-Packard Company, 1988 -Licensed under the Apache License, Version 2.0

-
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - + + + + + +CNTRAINING(1) + + + + + +
+
+

SYNOPSIS

+
+

cntraining [-D dir] FILE

+
+
+
+

DESCRIPTION

+
+

cntraining takes a list of .tr files, from which it generates the +normproto data file (the character normalization sensitivity +prototypes).

+
+
+
+

OPTIONS

+
+
+
+-D dir +
+
+

+ Directory to write output files to. +

+
+
+
+
+
+

SEE ALSO

+
+

tesseract(1), shapeclustering(1), mftraining(1)

+ +
+
+
+

COPYING

+
+

Copyright (c) Hewlett-Packard Company, 1988 +Licensed under the Apache License, Version 2.0

+
+
+
+

AUTHOR

+
+

The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present).

+
+
+
+

+ + + diff --git a/doc/cntraining.1.xml b/doc/cntraining.1.xml index 6795f12f..6efc99be 100644 --- a/doc/cntraining.1.xml +++ b/doc/cntraining.1.xml @@ -1,58 +1,58 @@ - - - - - - - CNTRAINING(1) - - -cntraining -1 -  -  - - - cntraining - character normalization training for Tesseract - - -cntraining [-D dir] FILE - - -DESCRIPTION -cntraining takes a list of .tr files, from which it generates the -normproto data file (the character normalization sensitivity -prototypes). - - -OPTIONS - - - --D dir - - - - Directory to write output files to. - - - - - - -SEE ALSO -tesseract(1), shapeclustering(1), mftraining(1) -https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract - - -COPYING -Copyright (c) Hewlett-Packard Company, 1988 -Licensed under the Apache License, Version 2.0 - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). - - + + + + + + + CNTRAINING(1) + + +cntraining +1 +  +  + + + cntraining + character normalization training for Tesseract + + +cntraining [-D dir] FILE + + +DESCRIPTION +cntraining takes a list of .tr files, from which it generates the +normproto data file (the character normalization sensitivity +prototypes). + + +OPTIONS + + + +-D dir + + + + Directory to write output files to. + + + + + + +SEE ALSO +tesseract(1), shapeclustering(1), mftraining(1) +https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract + + +COPYING +Copyright (c) Hewlett-Packard Company, 1988 +Licensed under the Apache License, Version 2.0 + + +AUTHOR +The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present). 
+ + diff --git a/doc/combine_tessdata.1 b/doc/combine_tessdata.1 index d876d1b8..7f29bad4 100644 --- a/doc/combine_tessdata.1 +++ b/doc/combine_tessdata.1 @@ -171,16 +171,6 @@ lang\&.fixed\-length\-dawgs (Optional) Several dawgs of different fixed lengths \(em useful for languages like Chinese\&. .RE .PP -lang\&.cube\-unicharset -.RS 4 -(Optional) A unicharset for cube, if cube was trained on a different set of symbols\&. -.RE -.PP -lang\&.cube\-word\-dawg -.RS 4 -(Optional) A word dawg for cube\(cqs alternate unicharset\&. Not needed if Cube was trained with Tesseract\(cqs unicharset\&. -.RE -.PP lang\&.shapetable .RS 4 (Optional) When present, a shapetable is an extra layer between the character classifier and the word recognizer that allows the character classifier to return a collection of unichar ids and fonts instead of a single unichar\-id and font\&. diff --git a/doc/combine_tessdata.1.asc b/doc/combine_tessdata.1.asc index d93de7ea..ec702330 100644 --- a/doc/combine_tessdata.1.asc +++ b/doc/combine_tessdata.1.asc @@ -11,7 +11,7 @@ SYNOPSIS DESCRIPTION ----------- -combine_tessdata(1) is the main program to combine/extract/overwrite +combine_tessdata(1) is the main program to combine/extract/overwrite tessdata components in [lang].traineddata files. To combine all the individual tessdata components (unicharset, DAWGs, @@ -120,14 +120,6 @@ lang.fixed-length-dawgs:: (Optional) Several dawgs of different fixed lengths -- useful for languages like Chinese. -lang.cube-unicharset:: - (Optional) A unicharset for cube, if cube was trained on a different set - of symbols. - -lang.cube-word-dawg:: - (Optional) A word dawg for cube's alternate unicharset. Not needed if Cube - was trained with Tesseract's unicharset. 
- lang.shapetable:: (Optional) When present, a shapetable is an extra layer between the character classifier and the word recognizer that allows the character classifier to diff --git a/doc/combine_tessdata.1.html b/doc/combine_tessdata.1.html index 8de474b3..2fc45b08 100644 --- a/doc/combine_tessdata.1.html +++ b/doc/combine_tessdata.1.html @@ -1,1014 +1,996 @@ - - - - - -COMBINE_TESSDATA(1) - - - - - -
-
-

SYNOPSIS

-
-

combine_tessdata [OPTION] FILE

-
-
-
-

DESCRIPTION

-
-

combine_tessdata(1) is the main program to combine/extract/overwrite -tessdata components in [lang].traineddata files.

-

To combine all the individual tessdata components (unicharset, DAWGs, -classifier templates, ambiguities, language configs) located at, say, -/home/$USER/temp/eng.* run:

-
-
-
combine_tessdata /home/$USER/temp/eng.
-
-

The result will be a combined tessdata file /home/$USER/temp/eng.traineddata

-

Specify option -e if you would like to extract individual components -from a combined traineddata file. For example, to extract language config -file and the unicharset from tessdata/eng.traineddata run:

-
-
-
combine_tessdata -e tessdata/eng.traineddata \
-  /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
-
-

The desired config file and unicharset will be written to -/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset

-

Specify option -o to overwrite individual components of the given -[lang].traineddata file. For example, to overwrite language config -and unichar ambiguities files in tessdata/eng.traineddata use:

-
-
-
combine_tessdata -o tessdata/eng.traineddata \
-  /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs
-
-

As a result, tessdata/eng.traineddata will contain the new language config -and unichar ambigs, plus all the original DAWGs, classifier templates, etc.

-

Note: the file names of the files to extract to and to overwrite from should -have the appropriate file suffixes (extensions) indicating their tessdata -component type (.unicharset for the unicharset, .unicharambigs for unichar -ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.

-

Specify option -u to unpack all the components to the specified path:

-
-
-
combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.
-
-

This will create /home/$USER/temp/eng.* files with individual tessdata -components from tessdata/eng.traineddata.

-
-
-
-

OPTIONS

-
-

-e .traineddata FILE…: - Extracts the specified components from the .traineddata file

-

-o .traineddata FILE…: - Overwrites the specified components of the .traineddata file - with those provided on the comand line.

-

-u .traineddata PATHPREFIX - Unpacks the .traineddata using the provided prefix.

-
-
-
-

CAVEATS

-
-

Prefix refers to the full file prefix, including period (.)

-
-
-
-

COMPONENTS

-
-

The components in a Tesseract lang.traineddata file as of -Tesseract 3.02 are briefly described below; For more information on -many of these files, see -https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract

-
-
-lang.config -
-
-

- (Optional) Language-specific overrides to default config variables. -

-
-
-lang.unicharset -
-
-

- (Required) The list of symbols that Tesseract recognizes, with properties. - See unicharset(5). -

-
-
-lang.unicharambigs -
-
-

- (Optional) This file contains information on pairs of recognized symbols - which are often confused. For example, rn and m. -

-
-
-lang.inttemp -
-
-

- (Required) Character shape templates for each unichar. Produced by - mftraining(1). -

-
-
-lang.pffmtable -
-
-

- (Required) The number of features expected for each unichar. - Produced by mftraining(1) from .tr files. -

-
-
-lang.normproto -
-
-

- (Required) Character normalization prototypes generated by cntraining(1) - from .tr files. -

-
-
-lang.punc-dawg -
-
-

- (Optional) A dawg made from punctuation patterns found around words. - The "word" part is replaced by a single space. -

-
-
-lang.word-dawg -
-
-

- (Optional) A dawg made from dictionary words from the language. -

-
-
-lang.number-dawg -
-
-

- (Optional) A dawg made from tokens which originally contained digits. - Each digit is replaced by a space character. -

-
-
-lang.freq-dawg -
-
-

- (Optional) A dawg made from the most frequent words which would have - gone into word-dawg. -

-
-
-lang.fixed-length-dawgs -
-
-

- (Optional) Several dawgs of different fixed lengths — useful for - languages like Chinese. -

-
-
-lang.cube-unicharset -
-
-

- (Optional) A unicharset for cube, if cube was trained on a different set - of symbols. -

-
-
-lang.cube-word-dawg -
-
-

- (Optional) A word dawg for cube’s alternate unicharset. Not needed if Cube - was trained with Tesseract’s unicharset. -

-
-
-lang.shapetable -
-
-

- (Optional) When present, a shapetable is an extra layer between the character - classifier and the word recognizer that allows the character classifier to - return a collection of unichar ids and fonts instead of a single unichar-id - and font. -

-
-
-lang.bigram-dawg -
-
-

- (Optional) A dawg of word bigrams where the words are separated by a space - and each digit is replaced by a ?. -

-
-
-lang.unambig-dawg -
-
-

- (Optional) TODO: Describe. -

-
-
-lang.params-training-model -
-
-

- (Optional) TODO: Describe. -

-
-
-
-
-
-

HISTORY

-
-

combine_tessdata(1) first appeared in version 3.00 of Tesseract

-
-
-
-

SEE ALSO

-
-

tesseract(1), wordlist2dawg(1), cntraining(1), mftraining(1), unicharset(5), -unicharambigs(5)

-
-
-
-

COPYING

-
-

Copyright (C) 2009, Google Inc. -Licensed under the Apache License, Version 2.0

-
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - + + + + + +COMBINE_TESSDATA(1) + + + + + +
+
+

SYNOPSIS

+
+

combine_tessdata [OPTION] FILE

+
+
+
+

DESCRIPTION

+
+

combine_tessdata(1) is the main program to combine/extract/overwrite +tessdata components in [lang].traineddata files.

+

To combine all the individual tessdata components (unicharset, DAWGs, +classifier templates, ambiguities, language configs) located at, say, +/home/$USER/temp/eng.* run:

+
+
+
combine_tessdata /home/$USER/temp/eng.
+
+

The result will be a combined tessdata file /home/$USER/temp/eng.traineddata

+

Specify option -e if you would like to extract individual components +from a combined traineddata file. For example, to extract language config +file and the unicharset from tessdata/eng.traineddata run:

+
+
+
combine_tessdata -e tessdata/eng.traineddata \
+  /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
+
+

The desired config file and unicharset will be written to +/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset

+

Specify option -o to overwrite individual components of the given +[lang].traineddata file. For example, to overwrite language config +and unichar ambiguities files in tessdata/eng.traineddata use:

+
+
+
combine_tessdata -o tessdata/eng.traineddata \
+  /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs
+
+

As a result, tessdata/eng.traineddata will contain the new language config +and unichar ambigs, plus all the original DAWGs, classifier templates, etc.

+

Note: the file names of the files to extract to and to overwrite from should +have the appropriate file suffixes (extensions) indicating their tessdata +component type (.unicharset for the unicharset, .unicharambigs for unichar +ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.

+

Specify option -u to unpack all the components to the specified path:

+
+
+
combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.
+
+

This will create /home/$USER/temp/eng.* files with individual tessdata +components from tessdata/eng.traineddata.

+
+
+
+

OPTIONS

+
+

-e .traineddata FILE…: + Extracts the specified components from the .traineddata file

+

-o .traineddata FILE…: + Overwrites the specified components of the .traineddata file + with those provided on the comand line.

+

-u .traineddata PATHPREFIX + Unpacks the .traineddata using the provided prefix.

+
+
+
+

CAVEATS

+
+

Prefix refers to the full file prefix, including period (.)

+
+
+
+

COMPONENTS

+
+

The components in a Tesseract lang.traineddata file as of +Tesseract 3.02 are briefly described below; For more information on +many of these files, see +https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract

+
+
+lang.config +
+
+

+ (Optional) Language-specific overrides to default config variables. +

+
+
+lang.unicharset +
+
+

+ (Required) The list of symbols that Tesseract recognizes, with properties. + See unicharset(5). +

+
+
+lang.unicharambigs +
+
+

+ (Optional) This file contains information on pairs of recognized symbols + which are often confused. For example, rn and m. +

+
+
+lang.inttemp +
+
+

+ (Required) Character shape templates for each unichar. Produced by + mftraining(1). +

+
+
+lang.pffmtable +
+
+

+ (Required) The number of features expected for each unichar. + Produced by mftraining(1) from .tr files. +

+
+
+lang.normproto +
+
+

+ (Required) Character normalization prototypes generated by cntraining(1) + from .tr files. +

+
+
+lang.punc-dawg +
+
+

+ (Optional) A dawg made from punctuation patterns found around words. + The "word" part is replaced by a single space. +

+
+
+lang.word-dawg +
+
+

+ (Optional) A dawg made from dictionary words from the language. +

+
+
+lang.number-dawg +
+
+

+ (Optional) A dawg made from tokens which originally contained digits. + Each digit is replaced by a space character. +

+
+
+lang.freq-dawg +
+
+

+ (Optional) A dawg made from the most frequent words which would have + gone into word-dawg. +

+
+
+lang.fixed-length-dawgs +
+
+

+ (Optional) Several dawgs of different fixed lengths — useful for + languages like Chinese. +

+
+
+lang.shapetable +
+
+

+ (Optional) When present, a shapetable is an extra layer between the character + classifier and the word recognizer that allows the character classifier to + return a collection of unichar ids and fonts instead of a single unichar-id + and font. +

+
+
+lang.bigram-dawg +
+
+

+ (Optional) A dawg of word bigrams where the words are separated by a space + and each digit is replaced by a ?. +

+
+
+lang.unambig-dawg +
+
+

+ (Optional) TODO: Describe. +

+
+
+lang.params-training-model +
+
+

+ (Optional) TODO: Describe. +

+
+
+
+
+
+

HISTORY

+
+

combine_tessdata(1) first appeared in version 3.00 of Tesseract

+
+
+
+

SEE ALSO

+
+

tesseract(1), wordlist2dawg(1), cntraining(1), mftraining(1), unicharset(5), +unicharambigs(5)

+
+
+
+

COPYING

+
+

Copyright (C) 2009, Google Inc. +Licensed under the Apache License, Version 2.0

+
+
+
+

AUTHOR

+
+

The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present).

+
+
+
+

+ + + diff --git a/doc/combine_tessdata.1.xml b/doc/combine_tessdata.1.xml index 1a43995f..d11bac8f 100644 --- a/doc/combine_tessdata.1.xml +++ b/doc/combine_tessdata.1.xml @@ -1,281 +1,259 @@ - - - - - - - COMBINE_TESSDATA(1) - - -combine_tessdata -1 -  -  - - - combine_tessdata - combine/extract/overwrite Tesseract data - - -combine_tessdata [OPTION] FILE - - -DESCRIPTION -combine_tessdata(1) is the main program to combine/extract/overwrite -tessdata components in [lang].traineddata files. -To combine all the individual tessdata components (unicharset, DAWGs, -classifier templates, ambiguities, language configs) located at, say, -/home/$USER/temp/eng.* run: -combine_tessdata /home/$USER/temp/eng. -The result will be a combined tessdata file /home/$USER/temp/eng.traineddata -Specify option -e if you would like to extract individual components -from a combined traineddata file. For example, to extract language config -file and the unicharset from tessdata/eng.traineddata run: -combine_tessdata -e tessdata/eng.traineddata \ - /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset -The desired config file and unicharset will be written to -/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset -Specify option -o to overwrite individual components of the given -[lang].traineddata file. For example, to overwrite language config -and unichar ambiguities files in tessdata/eng.traineddata use: -combine_tessdata -o tessdata/eng.traineddata \ - /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs -As a result, tessdata/eng.traineddata will contain the new language config -and unichar ambigs, plus all the original DAWGs, classifier templates, etc. -Note: the file names of the files to extract to and to overwrite from should -have the appropriate file suffixes (extensions) indicating their tessdata -component type (.unicharset for the unicharset, .unicharambigs for unichar -ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h. 
-Specify option -u to unpack all the components to the specified path: -combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng. -This will create /home/$USER/temp/eng.* files with individual tessdata -components from tessdata/eng.traineddata. - - -OPTIONS --e .traineddata FILE…: - Extracts the specified components from the .traineddata file --o .traineddata FILE…: - Overwrites the specified components of the .traineddata file - with those provided on the comand line. --u .traineddata PATHPREFIX - Unpacks the .traineddata using the provided prefix. - - -CAVEATS -Prefix refers to the full file prefix, including period (.) - - -COMPONENTS -The components in a Tesseract lang.traineddata file as of -Tesseract 3.02 are briefly described below; For more information on -many of these files, see -https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract - - - -lang.config - - - - (Optional) Language-specific overrides to default config variables. - - - - - -lang.unicharset - - - - (Required) The list of symbols that Tesseract recognizes, with properties. - See unicharset(5). - - - - - -lang.unicharambigs - - - - (Optional) This file contains information on pairs of recognized symbols - which are often confused. For example, rn and m. - - - - - -lang.inttemp - - - - (Required) Character shape templates for each unichar. Produced by - mftraining(1). - - - - - -lang.pffmtable - - - - (Required) The number of features expected for each unichar. - Produced by mftraining(1) from .tr files. - - - - - -lang.normproto - - - - (Required) Character normalization prototypes generated by cntraining(1) - from .tr files. - - - - - -lang.punc-dawg - - - - (Optional) A dawg made from punctuation patterns found around words. - The "word" part is replaced by a single space. - - - - - -lang.word-dawg - - - - (Optional) A dawg made from dictionary words from the language. - - - - - -lang.number-dawg - - - - (Optional) A dawg made from tokens which originally contained digits. 
- Each digit is replaced by a space character. - - - - - -lang.freq-dawg - - - - (Optional) A dawg made from the most frequent words which would have - gone into word-dawg. - - - - - -lang.fixed-length-dawgs - - - - (Optional) Several dawgs of different fixed lengths — useful for - languages like Chinese. - - - - - -lang.cube-unicharset - - - - (Optional) A unicharset for cube, if cube was trained on a different set - of symbols. - - - - - -lang.cube-word-dawg - - - - (Optional) A word dawg for cube’s alternate unicharset. Not needed if Cube - was trained with Tesseract’s unicharset. - - - - - -lang.shapetable - - - - (Optional) When present, a shapetable is an extra layer between the character - classifier and the word recognizer that allows the character classifier to - return a collection of unichar ids and fonts instead of a single unichar-id - and font. - - - - - -lang.bigram-dawg - - - - (Optional) A dawg of word bigrams where the words are separated by a space - and each digit is replaced by a ?. - - - - - -lang.unambig-dawg - - - - (Optional) TODO: Describe. - - - - - -lang.params-training-model - - - - (Optional) TODO: Describe. - - - - - - -HISTORY -combine_tessdata(1) first appeared in version 3.00 of Tesseract - - -SEE ALSO -tesseract(1), wordlist2dawg(1), cntraining(1), mftraining(1), unicharset(5), -unicharambigs(5) - - -COPYING -Copyright (C) 2009, Google Inc. -Licensed under the Apache License, Version 2.0 - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). - - + + + + + + + COMBINE_TESSDATA(1) + + +combine_tessdata +1 +  +  + + + combine_tessdata + combine/extract/overwrite Tesseract data + + +combine_tessdata [OPTION] FILE + + +DESCRIPTION +combine_tessdata(1) is the main program to combine/extract/overwrite +tessdata components in [lang].traineddata files. 
+To combine all the individual tessdata components (unicharset, DAWGs, +classifier templates, ambiguities, language configs) located at, say, +/home/$USER/temp/eng.* run: +combine_tessdata /home/$USER/temp/eng. +The result will be a combined tessdata file /home/$USER/temp/eng.traineddata +Specify option -e if you would like to extract individual components +from a combined traineddata file. For example, to extract language config +file and the unicharset from tessdata/eng.traineddata run: +combine_tessdata -e tessdata/eng.traineddata \ + /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset +The desired config file and unicharset will be written to +/home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset +Specify option -o to overwrite individual components of the given +[lang].traineddata file. For example, to overwrite language config +and unichar ambiguities files in tessdata/eng.traineddata use: +combine_tessdata -o tessdata/eng.traineddata \ + /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs +As a result, tessdata/eng.traineddata will contain the new language config +and unichar ambigs, plus all the original DAWGs, classifier templates, etc. +Note: the file names of the files to extract to and to overwrite from should +have the appropriate file suffixes (extensions) indicating their tessdata +component type (.unicharset for the unicharset, .unicharambigs for unichar +ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h. +Specify option -u to unpack all the components to the specified path: +combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng. +This will create /home/$USER/temp/eng.* files with individual tessdata +components from tessdata/eng.traineddata. + + +OPTIONS +-e .traineddata FILE…: + Extracts the specified components from the .traineddata file +-o .traineddata FILE…: + Overwrites the specified components of the .traineddata file + with those provided on the comand line. 
+-u .traineddata PATHPREFIX + Unpacks the .traineddata using the provided prefix. + + +CAVEATS +Prefix refers to the full file prefix, including period (.) + + +COMPONENTS +The components in a Tesseract lang.traineddata file as of +Tesseract 3.02 are briefly described below; For more information on +many of these files, see +https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract + + + +lang.config + + + + (Optional) Language-specific overrides to default config variables. + + + + + +lang.unicharset + + + + (Required) The list of symbols that Tesseract recognizes, with properties. + See unicharset(5). + + + + + +lang.unicharambigs + + + + (Optional) This file contains information on pairs of recognized symbols + which are often confused. For example, rn and m. + + + + + +lang.inttemp + + + + (Required) Character shape templates for each unichar. Produced by + mftraining(1). + + + + + +lang.pffmtable + + + + (Required) The number of features expected for each unichar. + Produced by mftraining(1) from .tr files. + + + + + +lang.normproto + + + + (Required) Character normalization prototypes generated by cntraining(1) + from .tr files. + + + + + +lang.punc-dawg + + + + (Optional) A dawg made from punctuation patterns found around words. + The "word" part is replaced by a single space. + + + + + +lang.word-dawg + + + + (Optional) A dawg made from dictionary words from the language. + + + + + +lang.number-dawg + + + + (Optional) A dawg made from tokens which originally contained digits. + Each digit is replaced by a space character. + + + + + +lang.freq-dawg + + + + (Optional) A dawg made from the most frequent words which would have + gone into word-dawg. + + + + + +lang.fixed-length-dawgs + + + + (Optional) Several dawgs of different fixed lengths — useful for + languages like Chinese. 
+ + + + + +lang.shapetable + + + + (Optional) When present, a shapetable is an extra layer between the character + classifier and the word recognizer that allows the character classifier to + return a collection of unichar ids and fonts instead of a single unichar-id + and font. + + + + + +lang.bigram-dawg + + + + (Optional) A dawg of word bigrams where the words are separated by a space + and each digit is replaced by a ?. + + + + + +lang.unambig-dawg + + + + (Optional) TODO: Describe. + + + + + +lang.params-training-model + + + + (Optional) TODO: Describe. + + + + + + +HISTORY +combine_tessdata(1) first appeared in version 3.00 of Tesseract + + +SEE ALSO +tesseract(1), wordlist2dawg(1), cntraining(1), mftraining(1), unicharset(5), +unicharambigs(5) + + +COPYING +Copyright (C) 2009, Google Inc. +Licensed under the Apache License, Version 2.0 + + +AUTHOR +The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present). + + diff --git a/doc/dawg2wordlist.1.html b/doc/dawg2wordlist.1.html index b700fe18..0b2645df 100644 --- a/doc/dawg2wordlist.1.html +++ b/doc/dawg2wordlist.1.html @@ -1,802 +1,802 @@ - - - - - -DAWG2WORDLIST(1) - - - - - -
-
-

SYNOPSIS

-
-

dawg2wordlist UNICHARSET DAWG WORDLIST

-
-
-
-

DESCRIPTION

-
-

dawg2wordlist(1) converts a Tesseract Directed Acyclic Word -Graph (DAWG) to a list of words using a unicharset as key.

-
-
-
-

OPTIONS

-
-

UNICHARSET - The unicharset of the language. This is the unicharset - generated by mftraining(1).

-

DAWG - The input DAWG, created by wordlist2dawg(1)

-

WORDLIST - Plain text (output) file in UTF-8, one word per line

-
-
-
-

SEE ALSO

-
-

tesseract(1), mftraining(1), wordlist2dawg(1), unicharset(5), -combine_tessdata(1)

- -
-
-
-

COPYING

-
-

Copyright (C) 2012 Google, Inc. -Licensed under the Apache License, Version 2.0

-
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - + + + + + +DAWG2WORDLIST(1) + + + + + +
+
+

SYNOPSIS

+
+

dawg2wordlist UNICHARSET DAWG WORDLIST

+
+
+
+

DESCRIPTION

+
+

dawg2wordlist(1) converts a Tesseract Directed Acyclic Word +Graph (DAWG) to a list of words using a unicharset as key.

+
+
+
+

OPTIONS

+
+

UNICHARSET + The unicharset of the language. This is the unicharset + generated by mftraining(1).

+

DAWG + The input DAWG, created by wordlist2dawg(1)

+

WORDLIST + Plain text (output) file in UTF-8, one word per line

+
+
+
+

SEE ALSO

+
+

tesseract(1), mftraining(1), wordlist2dawg(1), unicharset(5), +combine_tessdata(1)

+ +
+
+
+

COPYING

+
+

Copyright (C) 2012 Google, Inc. +Licensed under the Apache License, Version 2.0

+
+
+
+

AUTHOR

+
+

The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present).

+
+
+
+

+ + + diff --git a/doc/dawg2wordlist.1.xml b/doc/dawg2wordlist.1.xml index c7311319..ee960ad9 100644 --- a/doc/dawg2wordlist.1.xml +++ b/doc/dawg2wordlist.1.xml @@ -1,53 +1,53 @@ - - - - - - - DAWG2WORDLIST(1) - - -dawg2wordlist -1 -  -  - - - dawg2wordlist - convert a Tesseract DAWG to a wordlist - - -dawg2wordlist UNICHARSET DAWG WORDLIST - - -DESCRIPTION -dawg2wordlist(1) converts a Tesseract Directed Acyclic Word -Graph (DAWG) to a list of words using a unicharset as key. - - -OPTIONS -UNICHARSET - The unicharset of the language. This is the unicharset - generated by mftraining(1). -DAWG - The input DAWG, created by wordlist2dawg(1) -WORDLIST - Plain text (output) file in UTF-8, one word per line - - -SEE ALSO -tesseract(1), mftraining(1), wordlist2dawg(1), unicharset(5), -combine_tessdata(1) -https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract - - -COPYING -Copyright (C) 2012 Google, Inc. -Licensed under the Apache License, Version 2.0 - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). - - + + + + + + + DAWG2WORDLIST(1) + + +dawg2wordlist +1 +  +  + + + dawg2wordlist + convert a Tesseract DAWG to a wordlist + + +dawg2wordlist UNICHARSET DAWG WORDLIST + + +DESCRIPTION +dawg2wordlist(1) converts a Tesseract Directed Acyclic Word +Graph (DAWG) to a list of words using a unicharset as key. + + +OPTIONS +UNICHARSET + The unicharset of the language. This is the unicharset + generated by mftraining(1). +DAWG + The input DAWG, created by wordlist2dawg(1) +WORDLIST + Plain text (output) file in UTF-8, one word per line + + +SEE ALSO +tesseract(1), mftraining(1), wordlist2dawg(1), unicharset(5), +combine_tessdata(1) +https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract + + +COPYING +Copyright (C) 2012 Google, Inc. 
+Licensed under the Apache License, Version 2.0 + + +AUTHOR +The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present). + + diff --git a/doc/mftraining.1.asc b/doc/mftraining.1.asc index 85e1263a..43fe533a 100644 --- a/doc/mftraining.1.asc +++ b/doc/mftraining.1.asc @@ -24,12 +24,12 @@ OPTIONS -F 'font_properties_file':: (Input) font properties file, each line is of the following form, where each field other than the font name is 0 or 1: - + *font_name* *italic* *bold* *fixed_pitch* *serif* *fraktur* -X 'xheights_file':: (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] - + *font_name* *xheight* -D 'dir':: diff --git a/doc/mftraining.1.html b/doc/mftraining.1.html index 4abdfd6a..41a38044 100644 --- a/doc/mftraining.1.html +++ b/doc/mftraining.1.html @@ -1,847 +1,847 @@ - - - - - -MFTRAINING(1) - - - - - -
-
-

SYNOPSIS

-
-

mftraining -U unicharset -O lang.unicharset FILE

-
-
-
-

DESCRIPTION

-
-

mftraining takes a list of .tr files, from which it generates the -files inttemp (the shape prototypes), shapetable, and pffmtable -(the number of expected features for each character). (A fourth file -called Microfeat is also written by this program, but it is not used.)

-
-
-
-

OPTIONS

-
-
-
--U FILE -
-
-

- (Input) The unicharset generated by unicharset_extractor(1) -

-
-
--F font_properties_file -
-
-

- (Input) font properties file, each line is of the following form, where each field other than the font name is 0 or 1: -

-
-
-
*font_name* *italic* *bold* *fixed_pitch* *serif* *fraktur*
-
-
-
--X xheights_file -
-
-

- (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] -

-
-
-
*font_name* *xheight*
-
-
-
--D dir -
-
-

- Directory to write output files to. -

-
-
--O FILE -
-
-

- (Output) The output unicharset that will be given to combine_tessdata(1) -

-
-
-
-
-
-

SEE ALSO

-
-

tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1), -shapeclustering(1), unicharset(5)

- -
-
-
-

COPYING

-
-

Copyright (C) Hewlett-Packard Company, 1988 -Licensed under the Apache License, Version 2.0

-
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - + + + + + +MFTRAINING(1) + + + + + +
+
+

SYNOPSIS

+
+

mftraining -U unicharset -O lang.unicharset FILE

+
+
+
+

DESCRIPTION

+
+

mftraining takes a list of .tr files, from which it generates the +files inttemp (the shape prototypes), shapetable, and pffmtable +(the number of expected features for each character). (A fourth file +called Microfeat is also written by this program, but it is not used.)

+
+
+
+

OPTIONS

+
+
+
+-U FILE +
+
+

+ (Input) The unicharset generated by unicharset_extractor(1) +

+
+
+-F font_properties_file +
+
+

+ (Input) font properties file, each line is of the following form, where each field other than the font name is 0 or 1: +

+
+
+
*font_name* *italic* *bold* *fixed_pitch* *serif* *fraktur*
+
+
+
+-X xheights_file +
+
+

+ (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] +

+
+
+
*font_name* *xheight*
+
+
+
+-D dir +
+
+

+ Directory to write output files to. +

+
+
+-O FILE +
+
+

+ (Output) The output unicharset that will be given to combine_tessdata(1) +

+
+
+
+
+
+

SEE ALSO

+
+

tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1), +shapeclustering(1), unicharset(5)

+ +
+
+
+

COPYING

+
+

Copyright (C) Hewlett-Packard Company, 1988 +Licensed under the Apache License, Version 2.0

+
+
+
+

AUTHOR

+
+

The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present).

+
+
+
+

+ + + diff --git a/doc/mftraining.1.xml b/doc/mftraining.1.xml index 239178a5..10b3c6d2 100644 --- a/doc/mftraining.1.xml +++ b/doc/mftraining.1.xml @@ -1,102 +1,102 @@ - - - - - - - MFTRAINING(1) - - -mftraining -1 -  -  - - - mftraining - feature training for Tesseract - - -mftraining -U unicharset -O lang.unicharset FILE - - -DESCRIPTION -mftraining takes a list of .tr files, from which it generates the -files inttemp (the shape prototypes), shapetable, and pffmtable -(the number of expected features for each character). (A fourth file -called Microfeat is also written by this program, but it is not used.) - - -OPTIONS - - - --U FILE - - - - (Input) The unicharset generated by unicharset_extractor(1) - - - - - --F font_properties_file - - - - (Input) font properties file, each line is of the following form, where each field other than the font name is 0 or 1: - -*font_name* *italic* *bold* *fixed_pitch* *serif* *fraktur* - - - - --X xheights_file - - - - (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] - -*font_name* *xheight* - - - - --D dir - - - - Directory to write output files to. - - - - - --O FILE - - - - (Output) The output unicharset that will be given to combine_tessdata(1) - - - - - - -SEE ALSO -tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1), -shapeclustering(1), unicharset(5) -https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract - - -COPYING -Copyright (C) Hewlett-Packard Company, 1988 -Licensed under the Apache License, Version 2.0 - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). 
- - + + + + + + + MFTRAINING(1) + + +mftraining +1 +  +  + + + mftraining + feature training for Tesseract + + +mftraining -U unicharset -O lang.unicharset FILE + + +DESCRIPTION +mftraining takes a list of .tr files, from which it generates the +files inttemp (the shape prototypes), shapetable, and pffmtable +(the number of expected features for each character). (A fourth file +called Microfeat is also written by this program, but it is not used.) + + +OPTIONS + + + +-U FILE + + + + (Input) The unicharset generated by unicharset_extractor(1) + + + + + +-F font_properties_file + + + + (Input) font properties file, each line is of the following form, where each field other than the font name is 0 or 1: + +*font_name* *italic* *bold* *fixed_pitch* *serif* *fraktur* + + + + +-X xheights_file + + + + (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] + +*font_name* *xheight* + + + + +-D dir + + + + Directory to write output files to. + + + + + +-O FILE + + + + (Output) The output unicharset that will be given to combine_tessdata(1) + + + + + + +SEE ALSO +tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1), +shapeclustering(1), unicharset(5) +https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract + + +COPYING +Copyright (C) Hewlett-Packard Company, 1988 +Licensed under the Apache License, Version 2.0 + + +AUTHOR +The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present). 
+ + diff --git a/doc/shapeclustering.1.asc b/doc/shapeclustering.1.asc index 81ca0dbc..0a1bfb03 100644 --- a/doc/shapeclustering.1.asc +++ b/doc/shapeclustering.1.asc @@ -35,7 +35,7 @@ OPTIONS -X 'xheights_file':: (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] - + 'font_name' 'xheight' -O 'FILE':: diff --git a/doc/shapeclustering.1.html b/doc/shapeclustering.1.html index 845d49a8..5fca944f 100644 --- a/doc/shapeclustering.1.html +++ b/doc/shapeclustering.1.html @@ -1,850 +1,850 @@ - - - - - -SHAPECLUSTERING(1) - - - - - -
-
-

SYNOPSIS

-
-

shapeclustering -D output_dir - -U unicharset -O mfunicharset - -F font_props -X xheights - FILE

-
-
-
-

DESCRIPTION

-
-

shapeclustering(1) takes extracted feature .tr files (generated by -tesseract(1) run in a special mode from box files) and produces a -file shapetable and an enhanced unicharset. This program is still -experimental, and is not required (yet) for training Tesseract.

-
-
-
-

OPTIONS

-
-
-
--U FILE -
-
-

- The unicharset generated by unicharset_extractor(1). -

-
-
--D dir -
-
-

- Directory to write output files to. -

-
-
--F font_properties_file -
-
-

- (Input) font properties file, where each line is of the following form, where each field other than the font name is 0 or 1: -

-
-
-
'font_name' 'italic' 'bold' 'fixed_pitch' 'serif' 'fraktur'
-
-
-
--X xheights_file -
-
-

- (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] -

-
-
-
'font_name' 'xheight'
-
-
-
--O FILE -
-
-

- The output unicharset that will be given to combine_tessdata(1). -

-
-
-
-
-
-

SEE ALSO

-
-

tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1), -unicharset(5)

- -
-
-
-

COPYING

-
-

Copyright (C) Google, 2011 -Licensed under the Apache License, Version 2.0

-
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - + + + + + +SHAPECLUSTERING(1) + + + + + +
+
+

SYNOPSIS

+
+

shapeclustering -D output_dir + -U unicharset -O mfunicharset + -F font_props -X xheights + FILE

+
+
+
+

DESCRIPTION

+
+

shapeclustering(1) takes extracted feature .tr files (generated by +tesseract(1) run in a special mode from box files) and produces a +file shapetable and an enhanced unicharset. This program is still +experimental, and is not required (yet) for training Tesseract.

+
+
+
+

OPTIONS

+
+
+
+-U FILE +
+
+

+ The unicharset generated by unicharset_extractor(1). +

+
+
+-D dir +
+
+

+ Directory to write output files to. +

+
+
+-F font_properties_file +
+
+

+ (Input) font properties file, where each line is of the following form, where each field other than the font name is 0 or 1: +

+
+
+
'font_name' 'italic' 'bold' 'fixed_pitch' 'serif' 'fraktur'
+
+
+
+-X xheights_file +
+
+

+ (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] +

+
+
+
'font_name' 'xheight'
+
+
+
+-O FILE +
+
+

+ The output unicharset that will be given to combine_tessdata(1). +

+
+
+
+
+
+

SEE ALSO

+
+

tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1), +unicharset(5)

+ +
+
+
+

COPYING

+
+

Copyright (C) Google, 2011 +Licensed under the Apache License, Version 2.0

+
+
+
+

AUTHOR

+
+

The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present).

+
+
+
+

+ + + diff --git a/doc/shapeclustering.1.xml b/doc/shapeclustering.1.xml index d02bcf8d..933789ad 100644 --- a/doc/shapeclustering.1.xml +++ b/doc/shapeclustering.1.xml @@ -1,105 +1,105 @@ - - - - - - - SHAPECLUSTERING(1) - - -shapeclustering -1 -  -  - - - shapeclustering - shape clustering training for Tesseract - - -shapeclustering -D output_dir - -U unicharset -O mfunicharset - -F font_props -X xheights - FILE - - -DESCRIPTION -shapeclustering(1) takes extracted feature .tr files (generated by -tesseract(1) run in a special mode from box files) and produces a -file shapetable and an enhanced unicharset. This program is still -experimental, and is not required (yet) for training Tesseract. - - -OPTIONS - - - --U FILE - - - - The unicharset generated by unicharset_extractor(1). - - - - - --D dir - - - - Directory to write output files to. - - - - - --F font_properties_file - - - - (Input) font properties file, where each line is of the following form, where each field other than the font name is 0 or 1: - -'font_name' 'italic' 'bold' 'fixed_pitch' 'serif' 'fraktur' - - - - --X xheights_file - - - - (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] - -'font_name' 'xheight' - - - - --O FILE - - - - The output unicharset that will be given to combine_tessdata(1). - - - - - - -SEE ALSO -tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1), -unicharset(5) -https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract - - -COPYING -Copyright (C) Google, 2011 -Licensed under the Apache License, Version 2.0 - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). 
- - + + + + + + + SHAPECLUSTERING(1) + + +shapeclustering +1 +  +  + + + shapeclustering + shape clustering training for Tesseract + + +shapeclustering -D output_dir + -U unicharset -O mfunicharset + -F font_props -X xheights + FILE + + +DESCRIPTION +shapeclustering(1) takes extracted feature .tr files (generated by +tesseract(1) run in a special mode from box files) and produces a +file shapetable and an enhanced unicharset. This program is still +experimental, and is not required (yet) for training Tesseract. + + +OPTIONS + + + +-U FILE + + + + The unicharset generated by unicharset_extractor(1). + + + + + +-D dir + + + + Directory to write output files to. + + + + + +-F font_properties_file + + + + (Input) font properties file, where each line is of the following form, where each field other than the font name is 0 or 1: + +'font_name' 'italic' 'bold' 'fixed_pitch' 'serif' 'fraktur' + + + + +-X xheights_file + + + + (Input) x heights file, each line is of the following form, where xheight is calculated as the pixel x height of a character drawn at 32pt on 300 dpi. [ That is, if base x height + ascenders + descenders = 133, how much is x height? ] + +'font_name' 'xheight' + + + + +-O FILE + + + + The output unicharset that will be given to combine_tessdata(1). + + + + + + +SEE ALSO +tesseract(1), cntraining(1), unicharset_extractor(1), combine_tessdata(1), +unicharset(5) +https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract + + +COPYING +Copyright (C) Google, 2011 +Licensed under the Apache License, Version 2.0 + + +AUTHOR +The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present). 
+ + diff --git a/doc/tesseract.1 b/doc/tesseract.1 index 95128fec..fdf7cdac 100644 --- a/doc/tesseract.1 +++ b/doc/tesseract.1 @@ -2,12 +2,12 @@ .\" Title: tesseract .\" Author: [see the "AUTHOR" section] .\" Generator: DocBook XSL Stylesheets v1.78.1 -.\" Date: 06/28/2015 +.\" Date: 03/23/2017 .\" Manual: \ \& .\" Source: \ \& .\" Language: English .\" -.TH "TESSERACT" "1" "06/28/2015" "\ \&" "\ \&" +.TH "TESSERACT" "1" "03/23/2017" "\ \&" "\ \&" .\" ----------------------------------------------------------------- .\" * Define some portability stuff .\" ----------------------------------------------------------------- @@ -84,7 +84,7 @@ Set value for control parameter\&. Multiple \-c arguments are allowed\&. The language to use\&. If none is specified, English is assumed\&. Multiple languages may be specified, separated by plus characters\&. Tesseract uses 3\-character ISO 639\-2 language codes\&. (See LANGUAGES) .RE .PP -\fI\-psm N\fR +\fI\-\-psm N\fR .RS 4 Set Tesseract to only run a subset of layout analysis and assume a certain form of image\&. The options for \fBN\fR @@ -111,6 +111,26 @@ are: .\} .RE .PP +\fI\-\-oem N\fR +.RS 4 +Specify OCR Engine mode\&. The options for +\fBN\fR +are: +.sp +.if n \{\ +.RS 4 +.\} +.nf +0 = Original Tesseract only\&. +1 = Neural nets LSTM only\&. +2 = Tesseract + LSTM\&. +3 = Default, based on what is available\&. +.fi +.if n \{\ +.RE +.\} +.RE +.PP \fIconfigfile\fR .RS 4 The name of a config to use\&. A config is a plaintext file which contains a list of variables and their values, one per line, with a space separating variable from value\&. Interesting config files include: @@ -139,22 +159,37 @@ pdf \- Output in pdf instead of a text file\&. .RE .RE .sp -\fBNota Bene:\fR The options \fI\-l lang\fR and \fI\-psm N\fR must occur before any \fIconfigfile\fR\&. +\fBNota Bene:\fR The options \fI\-l lang\fR and \fI\-\-psm N\fR must occur before any \fIconfigfile\fR\&. 
.SH "SINGLE OPTIONS" .PP -\fI\-v\fR +\fI\-h, \-\-help\fR +.RS 4 +Show help message\&. +.RE +.PP +\fI\-\-help\-psm\fR +.RS 4 +Show page segmentation modes\&. +.RE +.PP +\fI\-\-help\-oem\fR +.RS 4 +Show OCR Engine modes\&. +.RE +.PP +\fI\-v, \-\-version\fR .RS 4 Returns the current version of the tesseract(1) executable\&. .RE .PP \fI\-\-list\-langs\fR .RS 4 -list available languages for tesseract engine\&. Can be used with \-\-tessdata\-dir\&. +List available languages for tesseract engine\&. Can be used with \-\-tessdata\-dir\&. .RE .PP \fI\-\-print\-parameters\fR .RS 4 -print tesseract parameters to the stdout\&. +Print tesseract parameters\&. .RE .SH "LANGUAGES" .sp @@ -220,7 +255,7 @@ user_patterns_suffix user\-patterns Now, if you pass the word \fIbazaar\fR as a trailing command line parameter to Tesseract, Tesseract will not bother loading the system dictionary nor the dictionary of frequent words and will load and use the eng\&.user\-words and eng\&.user\-patterns files you provided\&. The former is a simple word list, one per line\&. The format of the latter is documented in dict/trie\&.h on read_pattern_list()\&. .SH "HISTORY" .sp -The engine was developed at Hewlett Packard Laboratories Bristol and at Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more changes made in 1996 to port to Windows, and some C++izing in 1998\&. A lot of the code was written in C, and then some more was written in C++\&. The C\e++ code makes heavy use of a list system using macros\&. This predates stl, was portable before stl, and is more efficient than stl lists, but has the big negative that if you do get a segmentation violation, it is hard to debug\&. +The engine was developed at Hewlett Packard Laboratories Bristol and at Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more changes made in 1996 to port to Windows, and some C++izing in 1998\&. A lot of the code was written in C, and then some more was written in C++\&. 
The C++ code makes heavy use of a list system using macros\&. This predates stl, was portable before stl, and is more efficient than stl lists, but has the big negative that if you do get a segmentation violation, it is hard to debug\&. .sp Version 2\&.00 brought Unicode (UTF\-8) support, six languages, and the ability to train Tesseract\&. .sp diff --git a/doc/tesseract.1.asc b/doc/tesseract.1.asc index d6f34d50..6832ea0c 100644 --- a/doc/tesseract.1.asc +++ b/doc/tesseract.1.asc @@ -54,7 +54,7 @@ OPTIONS Multiple languages may be specified, separated by plus characters. Tesseract uses 3-character ISO 639-2 language codes. (See LANGUAGES) -'-psm N':: +'--psm N':: Set Tesseract to only run a subset of layout analysis and assume a certain form of image. The options for *N* are: @@ -67,9 +67,17 @@ OPTIONS 6 = Assume a single uniform block of text. 7 = Treat the image as a single text line. 8 = Treat the image as a single word. - 9 = Treat the image as a single word in a circle. + 9 = Treat the image as a single word in a circle. 10 = Treat the image as a single character. +'--oem N':: + Specify OCR Engine mode. The options for *N* are: + + 0 = Original Tesseract only. + 1 = Neural nets LSTM only. + 2 = Tesseract + LSTM. + 3 = Default, based on what is available. + 'configfile':: The name of a config to use. A config is a plaintext file which contains a list of variables and their values, one per line, with a @@ -78,20 +86,29 @@ OPTIONS * hocr - Output in hOCR format instead of as a text file. * pdf - Output in pdf instead of a text file. -*Nota Bene:* The options '-l lang' and '-psm N' must occur +*Nota Bene:* The options '-l lang' and '--psm N' must occur before any 'configfile'. SINGLE OPTIONS -------------- -'-v':: +'-h, --help':: + Show help message. + +'--help-psm':: + Show page segmentation modes. + +'--help-oem':: + Show OCR Engine modes. + +'-v, --version':: Returns the current version of the tesseract(1) executable. 
'--list-langs':: - list available languages for tesseract engine. Can be used with --tessdata-dir. + List available languages for tesseract engine. Can be used with --tessdata-dir. '--print-parameters':: - print tesseract parameters to the stdout. + Print tesseract parameters. @@ -264,11 +281,11 @@ on read_pattern_list(). HISTORY ------- -The engine was developed at Hewlett Packard Laboratories Bristol and at -Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more -changes made in 1996 to port to Windows, and some C\+\+izing in 1998. A -lot of the code was written in C, and then some more was written in C\+\+. -The C\++ code makes heavy use of a list system using macros. This predates +The engine was developed at Hewlett Packard Laboratories Bristol and at +Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more +changes made in 1996 to port to Windows, and some C\+\+izing in 1998. A +lot of the code was written in C, and then some more was written in C\+\+. +The C++ code makes heavy use of a list system using macros. This predates stl, was portable before stl, and is more efficient than stl lists, but has the big negative that if you do get a segmentation violation, it is hard to debug. @@ -276,18 +293,18 @@ debug. Version 2.00 brought Unicode (UTF-8) support, six languages, and the ability to train Tesseract. -Tesseract was included in UNLV's Fourth Annual Test of OCR Accuracy. +Tesseract was included in UNLV's Fourth Annual Test of OCR Accuracy. See . With Tesseract 2.00, -scripts are now included to allow anyone to reproduce some of these tests. -See for more +scripts are now included to allow anyone to reproduce some of these tests. +See for more details. -Tesseract 3.00 adds a number of new languages, including Chinese, Japanese, -and Korean. It also introduces a new, single-file based system of managing +Tesseract 3.00 adds a number of new languages, including Chinese, Japanese, +and Korean. 
It also introduces a new, single-file based system of managing language data. -Tesseract 3.02 adds BiDirectional text support, the ability to recognize -multiple languages in a single image, and improved layout analysis. +Tesseract 3.02 adds BiDirectional text support, the ability to recognize +multiple languages in a single image, and improved layout analysis. For further details, see the file ReleaseNotes included with the distribution. diff --git a/doc/tesseract.1.html b/doc/tesseract.1.html index 90c5dae7..d9dbcc0b 100644 --- a/doc/tesseract.1.html +++ b/doc/tesseract.1.html @@ -1,1163 +1,1202 @@ - - - - - -TESSERACT(1) - - - - - -
-
-

SYNOPSIS

-
-

tesseract imagename|stdin outputbase|stdout [options…] [configfile…]

-
-
-
-

DESCRIPTION

-
-

tesseract(1) is a commercial quality OCR engine originally developed at HP -between 1985 and 1995. In 1995, this engine was among the top 3 evaluated by -UNLV. It was open-sourced by HP and UNLV in 2005, and has been developed -at Google since then.

-
-
-
-

IN/OUT ARGUMENTS

-
-
-
-imagename -
-
-

- The name of the input image. Most image file formats (anything - readable by Leptonica) are supported. -

-
-
-stdin -
-
-

- Instruction to read data from standard input -

-
-
-outputbase -
-
-

- The basename of the output file (to which the appropriate extension - will be appended). By default the output will be named outbase.txt. -

-
-
-stdout -
-
-

- Instruction to sent output data to standard output -

-
-
-
-
-
-

OPTIONS

-
-
-
---tessdata-dir /path -
-
-

- Specify the location of tessdata path -

-
-
---user-words /path/to/file -
-
-

- Specify the location of user words file -

-
-
---user-patterns /path/to/file specify -
-
-

- The location of user patterns file -

-
-
--c configvar=value -
-
-

- Set value for control parameter. Multiple -c arguments are allowed. -

-
-
--l lang -
-
-

- The language to use. If none is specified, English is assumed. - Multiple languages may be specified, separated by plus characters. - Tesseract uses 3-character ISO 639-2 language codes. (See LANGUAGES) -

-
-
--psm N -
-
-

- Set Tesseract to only run a subset of layout analysis and assume - a certain form of image. The options for N are: -

-
-
-
0 = Orientation and script detection (OSD) only.
-1 = Automatic page segmentation with OSD.
-2 = Automatic page segmentation, but no OSD, or OCR.
-3 = Fully automatic page segmentation, but no OSD. (Default)
-4 = Assume a single column of text of variable sizes.
-5 = Assume a single uniform block of vertically aligned text.
-6 = Assume a single uniform block of text.
-7 = Treat the image as a single text line.
-8 = Treat the image as a single word.
-9 = Treat the image as a single word in a circle.
-10 = Treat the image as a single character.
-
-
-
-configfile -
-
-

- The name of a config to use. A config is a plaintext file which - contains a list of variables and their values, one per line, with a - space separating variable from value. Interesting config files - include:
-

-
    -
  • -

    -hocr - Output in hOCR format instead of as a text file. -

    -
  • -
  • -

    -pdf - Output in pdf instead of a text file. -

    -
  • -
-
-
-

Nota Bene: The options -l lang and -psm N must occur -before any configfile.

-
-
-
-

SINGLE OPTIONS

-
-
-
--v -
-
-

- Returns the current version of the tesseract(1) executable. -

-
-
---list-langs -
-
-

- list available languages for tesseract engine. Can be used with --tessdata-dir. -

-
-
---print-parameters -
-
-

- print tesseract parameters to the stdout. -

-
-
-
-
-
-

LANGUAGES

-
-

There are currently language packs available for the following languages -(in https://github.com/tesseract-ocr/tessdata):

-

afr (Afrikaans) -amh (Amharic) -ara (Arabic) -asm (Assamese) -aze (Azerbaijani) -aze_cyrl (Azerbaijani - Cyrilic) -bel (Belarusian) -ben (Bengali) -bod (Tibetan) -bos (Bosnian) -bul (Bulgarian) -cat (Catalan; Valencian) -ceb (Cebuano) -ces (Czech) -chi_sim (Chinese - Simplified) -chi_tra (Chinese - Traditional) -chr (Cherokee) -cym (Welsh) -dan (Danish) -dan_frak (Danish - Fraktur) -deu (German) -deu_frak (German - Fraktur) -dzo (Dzongkha) -ell (Greek, Modern (1453-)) -eng (English) -enm (English, Middle (1100-1500)) -epo (Esperanto) -equ (Math / equation detection module) -est (Estonian) -eus (Basque) -fas (Persian) -fin (Finnish) -fra (French) -frk (Frankish) -frm (French, Middle (ca.1400-1600)) -gle (Irish) -glg (Galician) -grc (Greek, Ancient (to 1453)) -guj (Gujarati) -hat (Haitian; Haitian Creole) -heb (Hebrew) -hin (Hindi) -hrv (Croatian) -hun (Hungarian) -iku (Inuktitut) -ind (Indonesian) -isl (Icelandic) -ita (Italian) -ita_old (Italian - Old) -jav (Javanese) -jpn (Japanese) -kan (Kannada) -kat (Georgian) -kat_old (Georgian - Old) -kaz (Kazakh) -khm (Central Khmer) -kir (Kirghiz; Kyrgyz) -kor (Korean) -kur (Kurdish) -lao (Lao) -lat (Latin) -lav (Latvian) -lit (Lithuanian) -mal (Malayalam) -mar (Marathi) -mkd (Macedonian) -mlt (Maltese) -msa (Malay) -mya (Burmese) -nep (Nepali) -nld (Dutch; Flemish) -nor (Norwegian) -ori (Oriya) -osd (Orientation and script detection module) -pan (Panjabi; Punjabi) -pol (Polish) -por (Portuguese) -pus (Pushto; Pashto) -ron (Romanian; Moldavian; Moldovan) -rus (Russian) -san (Sanskrit) -sin (Sinhala; Sinhalese) -slk (Slovak) -slk_frak (Slovak - Fraktur) -slv (Slovenian) -spa (Spanish; Castilian) -spa_old (Spanish; Castilian - Old) -sqi (Albanian) -srp (Serbian) -srp_latn (Serbian - Latin) -swa (Swahili) -swe (Swedish) -syr (Syriac) -tam (Tamil) -tel (Telugu) -tgk (Tajik) -tgl (Tagalog) -tha (Thai) -tir (Tigrinya) -tur (Turkish) -uig (Uighur; Uyghur) -ukr (Ukrainian) -urd (Urdu) -uzb (Uzbek) -uzb_cyrl (Uzbek - Cyrilic) -vie 
(Vietnamese) -yid (Yiddish)

-

To use a non-standard language pack named foo.traineddata, set the -TESSDATA_PREFIX environment variable so the file can be found at -TESSDATA_PREFIX/tessdata/foo.traineddata and give Tesseract the -argument -l foo.

-
-
-
-

CONFIG FILES AND AUGMENTING WITH USER DATA

-
-

Tesseract config files consist of lines with variable-value pairs (space -separated). The variables are documented as flags in the source code like -the following one in tesseractclass.h:

-

STRING_VAR_H(tessedit_char_blacklist, "", - "Blacklist of chars not to recognize");

-

These variables may enable or disable various features of the engine, and -may cause it to load (or not load) various data. For instance, let’s suppose -you want to OCR in English, but suppress the normal dictionary and load an -alternative word list and an alternative list of patterns — these two files -are the most commonly used extra data files.

-

If your language pack is in /path/to/eng.traineddata and the hocr config -is in /path/to/configs/hocr then create three new files:

-

/path/to/eng.user-words:

-
-
the
-quick
-brown
-fox
-jumped
-
-
-

/path/to/eng.user-patterns:

-
-
1-\d\d\d-GOOG-411
-www.\n\\\*.com
-
-
-

/path/to/configs/bazaar:

-
-
load_system_dawg     F
-load_freq_dawg       F
-user_words_suffix    user-words
-user_patterns_suffix user-patterns
-
-
-

Now, if you pass the word bazaar as a trailing command line parameter -to Tesseract, Tesseract will not bother loading the system dictionary nor -the dictionary of frequent words and will load and use the eng.user-words -and eng.user-patterns files you provided. The former is a simple word list, -one per line. The format of the latter is documented in dict/trie.h -on read_pattern_list().

-
-
-
-

HISTORY

-
-

The engine was developed at Hewlett Packard Laboratories Bristol and at -Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more -changes made in 1996 to port to Windows, and some C++izing in 1998. A -lot of the code was written in C, and then some more was written in C++. -The C\++ code makes heavy use of a list system using macros. This predates -stl, was portable before stl, and is more efficient than stl lists, but has -the big negative that if you do get a segmentation violation, it is hard to -debug.

-

Version 2.00 brought Unicode (UTF-8) support, six languages, and the ability -to train Tesseract.

-

Tesseract was included in UNLV’s Fourth Annual Test of OCR Accuracy. -See https://github.com/tesseract-ocr/docs/blob/master/AT-1995.pdf. With Tesseract 2.00, -scripts are now included to allow anyone to reproduce some of these tests. -See https://github.com/tesseract-ocr/tesseract/wiki/TestingTesseract for more -details.

-

Tesseract 3.00 adds a number of new languages, including Chinese, Japanese, -and Korean. It also introduces a new, single-file based system of managing -language data.

-

Tesseract 3.02 adds BiDirectional text support, the ability to recognize -multiple languages in a single image, and improved layout analysis.

-

For further details, see the file ReleaseNotes included with the distribution.

-
-
- -
-

SEE ALSO

-
-

ambiguous_words(1), cntraining(1), combine_tessdata(1), dawg2wordlist(1), -shape_training(1), mftraining(1), unicharambigs(5), unicharset(5), -unicharset_extractor(1), wordlist2dawg(1)

-
-
-
-

AUTHOR

-
-

Tesseract development was led at Hewlett-Packard and Google by Ray Smith. -The development team has included:

-

Ahmad Abdulkader, Chris Newton, Dan Johnson, Dar-Shyang Lee, David Eger, -Eric Wiseblatt, Faisal Shafait, Hiroshi Takenaka, Joe Liu, Joern Wanke, -Mark Seaman, Mickey Namiki, Nicholas Beato, Oded Fuhrmann, Phil Cheatle, -Pingping Xiu, Pong Eksombatchai (Chantat), Ranjith Unnikrishnan, Raquel -Romano, Ray Smith, Rika Antonova, Robert Moss, Samuel Charron, Sheelagh -Lloyd, Shobhit Saxena, and Thomas Kielbus.

-
-
-
-

COPYING

-
-

Licensed under the Apache License, Version 2.0

-
-
-
-

- - - + + + + + +TESSERACT(1) + + + + + +
+
+

SYNOPSIS

+
+

tesseract imagename|stdin outputbase|stdout [options…] [configfile…]

+
+
+
+

DESCRIPTION

+
+

tesseract(1) is a commercial quality OCR engine originally developed at HP +between 1985 and 1995. In 1995, this engine was among the top 3 evaluated by +UNLV. It was open-sourced by HP and UNLV in 2005, and has been developed +at Google since then.

+
+
+
+

IN/OUT ARGUMENTS

+
+
+
+imagename +
+
+

+ The name of the input image. Most image file formats (anything + readable by Leptonica) are supported. +

+
+
+stdin +
+
+

+ Instruction to read data from standard input +

+
+
+outputbase +
+
+

+ The basename of the output file (to which the appropriate extension + will be appended). By default the output will be named outbase.txt. +

+
+
+stdout +
+
+


 + Instruction to send output data to standard output +


+
+
+
+
+
+

OPTIONS

+
+
+
+--tessdata-dir /path +
+
+

+ Specify the location of tessdata path +

+
+
+--user-words /path/to/file +
+
+

+ Specify the location of user words file +

+
+
+--user-patterns /path/to/file specify +
+
+

+ The location of user patterns file +

+
+
+-c configvar=value +
+
+

+ Set value for control parameter. Multiple -c arguments are allowed. +

+
+
+-l lang +
+
+

+ The language to use. If none is specified, English is assumed. + Multiple languages may be specified, separated by plus characters. + Tesseract uses 3-character ISO 639-2 language codes. (See LANGUAGES) +

+
+
+--psm N +
+
+

+ Set Tesseract to only run a subset of layout analysis and assume + a certain form of image. The options for N are: +

+
+
+
0 = Orientation and script detection (OSD) only.
+1 = Automatic page segmentation with OSD.
+2 = Automatic page segmentation, but no OSD, or OCR.
+3 = Fully automatic page segmentation, but no OSD. (Default)
+4 = Assume a single column of text of variable sizes.
+5 = Assume a single uniform block of vertically aligned text.
+6 = Assume a single uniform block of text.
+7 = Treat the image as a single text line.
+8 = Treat the image as a single word.
+9 = Treat the image as a single word in a circle.
+10 = Treat the image as a single character.
+
+
+
+--oem N +
+
+

+ Specify OCR Engine mode. The options for N are: +

+
+
+
0 = Original Tesseract only.
+1 = Neural nets LSTM only.
+2 = Tesseract + LSTM.
+3 = Default, based on what is available.
+
+
+
+configfile +
+
+

+ The name of a config to use. A config is a plaintext file which + contains a list of variables and their values, one per line, with a + space separating variable from value. Interesting config files + include:
+

+
    +
  • +

    +hocr - Output in hOCR format instead of as a text file. +

    +
  • +
  • +

    +pdf - Output in pdf instead of a text file. +

    +
  • +
+
+
+

Nota Bene: The options -l lang and --psm N must occur +before any configfile.

+
+
+
+

SINGLE OPTIONS

+
+
+
+-h, --help +
+
+

+ Show help message. +

+
+
+--help-psm +
+
+

+ Show page segmentation modes. +

+
+
+--help-oem +
+
+

+ Show OCR Engine modes. +

+
+
+-v, --version +
+
+

+ Returns the current version of the tesseract(1) executable. +

+
+
+--list-langs +
+
+

+ List available languages for tesseract engine. Can be used with --tessdata-dir. +

+
+
+--print-parameters +
+
+

+ Print tesseract parameters. +

+
+
+
+
+
+

LANGUAGES

+
+

There are currently language packs available for the following languages +(in https://github.com/tesseract-ocr/tessdata):

+

afr (Afrikaans) +amh (Amharic) +ara (Arabic) +asm (Assamese) +aze (Azerbaijani) +aze_cyrl (Azerbaijani - Cyrilic) +bel (Belarusian) +ben (Bengali) +bod (Tibetan) +bos (Bosnian) +bul (Bulgarian) +cat (Catalan; Valencian) +ceb (Cebuano) +ces (Czech) +chi_sim (Chinese - Simplified) +chi_tra (Chinese - Traditional) +chr (Cherokee) +cym (Welsh) +dan (Danish) +dan_frak (Danish - Fraktur) +deu (German) +deu_frak (German - Fraktur) +dzo (Dzongkha) +ell (Greek, Modern (1453-)) +eng (English) +enm (English, Middle (1100-1500)) +epo (Esperanto) +equ (Math / equation detection module) +est (Estonian) +eus (Basque) +fas (Persian) +fin (Finnish) +fra (French) +frk (Frankish) +frm (French, Middle (ca.1400-1600)) +gle (Irish) +glg (Galician) +grc (Greek, Ancient (to 1453)) +guj (Gujarati) +hat (Haitian; Haitian Creole) +heb (Hebrew) +hin (Hindi) +hrv (Croatian) +hun (Hungarian) +iku (Inuktitut) +ind (Indonesian) +isl (Icelandic) +ita (Italian) +ita_old (Italian - Old) +jav (Javanese) +jpn (Japanese) +kan (Kannada) +kat (Georgian) +kat_old (Georgian - Old) +kaz (Kazakh) +khm (Central Khmer) +kir (Kirghiz; Kyrgyz) +kor (Korean) +kur (Kurdish) +lao (Lao) +lat (Latin) +lav (Latvian) +lit (Lithuanian) +mal (Malayalam) +mar (Marathi) +mkd (Macedonian) +mlt (Maltese) +msa (Malay) +mya (Burmese) +nep (Nepali) +nld (Dutch; Flemish) +nor (Norwegian) +ori (Oriya) +osd (Orientation and script detection module) +pan (Panjabi; Punjabi) +pol (Polish) +por (Portuguese) +pus (Pushto; Pashto) +ron (Romanian; Moldavian; Moldovan) +rus (Russian) +san (Sanskrit) +sin (Sinhala; Sinhalese) +slk (Slovak) +slk_frak (Slovak - Fraktur) +slv (Slovenian) +spa (Spanish; Castilian) +spa_old (Spanish; Castilian - Old) +sqi (Albanian) +srp (Serbian) +srp_latn (Serbian - Latin) +swa (Swahili) +swe (Swedish) +syr (Syriac) +tam (Tamil) +tel (Telugu) +tgk (Tajik) +tgl (Tagalog) +tha (Thai) +tir (Tigrinya) +tur (Turkish) +uig (Uighur; Uyghur) +ukr (Ukrainian) +urd (Urdu) +uzb (Uzbek) +uzb_cyrl (Uzbek - Cyrilic) +vie 
(Vietnamese) +yid (Yiddish)

+

To use a non-standard language pack named foo.traineddata, set the +TESSDATA_PREFIX environment variable so the file can be found at +TESSDATA_PREFIX/tessdata/foo.traineddata and give Tesseract the +argument -l foo.

+
+
+
+

CONFIG FILES AND AUGMENTING WITH USER DATA

+
+

Tesseract config files consist of lines with variable-value pairs (space +separated). The variables are documented as flags in the source code like +the following one in tesseractclass.h:

+

STRING_VAR_H(tessedit_char_blacklist, "", + "Blacklist of chars not to recognize");

+

These variables may enable or disable various features of the engine, and +may cause it to load (or not load) various data. For instance, let’s suppose +you want to OCR in English, but suppress the normal dictionary and load an +alternative word list and an alternative list of patterns — these two files +are the most commonly used extra data files.

+

If your language pack is in /path/to/eng.traineddata and the hocr config +is in /path/to/configs/hocr then create three new files:

+

/path/to/eng.user-words:

+
+
the
+quick
+brown
+fox
+jumped
+
+
+

/path/to/eng.user-patterns:

+
+
1-\d\d\d-GOOG-411
+www.\n\\\*.com
+
+
+

/path/to/configs/bazaar:

+
+
load_system_dawg     F
+load_freq_dawg       F
+user_words_suffix    user-words
+user_patterns_suffix user-patterns
+
+
+

Now, if you pass the word bazaar as a trailing command line parameter +to Tesseract, Tesseract will not bother loading the system dictionary nor +the dictionary of frequent words and will load and use the eng.user-words +and eng.user-patterns files you provided. The former is a simple word list, +one per line. The format of the latter is documented in dict/trie.h +on read_pattern_list().

+
+
+
+

HISTORY

+
+

The engine was developed at Hewlett Packard Laboratories Bristol and at +Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more +changes made in 1996 to port to Windows, and some C++izing in 1998. A +lot of the code was written in C, and then some more was written in C++. +The C++ code makes heavy use of a list system using macros. This predates +stl, was portable before stl, and is more efficient than stl lists, but has +the big negative that if you do get a segmentation violation, it is hard to +debug.

+

Version 2.00 brought Unicode (UTF-8) support, six languages, and the ability +to train Tesseract.

+

Tesseract was included in UNLV’s Fourth Annual Test of OCR Accuracy. +See https://github.com/tesseract-ocr/docs/blob/master/AT-1995.pdf. With Tesseract 2.00, +scripts are now included to allow anyone to reproduce some of these tests. +See https://github.com/tesseract-ocr/tesseract/wiki/TestingTesseract for more +details.

+

Tesseract 3.00 adds a number of new languages, including Chinese, Japanese, +and Korean. It also introduces a new, single-file based system of managing +language data.

+

Tesseract 3.02 adds BiDirectional text support, the ability to recognize +multiple languages in a single image, and improved layout analysis.

+

For further details, see the file ReleaseNotes included with the distribution.

+
+
+ +
+

SEE ALSO

+
+

ambiguous_words(1), cntraining(1), combine_tessdata(1), dawg2wordlist(1), +shape_training(1), mftraining(1), unicharambigs(5), unicharset(5), +unicharset_extractor(1), wordlist2dawg(1)

+
+
+
+

AUTHOR

+
+

Tesseract development was led at Hewlett-Packard and Google by Ray Smith. +The development team has included:

+

Ahmad Abdulkader, Chris Newton, Dan Johnson, Dar-Shyang Lee, David Eger, +Eric Wiseblatt, Faisal Shafait, Hiroshi Takenaka, Joe Liu, Joern Wanke, +Mark Seaman, Mickey Namiki, Nicholas Beato, Oded Fuhrmann, Phil Cheatle, +Pingping Xiu, Pong Eksombatchai (Chantat), Ranjith Unnikrishnan, Raquel +Romano, Ray Smith, Rika Antonova, Robert Moss, Samuel Charron, Sheelagh +Lloyd, Shobhit Saxena, and Thomas Kielbus.

+
+
+
+

COPYING

+
+

Licensed under the Apache License, Version 2.0

+
+
+
+

+ + + diff --git a/doc/tesseract.1.xml b/doc/tesseract.1.xml index 2f971caa..941caa5b 100644 --- a/doc/tesseract.1.xml +++ b/doc/tesseract.1.xml @@ -1,424 +1,468 @@ - - - - - - - TESSERACT(1) - - -tesseract -1 -  -  - - - tesseract - command-line OCR engine - - -tesseract imagename|stdin outputbase|stdout [options…] [configfile…] - - -DESCRIPTION -tesseract(1) is a commercial quality OCR engine originally developed at HP -between 1985 and 1995. In 1995, this engine was among the top 3 evaluated by -UNLV. It was open-sourced by HP and UNLV in 2005, and has been developed -at Google since then. - - -IN/OUT ARGUMENTS - - - -imagename - - - - The name of the input image. Most image file formats (anything - readable by Leptonica) are supported. - - - - - -stdin - - - - Instruction to read data from standard input - - - - - -outputbase - - - - The basename of the output file (to which the appropriate extension - will be appended). By default the output will be named outbase.txt. - - - - - -stdout - - - - Instruction to sent output data to standard output - - - - - - -OPTIONS - - - ---tessdata-dir /path - - - - Specify the location of tessdata path - - - - - ---user-words /path/to/file - - - - Specify the location of user words file - - - - - ---user-patterns /path/to/file specify - - - - The location of user patterns file - - - - - --c configvar=value - - - - Set value for control parameter. Multiple -c arguments are allowed. - - - - - --l lang - - - - The language to use. If none is specified, English is assumed. - Multiple languages may be specified, separated by plus characters. - Tesseract uses 3-character ISO 639-2 language codes. (See LANGUAGES) - - - - - --psm N - - - - Set Tesseract to only run a subset of layout analysis and assume - a certain form of image. The options for N are: - -0 = Orientation and script detection (OSD) only. -1 = Automatic page segmentation with OSD. -2 = Automatic page segmentation, but no OSD, or OCR. 
-3 = Fully automatic page segmentation, but no OSD. (Default) -4 = Assume a single column of text of variable sizes. -5 = Assume a single uniform block of vertically aligned text. -6 = Assume a single uniform block of text. -7 = Treat the image as a single text line. -8 = Treat the image as a single word. -9 = Treat the image as a single word in a circle. -10 = Treat the image as a single character. - - - - -configfile - - - - The name of a config to use. A config is a plaintext file which - contains a list of variables and their values, one per line, with a - space separating variable from value. Interesting config files - include: - - - - -hocr - Output in hOCR format instead of as a text file. - - - - -pdf - Output in pdf instead of a text file. - - - - - - -Nota Bene: The options -l lang and -psm N must occur -before any configfile. - - -SINGLE OPTIONS - - - --v - - - - Returns the current version of the tesseract(1) executable. - - - - - ---list-langs - - - - list available languages for tesseract engine. Can be used with --tessdata-dir. - - - - - ---print-parameters - - - - print tesseract parameters to the stdout. 
- - - - - - -LANGUAGES -There are currently language packs available for the following languages -(in https://github.com/tesseract-ocr/tessdata): -afr (Afrikaans) -amh (Amharic) -ara (Arabic) -asm (Assamese) -aze (Azerbaijani) -aze_cyrl (Azerbaijani - Cyrilic) -bel (Belarusian) -ben (Bengali) -bod (Tibetan) -bos (Bosnian) -bul (Bulgarian) -cat (Catalan; Valencian) -ceb (Cebuano) -ces (Czech) -chi_sim (Chinese - Simplified) -chi_tra (Chinese - Traditional) -chr (Cherokee) -cym (Welsh) -dan (Danish) -dan_frak (Danish - Fraktur) -deu (German) -deu_frak (German - Fraktur) -dzo (Dzongkha) -ell (Greek, Modern (1453-)) -eng (English) -enm (English, Middle (1100-1500)) -epo (Esperanto) -equ (Math / equation detection module) -est (Estonian) -eus (Basque) -fas (Persian) -fin (Finnish) -fra (French) -frk (Frankish) -frm (French, Middle (ca.1400-1600)) -gle (Irish) -glg (Galician) -grc (Greek, Ancient (to 1453)) -guj (Gujarati) -hat (Haitian; Haitian Creole) -heb (Hebrew) -hin (Hindi) -hrv (Croatian) -hun (Hungarian) -iku (Inuktitut) -ind (Indonesian) -isl (Icelandic) -ita (Italian) -ita_old (Italian - Old) -jav (Javanese) -jpn (Japanese) -kan (Kannada) -kat (Georgian) -kat_old (Georgian - Old) -kaz (Kazakh) -khm (Central Khmer) -kir (Kirghiz; Kyrgyz) -kor (Korean) -kur (Kurdish) -lao (Lao) -lat (Latin) -lav (Latvian) -lit (Lithuanian) -mal (Malayalam) -mar (Marathi) -mkd (Macedonian) -mlt (Maltese) -msa (Malay) -mya (Burmese) -nep (Nepali) -nld (Dutch; Flemish) -nor (Norwegian) -ori (Oriya) -osd (Orientation and script detection module) -pan (Panjabi; Punjabi) -pol (Polish) -por (Portuguese) -pus (Pushto; Pashto) -ron (Romanian; Moldavian; Moldovan) -rus (Russian) -san (Sanskrit) -sin (Sinhala; Sinhalese) -slk (Slovak) -slk_frak (Slovak - Fraktur) -slv (Slovenian) -spa (Spanish; Castilian) -spa_old (Spanish; Castilian - Old) -sqi (Albanian) -srp (Serbian) -srp_latn (Serbian - Latin) -swa (Swahili) -swe (Swedish) -syr (Syriac) -tam (Tamil) -tel (Telugu) -tgk (Tajik) -tgl 
(Tagalog) -tha (Thai) -tir (Tigrinya) -tur (Turkish) -uig (Uighur; Uyghur) -ukr (Ukrainian) -urd (Urdu) -uzb (Uzbek) -uzb_cyrl (Uzbek - Cyrilic) -vie (Vietnamese) -yid (Yiddish) -To use a non-standard language pack named foo.traineddata, set the -TESSDATA_PREFIX environment variable so the file can be found at -TESSDATA_PREFIX/tessdata/foo.traineddata and give Tesseract the -argument -l foo. - - -CONFIG FILES AND AUGMENTING WITH USER DATA -Tesseract config files consist of lines with variable-value pairs (space -separated). The variables are documented as flags in the source code like -the following one in tesseractclass.h: -STRING_VAR_H(tessedit_char_blacklist, "", - "Blacklist of chars not to recognize"); -These variables may enable or disable various features of the engine, and -may cause it to load (or not load) various data. For instance, let’s suppose -you want to OCR in English, but suppress the normal dictionary and load an -alternative word list and an alternative list of patterns — these two files -are the most commonly used extra data files. -If your language pack is in /path/to/eng.traineddata and the hocr config -is in /path/to/configs/hocr then create three new files: -/path/to/eng.user-words: -
-the -quick -brown -fox -jumped -
-/path/to/eng.user-patterns: -
-1-\d\d\d-GOOG-411 -www.\n\\\*.com -
-/path/to/configs/bazaar: -
-load_system_dawg F -load_freq_dawg F -user_words_suffix user-words -user_patterns_suffix user-patterns -
-Now, if you pass the word bazaar as a trailing command line parameter -to Tesseract, Tesseract will not bother loading the system dictionary nor -the dictionary of frequent words and will load and use the eng.user-words -and eng.user-patterns files you provided. The former is a simple word list, -one per line. The format of the latter is documented in dict/trie.h -on read_pattern_list(). -
- -HISTORY -The engine was developed at Hewlett Packard Laboratories Bristol and at -Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more -changes made in 1996 to port to Windows, and some C++izing in 1998. A -lot of the code was written in C, and then some more was written in C++. -The C\++ code makes heavy use of a list system using macros. This predates -stl, was portable before stl, and is more efficient than stl lists, but has -the big negative that if you do get a segmentation violation, it is hard to -debug. -Version 2.00 brought Unicode (UTF-8) support, six languages, and the ability -to train Tesseract. -Tesseract was included in UNLV’s Fourth Annual Test of OCR Accuracy. -See https://github.com/tesseract-ocr/docs/blob/master/AT-1995.pdf. With Tesseract 2.00, -scripts are now included to allow anyone to reproduce some of these tests. -See https://github.com/tesseract-ocr/tesseract/wiki/TestingTesseract for more -details. -Tesseract 3.00 adds a number of new languages, including Chinese, Japanese, -and Korean. It also introduces a new, single-file based system of managing -language data. -Tesseract 3.02 adds BiDirectional text support, the ability to recognize -multiple languages in a single image, and improved layout analysis. -For further details, see the file ReleaseNotes included with the distribution. - - -RESOURCES -Main web site: https://github.com/tesseract-ocr -Information on training: https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract - - -SEE ALSO -ambiguous_words(1), cntraining(1), combine_tessdata(1), dawg2wordlist(1), -shape_training(1), mftraining(1), unicharambigs(5), unicharset(5), -unicharset_extractor(1), wordlist2dawg(1) - - -AUTHOR -Tesseract development was led at Hewlett-Packard and Google by Ray Smith. 
-The development team has included: -Ahmad Abdulkader, Chris Newton, Dan Johnson, Dar-Shyang Lee, David Eger, -Eric Wiseblatt, Faisal Shafait, Hiroshi Takenaka, Joe Liu, Joern Wanke, -Mark Seaman, Mickey Namiki, Nicholas Beato, Oded Fuhrmann, Phil Cheatle, -Pingping Xiu, Pong Eksombatchai (Chantat), Ranjith Unnikrishnan, Raquel -Romano, Ray Smith, Rika Antonova, Robert Moss, Samuel Charron, Sheelagh -Lloyd, Shobhit Saxena, and Thomas Kielbus. - - -COPYING -Licensed under the Apache License, Version 2.0 - -
+ + + + + + + TESSERACT(1) + + +tesseract +1 +  +  + + + tesseract + command-line OCR engine + + +tesseract imagename|stdin outputbase|stdout [options…] [configfile…] + + +DESCRIPTION +tesseract(1) is a commercial quality OCR engine originally developed at HP +between 1985 and 1995. In 1995, this engine was among the top 3 evaluated by +UNLV. It was open-sourced by HP and UNLV in 2005, and has been developed +at Google since then. + + +IN/OUT ARGUMENTS + + + +imagename + + + + The name of the input image. Most image file formats (anything + readable by Leptonica) are supported. + + + + + +stdin + + + + Instruction to read data from standard input + + + + + +outputbase + + + + The basename of the output file (to which the appropriate extension + will be appended). By default the output will be named outbase.txt. + + + + + +stdout + + + + Instruction to sent output data to standard output + + + + + + +OPTIONS + + + +--tessdata-dir /path + + + + Specify the location of tessdata path + + + + + +--user-words /path/to/file + + + + Specify the location of user words file + + + + + +--user-patterns /path/to/file specify + + + + The location of user patterns file + + + + + +-c configvar=value + + + + Set value for control parameter. Multiple -c arguments are allowed. + + + + + +-l lang + + + + The language to use. If none is specified, English is assumed. + Multiple languages may be specified, separated by plus characters. + Tesseract uses 3-character ISO 639-2 language codes. (See LANGUAGES) + + + + + +--psm N + + + + Set Tesseract to only run a subset of layout analysis and assume + a certain form of image. The options for N are: + +0 = Orientation and script detection (OSD) only. +1 = Automatic page segmentation with OSD. +2 = Automatic page segmentation, but no OSD, or OCR. +3 = Fully automatic page segmentation, but no OSD. (Default) +4 = Assume a single column of text of variable sizes. +5 = Assume a single uniform block of vertically aligned text. 
+6 = Assume a single uniform block of text. +7 = Treat the image as a single text line. +8 = Treat the image as a single word. +9 = Treat the image as a single word in a circle. +10 = Treat the image as a single character. + + + + +--oem N + + + + Specify OCR Engine mode. The options for N are: + +0 = Original Tesseract only. +1 = Neural nets LSTM only. +2 = Tesseract + LSTM. +3 = Default, based on what is available. + + + + +configfile + + + + The name of a config to use. A config is a plaintext file which + contains a list of variables and their values, one per line, with a + space separating variable from value. Interesting config files + include: + + + + +hocr - Output in hOCR format instead of as a text file. + + + + +pdf - Output in pdf instead of a text file. + + + + + + +Nota Bene: The options -l lang and --psm N must occur +before any configfile. + + +SINGLE OPTIONS + + + +-h, --help + + + + Show help message. + + + + + +--help-psm + + + + Show page segmentation modes. + + + + + +--help-oem + + + + Show OCR Engine modes. + + + + + +-v, --version + + + + Returns the current version of the tesseract(1) executable. + + + + + +--list-langs + + + + List available languages for tesseract engine. Can be used with --tessdata-dir. + + + + + +--print-parameters + + + + Print tesseract parameters. 
+ + + + + + +LANGUAGES +There are currently language packs available for the following languages +(in https://github.com/tesseract-ocr/tessdata): +afr (Afrikaans) +amh (Amharic) +ara (Arabic) +asm (Assamese) +aze (Azerbaijani) +aze_cyrl (Azerbaijani - Cyrilic) +bel (Belarusian) +ben (Bengali) +bod (Tibetan) +bos (Bosnian) +bul (Bulgarian) +cat (Catalan; Valencian) +ceb (Cebuano) +ces (Czech) +chi_sim (Chinese - Simplified) +chi_tra (Chinese - Traditional) +chr (Cherokee) +cym (Welsh) +dan (Danish) +dan_frak (Danish - Fraktur) +deu (German) +deu_frak (German - Fraktur) +dzo (Dzongkha) +ell (Greek, Modern (1453-)) +eng (English) +enm (English, Middle (1100-1500)) +epo (Esperanto) +equ (Math / equation detection module) +est (Estonian) +eus (Basque) +fas (Persian) +fin (Finnish) +fra (French) +frk (Frankish) +frm (French, Middle (ca.1400-1600)) +gle (Irish) +glg (Galician) +grc (Greek, Ancient (to 1453)) +guj (Gujarati) +hat (Haitian; Haitian Creole) +heb (Hebrew) +hin (Hindi) +hrv (Croatian) +hun (Hungarian) +iku (Inuktitut) +ind (Indonesian) +isl (Icelandic) +ita (Italian) +ita_old (Italian - Old) +jav (Javanese) +jpn (Japanese) +kan (Kannada) +kat (Georgian) +kat_old (Georgian - Old) +kaz (Kazakh) +khm (Central Khmer) +kir (Kirghiz; Kyrgyz) +kor (Korean) +kur (Kurdish) +lao (Lao) +lat (Latin) +lav (Latvian) +lit (Lithuanian) +mal (Malayalam) +mar (Marathi) +mkd (Macedonian) +mlt (Maltese) +msa (Malay) +mya (Burmese) +nep (Nepali) +nld (Dutch; Flemish) +nor (Norwegian) +ori (Oriya) +osd (Orientation and script detection module) +pan (Panjabi; Punjabi) +pol (Polish) +por (Portuguese) +pus (Pushto; Pashto) +ron (Romanian; Moldavian; Moldovan) +rus (Russian) +san (Sanskrit) +sin (Sinhala; Sinhalese) +slk (Slovak) +slk_frak (Slovak - Fraktur) +slv (Slovenian) +spa (Spanish; Castilian) +spa_old (Spanish; Castilian - Old) +sqi (Albanian) +srp (Serbian) +srp_latn (Serbian - Latin) +swa (Swahili) +swe (Swedish) +syr (Syriac) +tam (Tamil) +tel (Telugu) +tgk (Tajik) +tgl 
(Tagalog) +tha (Thai) +tir (Tigrinya) +tur (Turkish) +uig (Uighur; Uyghur) +ukr (Ukrainian) +urd (Urdu) +uzb (Uzbek) +uzb_cyrl (Uzbek - Cyrilic) +vie (Vietnamese) +yid (Yiddish) +To use a non-standard language pack named foo.traineddata, set the +TESSDATA_PREFIX environment variable so the file can be found at +TESSDATA_PREFIX/tessdata/foo.traineddata and give Tesseract the +argument -l foo. + + +CONFIG FILES AND AUGMENTING WITH USER DATA +Tesseract config files consist of lines with variable-value pairs (space +separated). The variables are documented as flags in the source code like +the following one in tesseractclass.h: +STRING_VAR_H(tessedit_char_blacklist, "", + "Blacklist of chars not to recognize"); +These variables may enable or disable various features of the engine, and +may cause it to load (or not load) various data. For instance, let’s suppose +you want to OCR in English, but suppress the normal dictionary and load an +alternative word list and an alternative list of patterns — these two files +are the most commonly used extra data files. +If your language pack is in /path/to/eng.traineddata and the hocr config +is in /path/to/configs/hocr then create three new files: +/path/to/eng.user-words: +
+the +quick +brown +fox +jumped +
+/path/to/eng.user-patterns: +
+1-\d\d\d-GOOG-411 +www.\n\\\*.com +
+/path/to/configs/bazaar: +
+load_system_dawg F +load_freq_dawg F +user_words_suffix user-words +user_patterns_suffix user-patterns +
+Now, if you pass the word bazaar as a trailing command line parameter +to Tesseract, Tesseract will not bother loading the system dictionary nor +the dictionary of frequent words and will load and use the eng.user-words +and eng.user-patterns files you provided. The former is a simple word list, +one per line. The format of the latter is documented in dict/trie.h +on read_pattern_list(). +
+ +HISTORY +The engine was developed at Hewlett Packard Laboratories Bristol and at +Hewlett Packard Co, Greeley Colorado between 1985 and 1994, with some more +changes made in 1996 to port to Windows, and some C++izing in 1998. A +lot of the code was written in C, and then some more was written in C++. +The C++ code makes heavy use of a list system using macros. This predates +stl, was portable before stl, and is more efficient than stl lists, but has +the big negative that if you do get a segmentation violation, it is hard to +debug. +Version 2.00 brought Unicode (UTF-8) support, six languages, and the ability +to train Tesseract. +Tesseract was included in UNLV’s Fourth Annual Test of OCR Accuracy. +See https://github.com/tesseract-ocr/docs/blob/master/AT-1995.pdf. With Tesseract 2.00, +scripts are now included to allow anyone to reproduce some of these tests. +See https://github.com/tesseract-ocr/tesseract/wiki/TestingTesseract for more +details. +Tesseract 3.00 adds a number of new languages, including Chinese, Japanese, +and Korean. It also introduces a new, single-file based system of managing +language data. +Tesseract 3.02 adds BiDirectional text support, the ability to recognize +multiple languages in a single image, and improved layout analysis. +For further details, see the file ReleaseNotes included with the distribution. + + +RESOURCES +Main web site: https://github.com/tesseract-ocr +Information on training: https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract + + +SEE ALSO +ambiguous_words(1), cntraining(1), combine_tessdata(1), dawg2wordlist(1), +shape_training(1), mftraining(1), unicharambigs(5), unicharset(5), +unicharset_extractor(1), wordlist2dawg(1) + + +AUTHOR +Tesseract development was led at Hewlett-Packard and Google by Ray Smith. 
+The development team has included: +Ahmad Abdulkader, Chris Newton, Dan Johnson, Dar-Shyang Lee, David Eger, +Eric Wiseblatt, Faisal Shafait, Hiroshi Takenaka, Joe Liu, Joern Wanke, +Mark Seaman, Mickey Namiki, Nicholas Beato, Oded Fuhrmann, Phil Cheatle, +Pingping Xiu, Pong Eksombatchai (Chantat), Ranjith Unnikrishnan, Raquel +Romano, Ray Smith, Rika Antonova, Robert Moss, Samuel Charron, Sheelagh +Lloyd, Shobhit Saxena, and Thomas Kielbus. + + +COPYING +Licensed under the Apache License, Version 2.0 + +
diff --git a/doc/unicharambigs.5.asc b/doc/unicharambigs.5.asc index 7ce25e44..079f6d53 100644 --- a/doc/unicharambigs.5.asc +++ b/doc/unicharambigs.5.asc @@ -38,7 +38,7 @@ EXAMPLE 3 i i i 1 m 0 ............................... -In this example, all instances of the '2' character sequence '''' will +In this example, all instances of the '2' character sequence '''' will *always* be replaced by the '1' character sequence '"'; a '1' character sequence 'm' *may* be replaced by the '2' character sequence 'rn', and the '3' character sequence *may* be replaced by the '1' character diff --git a/doc/unicharambigs.5.html b/doc/unicharambigs.5.html index c6a645e6..bb9fb291 100644 --- a/doc/unicharambigs.5.html +++ b/doc/unicharambigs.5.html @@ -1,875 +1,875 @@ - - - - - -UNICHARAMBIGS(5) - - - - - -
-
-

DESCRIPTION

-
-

The unicharambigs file (a component of traineddata, see combine_tessdata(1) ) -is used by Tesseract to represent possible ambiguities between characters, -or groups of characters.

-

The file contains a number of lines, laid out as follow:

-
-
-
[num] <TAB> [char(s)] <TAB> [num] <TAB> [char(s)] <TAB> [num]
-
-
- - - - - - - - - - - - - - - - - - - - -
-Field one -
-
-

-the number of characters contained in field two -

-
-Field two -
-
-

-the character sequence to be replaced -

-
-Field three -
-
-

-the number of characters contained in field four -

-
-Field four -
-
-

-the character sequence used to replace field two -

-
-Field five -
-
-

-contains either 1 or 0. 1 denotes a mandatory -replacement, 0 denotes an optional replacement. -

-
-

Characters appearing in fields two and four should appear in -unicharset. The numbers in fields one and three refer to the -number of unichars (not bytes).

-
-
-
-

EXAMPLE

-
-
-
-
2       ' '     1       "     1
-1       m       2       r n   0
-3       i i i   1       m     0
-
-

In this example, all instances of the 2 character sequence '' will -always be replaced by the 1 character sequence "; a 1 character -sequence m may be replaced by the 2 character sequence rn, and -the 3 character sequence may be replaced by the 1 character -sequence m.

-
-
-
-

HISTORY

-
-

The unicharambigs file first appeared in Tesseract 3.00; prior to that, a -similar format, called DangAmbigs (dangerous ambiguities) was used: the -format was almost identical, except only mandatory replacements could be -specified, and field 5 was absent.

-
-
-
-

BUGS

-
-

This is a documentation "bug": it’s not currently clear what should be done -in the case of ligatures (such as fi) which may also appear as regular -letters in the unicharset.

-
-
-
-

SEE ALSO

-
-

tesseract(1), unicharset(5)

-
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - + + + + + +UNICHARAMBIGS(5) + + + + + +
+
+

DESCRIPTION

+
+

The unicharambigs file (a component of traineddata, see combine_tessdata(1) ) +is used by Tesseract to represent possible ambiguities between characters, +or groups of characters.

+

The file contains a number of lines, laid out as follow:

+
+
+
[num] <TAB> [char(s)] <TAB> [num] <TAB> [char(s)] <TAB> [num]
+
+
+ + + + + + + + + + + + + + + + + + + + +
+Field one +
+
+

+the number of characters contained in field two +

+
+Field two +
+
+

+the character sequence to be replaced +

+
+Field three +
+
+

+the number of characters contained in field four +

+
+Field four +
+
+

+the character sequence used to replace field two +

+
+Field five +
+
+

+contains either 1 or 0. 1 denotes a mandatory +replacement, 0 denotes an optional replacement. +

+
+

Characters appearing in fields two and four should appear in +unicharset. The numbers in fields one and three refer to the +number of unichars (not bytes).

+
+
+
+

EXAMPLE

+
+
+
+
2       ' '     1       "     1
+1       m       2       r n   0
+3       i i i   1       m     0
+
+

In this example, all instances of the 2 character sequence '' will +always be replaced by the 1 character sequence "; a 1 character +sequence m may be replaced by the 2 character sequence rn, and +the 3 character sequence may be replaced by the 1 character +sequence m.

+
+
+
+

HISTORY

+
+

The unicharambigs file first appeared in Tesseract 3.00; prior to that, a +similar format, called DangAmbigs (dangerous ambiguities) was used: the +format was almost identical, except only mandatory replacements could be +specified, and field 5 was absent.

+
+
+
+

BUGS

+
+

This is a documentation "bug": it’s not currently clear what should be done +in the case of ligatures (such as fi) which may also appear as regular +letters in the unicharset.

+
+
+
+

SEE ALSO

+
+

tesseract(1), unicharset(5)

+
+
+
+

AUTHOR

+
+

The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present).

+
+
+
+

+ + + diff --git a/doc/unicharambigs.5.xml b/doc/unicharambigs.5.xml index 75b3c664..cbc0f50e 100644 --- a/doc/unicharambigs.5.xml +++ b/doc/unicharambigs.5.xml @@ -1,126 +1,126 @@ - - - - - - - UNICHARAMBIGS(5) - - -unicharambigs -5 -  -  - - - unicharambigs - Tesseract unicharset ambiguities - - -DESCRIPTION -The unicharambigs file (a component of traineddata, see combine_tessdata(1) ) -is used by Tesseract to represent possible ambiguities between characters, -or groups of characters. -The file contains a number of lines, laid out as follow: -[num] <TAB> [char(s)] <TAB> [num] <TAB> [char(s)] <TAB> [num] - - - - -Field one - - - - -the number of characters contained in field two - - - - - - -Field two - - - - -the character sequence to be replaced - - - - - - -Field three - - - - -the number of characters contained in field four - - - - - - -Field four - - - - -the character sequence used to replace field two - - - - - - -Field five - - - - -contains either 1 or 0. 1 denotes a mandatory -replacement, 0 denotes an optional replacement. - - - - -Characters appearing in fields two and four should appear in -unicharset. The numbers in fields one and three refer to the -number of unichars (not bytes). - - -EXAMPLE -2 ' ' 1 " 1 -1 m 2 r n 0 -3 i i i 1 m 0 -In this example, all instances of the 2 character sequence '' will -always be replaced by the 1 character sequence "; a 1 character -sequence m may be replaced by the 2 character sequence rn, and -the 3 character sequence may be replaced by the 1 character -sequence m. - - -HISTORY -The unicharambigs file first appeared in Tesseract 3.00; prior to that, a -similar format, called DangAmbigs (dangerous ambiguities) was used: the -format was almost identical, except only mandatory replacements could be -specified, and field 5 was absent. 
- - -BUGS -This is a documentation "bug": it’s not currently clear what should be done -in the case of ligatures (such as fi) which may also appear as regular -letters in the unicharset. - - -SEE ALSO -tesseract(1), unicharset(5) - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). - - + + + + + + + UNICHARAMBIGS(5) + + +unicharambigs +5 +  +  + + + unicharambigs + Tesseract unicharset ambiguities + + +DESCRIPTION +The unicharambigs file (a component of traineddata, see combine_tessdata(1) ) +is used by Tesseract to represent possible ambiguities between characters, +or groups of characters. +The file contains a number of lines, laid out as follow: +[num] <TAB> [char(s)] <TAB> [num] <TAB> [char(s)] <TAB> [num] + + + + +Field one + + + + +the number of characters contained in field two + + + + + + +Field two + + + + +the character sequence to be replaced + + + + + + +Field three + + + + +the number of characters contained in field four + + + + + + +Field four + + + + +the character sequence used to replace field two + + + + + + +Field five + + + + +contains either 1 or 0. 1 denotes a mandatory +replacement, 0 denotes an optional replacement. + + + + +Characters appearing in fields two and four should appear in +unicharset. The numbers in fields one and three refer to the +number of unichars (not bytes). + + +EXAMPLE +2 ' ' 1 " 1 +1 m 2 r n 0 +3 i i i 1 m 0 +In this example, all instances of the 2 character sequence '' will +always be replaced by the 1 character sequence "; a 1 character +sequence m may be replaced by the 2 character sequence rn, and +the 3 character sequence may be replaced by the 1 character +sequence m. 
+ + +HISTORY +The unicharambigs file first appeared in Tesseract 3.00; prior to that, a +similar format, called DangAmbigs (dangerous ambiguities) was used: the +format was almost identical, except only mandatory replacements could be +specified, and field 5 was absent. + + +BUGS +This is a documentation "bug": it’s not currently clear what should be done +in the case of ligatures (such as fi) which may also appear as regular +letters in the unicharset. + + +SEE ALSO +tesseract(1), unicharset(5) + + +AUTHOR +The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present). + + diff --git a/doc/unicharset.5.html b/doc/unicharset.5.html index 0f16c9e5..f3c3e7a9 100644 --- a/doc/unicharset.5.html +++ b/doc/unicharset.5.html @@ -1,965 +1,965 @@ - - - - - -UNICHARSET(5) - - - - - -
-
-

DESCRIPTION

-
-

Tesseract’s unicharset file contains information on each symbol -(unichar) the Tesseract OCR engine is trained to recognize.

-

A unicharset file (i.e. eng.unicharset) is distributed as part of a -Tesseract language pack (i.e. eng.traineddata). For information on -extracting the unicharset file, see combine_tessdata(1).

-

The first line of a unicharset file contains the number of unichars in -the file. After this line, each subsequent line provides information for -a single unichar. The first such line contains a placeholder reserved for -the space character. Each unichar is referred to within Tesseract by its -Unichar ID, which is the line number (minus 1) within the unicharset file. -Therefore, space gets unichar 0.

-

Each unichar line in the unicharset file (v2+) may have four space-separated fields:

-
-
-
'character' 'properties' 'script' 'id'
-
-

Starting with Tesseract v3.02, more information may be given for each unichar:

-
-
-
'character' 'properties' 'glyph_metrics' 'script' 'other_case' 'direction' 'mirror' 'normed_form'
-
-

Entries:

-
-
-character -
-
-

-The UTF-8 encoded string to be produced for this unichar. -

-
-
-properties -
-
-

-An integer mask of character properties, one per bit. - From least to most significant bit, these are: isalpha, islower, isupper, - isdigit, ispunctuation. -

-
-
-glyph_metrics -
-
-

-Ten comma-separated integers representing various standards - for where this glyph is to be found within a baseline-normalized coordinate - system where 128 is normalized to x-height. -

-
    -
  • -

    -min_bottom, max_bottom: the ranges where the bottom of the character can - be found. -

    -
  • -
  • -

    -min_top, max_top: the ranges where the top of the character may be found. -

    -
  • -
  • -

    -min_width, max_width: horizontal width of the character. -

    -
  • -
  • -

    -min_bearing, max_bearing: how far from the usual start position does the - leftmost part of the character begin. -

    -
  • -
  • -

    -min_advance, max_advance: how far from the printer’s cell left do we - advance to begin the next character. -

    -
  • -
-
-
-script -
-
-

-Name of the script (Latin, Common, Greek, Cyrillic, Han, null). -

-
-
-other_case -
-
-

-The Unichar ID of the other case version of this character - (upper or lower). -

-
-
-direction -
-
-

-The Unicode BiDi direction of this character, as defined by - ICU’s enum UCharDirection. (0 = Left to Right, 1 = Right to Left, - 2 = European Number…) -

-
-
-mirror -
-
-

-The Unichar ID of the BiDirectional mirror of this character. - For example the mirror of open paren is close paren, but Latin Capital C - has no mirror, so it remains a Latin Capital C. -

-
-
-normed_form -
-
-

-The UTF-8 representation of a "normalized form" of this unichar - for the purpose of blaming a module for errors given ground truth text. - For instance, a left or right single quote may normalize to an ASCII quote. -

-
-
-
-
-
-

EXAMPLE (v2)

-
-
-
-
; 10 Common 46
-b 3 Latin 59
-W 5 Latin 40
-7 8 Common 66
-= 0 Common 93
-
-

";" is a punctuation character. Its properties are thus represented by the -binary number 10000 (10 in hexadecimal).

-

"b" is an alphabetic character and a lower case character. Its properties are -thus represented by the binary number 00011 (3 in hexadecimal).

-

"W" is an alphabetic character and an upper case character. Its properties are -thus represented by the binary number 00101 (5 in hexadecimal).

-

"7" is just a digit. Its properties are thus represented by the binary number -01000 (8 in hexadecimal).

-

"=" is not punctuation nor a digit nor an alphabetic character. Its properties -are thus represented by the binary number 00000 (0 in hexadecimal).

-

Japanese or Chinese alphabetic character properties are represented by the -binary number 00001 (1 in hexadecimal): they are alphabetic, but neither -upper nor lower case.

-
-
-
-

EXAMPLE (v3.02)

-
-
-
-
110
-NULL 0 NULL 0
-N 5 59,68,216,255,87,236,0,27,104,227 Latin 11 0 1 N
-Y 5 59,68,216,255,91,205,0,47,91,223 Latin 33 0 2 Y
-1 8 59,69,203,255,45,128,0,66,74,173 Common 3 2 3 1
-9 8 18,66,203,255,89,156,0,39,104,173 Common 4 2 4 9
-a 3 58,65,186,198,85,164,0,26,97,185 Latin 56 0 5 a
-. . .
-
-
-
-
-

CAVEATS

-
-

Although the unicharset reader maintains the ability to read unicharsets -of older formats and will assign default values to missing fields, -the accuracy will be degraded.

-

Further, most other data files are indexed by the unicharset file, -so changing it without re-generating the others is likely to have dire -consequences.

-
-
-
-

HISTORY

-
-

The unicharset format first appeared with Tesseract 2.00, which was the -first version to support languages other than English. The unicharset file -contained only the first two fields, and the "ispunctuation" property was -absent (punctuation was regarded as "0", as "=" is in the above example.

-
-
-
-

SEE ALSO

-
-

tesseract(1), combine_tessdata(1), unicharset_extractor(1)

- -
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - + + + + + +UNICHARSET(5) + + + + + +
+
+

DESCRIPTION

+
+

Tesseract’s unicharset file contains information on each symbol +(unichar) the Tesseract OCR engine is trained to recognize.

+

A unicharset file (i.e. eng.unicharset) is distributed as part of a +Tesseract language pack (i.e. eng.traineddata). For information on +extracting the unicharset file, see combine_tessdata(1).

+

The first line of a unicharset file contains the number of unichars in +the file. After this line, each subsequent line provides information for +a single unichar. The first such line contains a placeholder reserved for +the space character. Each unichar is referred to within Tesseract by its +Unichar ID, which is the line number (minus 1) within the unicharset file. +Therefore, space gets unichar 0.

+

Each unichar line in the unicharset file (v2+) may have four space-separated fields:

+
+
+
'character' 'properties' 'script' 'id'
+
+

Starting with Tesseract v3.02, more information may be given for each unichar:

+
+
+
'character' 'properties' 'glyph_metrics' 'script' 'other_case' 'direction' 'mirror' 'normed_form'
+
+

Entries:

+
+
+character +
+
+

+The UTF-8 encoded string to be produced for this unichar. +

+
+
+properties +
+
+

+An integer mask of character properties, one per bit. + From least to most significant bit, these are: isalpha, islower, isupper, + isdigit, ispunctuation. +

+
+
+glyph_metrics +
+
+

+Ten comma-separated integers representing various standards + for where this glyph is to be found within a baseline-normalized coordinate + system where 128 is normalized to x-height. +

+
    +
  • +

    +min_bottom, max_bottom: the ranges where the bottom of the character can + be found. +

    +
  • +
  • +

    +min_top, max_top: the ranges where the top of the character may be found. +

    +
  • +
  • +

    +min_width, max_width: horizontal width of the character. +

    +
  • +
  • +

    +min_bearing, max_bearing: how far from the usual start position does the + leftmost part of the character begin. +

    +
  • +
  • +

    +min_advance, max_advance: how far from the printer’s cell left do we + advance to begin the next character. +

    +
  • +
+
+
+script +
+
+

+Name of the script (Latin, Common, Greek, Cyrillic, Han, null). +

+
+
+other_case +
+
+

+The Unichar ID of the other case version of this character + (upper or lower). +

+
+
+direction +
+
+

+The Unicode BiDi direction of this character, as defined by + ICU’s enum UCharDirection. (0 = Left to Right, 1 = Right to Left, + 2 = European Number…) +

+
+
+mirror +
+
+

+The Unichar ID of the BiDirectional mirror of this character. + For example the mirror of open paren is close paren, but Latin Capital C + has no mirror, so it remains a Latin Capital C. +

+
+
+normed_form +
+
+

+The UTF-8 representation of a "normalized form" of this unichar + for the purpose of blaming a module for errors given ground truth text. + For instance, a left or right single quote may normalize to an ASCII quote. +

+
+
+
+
+
+

EXAMPLE (v2)

+
+
+
+
; 10 Common 46
+b 3 Latin 59
+W 5 Latin 40
+7 8 Common 66
+= 0 Common 93
+
+

";" is a punctuation character. Its properties are thus represented by the +binary number 10000 (10 in hexadecimal).

+

"b" is an alphabetic character and a lower case character. Its properties are +thus represented by the binary number 00011 (3 in hexadecimal).

+

"W" is an alphabetic character and an upper case character. Its properties are +thus represented by the binary number 00101 (5 in hexadecimal).

+

"7" is just a digit. Its properties are thus represented by the binary number +01000 (8 in hexadecimal).

+

"=" is not punctuation nor a digit nor an alphabetic character. Its properties +are thus represented by the binary number 00000 (0 in hexadecimal).

+

Japanese or Chinese alphabetic character properties are represented by the +binary number 00001 (1 in hexadecimal): they are alphabetic, but neither +upper nor lower case.

+
+
+
+

EXAMPLE (v3.02)

+
+
+
+
110
+NULL 0 NULL 0
+N 5 59,68,216,255,87,236,0,27,104,227 Latin 11 0 1 N
+Y 5 59,68,216,255,91,205,0,47,91,223 Latin 33 0 2 Y
+1 8 59,69,203,255,45,128,0,66,74,173 Common 3 2 3 1
+9 8 18,66,203,255,89,156,0,39,104,173 Common 4 2 4 9
+a 3 58,65,186,198,85,164,0,26,97,185 Latin 56 0 5 a
+. . .
+
+
+
+
+

CAVEATS

+
+

Although the unicharset reader maintains the ability to read unicharsets +of older formats and will assign default values to missing fields, +the accuracy will be degraded.

+

Further, most other data files are indexed by the unicharset file, +so changing it without re-generating the others is likely to have dire +consequences.

+
+
+
+

HISTORY

+
+

The unicharset format first appeared with Tesseract 2.00, which was the +first version to support languages other than English. The unicharset file +contained only the first two fields, and the "ispunctuation" property was +absent (punctuation was regarded as "0", as "=" is in the above example.

+
+
+
+

SEE ALSO

+
+

tesseract(1), combine_tessdata(1), unicharset_extractor(1)

+ +
+
+
+

AUTHOR

+
+

The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present).

+
+
+
+

+ + + diff --git a/doc/unicharset.5.xml b/doc/unicharset.5.xml index 9ae6257e..40e03c6e 100644 --- a/doc/unicharset.5.xml +++ b/doc/unicharset.5.xml @@ -1,219 +1,219 @@ - - - - - - - UNICHARSET(5) - - -unicharset -5 -  -  - - - unicharset - character properties file used by tesseract(1) - - -DESCRIPTION -Tesseract’s unicharset file contains information on each symbol -(unichar) the Tesseract OCR engine is trained to recognize. -A unicharset file (i.e. eng.unicharset) is distributed as part of a -Tesseract language pack (i.e. eng.traineddata). For information on -extracting the unicharset file, see combine_tessdata(1). -The first line of a unicharset file contains the number of unichars in -the file. After this line, each subsequent line provides information for -a single unichar. The first such line contains a placeholder reserved for -the space character. Each unichar is referred to within Tesseract by its -Unichar ID, which is the line number (minus 1) within the unicharset file. -Therefore, space gets unichar 0. -Each unichar line in the unicharset file (v2+) may have four space-separated fields: -'character' 'properties' 'script' 'id' -Starting with Tesseract v3.02, more information may be given for each unichar: -'character' 'properties' 'glyph_metrics' 'script' 'other_case' 'direction' 'mirror' 'normed_form' -Entries: - - - -character - - - -The UTF-8 encoded string to be produced for this unichar. - - - - - -properties - - - -An integer mask of character properties, one per bit. - From least to most significant bit, these are: isalpha, islower, isupper, - isdigit, ispunctuation. - - - - - -glyph_metrics - - - -Ten comma-separated integers representing various standards - for where this glyph is to be found within a baseline-normalized coordinate - system where 128 is normalized to x-height. - - - - -min_bottom, max_bottom: the ranges where the bottom of the character can - be found. 
- - - - -min_top, max_top: the ranges where the top of the character may be found. - - - - -min_width, max_width: horizontal width of the character. - - - - -min_bearing, max_bearing: how far from the usual start position does the - leftmost part of the character begin. - - - - -min_advance, max_advance: how far from the printer’s cell left do we - advance to begin the next character. - - - - - - - -script - - - -Name of the script (Latin, Common, Greek, Cyrillic, Han, null). - - - - - -other_case - - - -The Unichar ID of the other case version of this character - (upper or lower). - - - - - -direction - - - -The Unicode BiDi direction of this character, as defined by - ICU’s enum UCharDirection. (0 = Left to Right, 1 = Right to Left, - 2 = European Number…) - - - - - -mirror - - - -The Unichar ID of the BiDirectional mirror of this character. - For example the mirror of open paren is close paren, but Latin Capital C - has no mirror, so it remains a Latin Capital C. - - - - - -normed_form - - - -The UTF-8 representation of a "normalized form" of this unichar - for the purpose of blaming a module for errors given ground truth text. - For instance, a left or right single quote may normalize to an ASCII quote. - - - - - - -EXAMPLE (v2) -; 10 Common 46 -b 3 Latin 59 -W 5 Latin 40 -7 8 Common 66 -= 0 Common 93 -";" is a punctuation character. Its properties are thus represented by the -binary number 10000 (10 in hexadecimal). -"b" is an alphabetic character and a lower case character. Its properties are -thus represented by the binary number 00011 (3 in hexadecimal). -"W" is an alphabetic character and an upper case character. Its properties are -thus represented by the binary number 00101 (5 in hexadecimal). -"7" is just a digit. Its properties are thus represented by the binary number -01000 (8 in hexadecimal). -"=" is not punctuation nor a digit nor an alphabetic character. Its properties -are thus represented by the binary number 00000 (0 in hexadecimal). 
-Japanese or Chinese alphabetic character properties are represented by the -binary number 00001 (1 in hexadecimal): they are alphabetic, but neither -upper nor lower case. - - -EXAMPLE (v3.02) -110 -NULL 0 NULL 0 -N 5 59,68,216,255,87,236,0,27,104,227 Latin 11 0 1 N -Y 5 59,68,216,255,91,205,0,47,91,223 Latin 33 0 2 Y -1 8 59,69,203,255,45,128,0,66,74,173 Common 3 2 3 1 -9 8 18,66,203,255,89,156,0,39,104,173 Common 4 2 4 9 -a 3 58,65,186,198,85,164,0,26,97,185 Latin 56 0 5 a -. . . - - -CAVEATS -Although the unicharset reader maintains the ability to read unicharsets -of older formats and will assign default values to missing fields, -the accuracy will be degraded. -Further, most other data files are indexed by the unicharset file, -so changing it without re-generating the others is likely to have dire -consequences. - - -HISTORY -The unicharset format first appeared with Tesseract 2.00, which was the -first version to support languages other than English. The unicharset file -contained only the first two fields, and the "ispunctuation" property was -absent (punctuation was regarded as "0", as "=" is in the above example. - - -SEE ALSO -tesseract(1), combine_tessdata(1), unicharset_extractor(1) -https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). - - + + + + + + + UNICHARSET(5) + + +unicharset +5 +  +  + + + unicharset + character properties file used by tesseract(1) + + +DESCRIPTION +Tesseract’s unicharset file contains information on each symbol +(unichar) the Tesseract OCR engine is trained to recognize. +A unicharset file (i.e. eng.unicharset) is distributed as part of a +Tesseract language pack (i.e. eng.traineddata). For information on +extracting the unicharset file, see combine_tessdata(1). +The first line of a unicharset file contains the number of unichars in +the file. 
After this line, each subsequent line provides information for +a single unichar. The first such line contains a placeholder reserved for +the space character. Each unichar is referred to within Tesseract by its +Unichar ID, which is the line number (minus 1) within the unicharset file. +Therefore, space gets unichar 0. +Each unichar line in the unicharset file (v2+) may have four space-separated fields: +'character' 'properties' 'script' 'id' +Starting with Tesseract v3.02, more information may be given for each unichar: +'character' 'properties' 'glyph_metrics' 'script' 'other_case' 'direction' 'mirror' 'normed_form' +Entries: + + + +character + + + +The UTF-8 encoded string to be produced for this unichar. + + + + + +properties + + + +An integer mask of character properties, one per bit. + From least to most significant bit, these are: isalpha, islower, isupper, + isdigit, ispunctuation. + + + + + +glyph_metrics + + + +Ten comma-separated integers representing various standards + for where this glyph is to be found within a baseline-normalized coordinate + system where 128 is normalized to x-height. + + + + +min_bottom, max_bottom: the ranges where the bottom of the character can + be found. + + + + +min_top, max_top: the ranges where the top of the character may be found. + + + + +min_width, max_width: horizontal width of the character. + + + + +min_bearing, max_bearing: how far from the usual start position does the + leftmost part of the character begin. + + + + +min_advance, max_advance: how far from the printer’s cell left do we + advance to begin the next character. + + + + + + + +script + + + +Name of the script (Latin, Common, Greek, Cyrillic, Han, null). + + + + + +other_case + + + +The Unichar ID of the other case version of this character + (upper or lower). + + + + + +direction + + + +The Unicode BiDi direction of this character, as defined by + ICU’s enum UCharDirection. 
(0 = Left to Right, 1 = Right to Left, + 2 = European Number…) + + + + + +mirror + + + +The Unichar ID of the BiDirectional mirror of this character. + For example the mirror of open paren is close paren, but Latin Capital C + has no mirror, so it remains a Latin Capital C. + + + + + +normed_form + + + +The UTF-8 representation of a "normalized form" of this unichar + for the purpose of blaming a module for errors given ground truth text. + For instance, a left or right single quote may normalize to an ASCII quote. + + + + + + +EXAMPLE (v2) +; 10 Common 46 +b 3 Latin 59 +W 5 Latin 40 +7 8 Common 66 += 0 Common 93 +";" is a punctuation character. Its properties are thus represented by the +binary number 10000 (10 in hexadecimal). +"b" is an alphabetic character and a lower case character. Its properties are +thus represented by the binary number 00011 (3 in hexadecimal). +"W" is an alphabetic character and an upper case character. Its properties are +thus represented by the binary number 00101 (5 in hexadecimal). +"7" is just a digit. Its properties are thus represented by the binary number +01000 (8 in hexadecimal). +"=" is not punctuation nor a digit nor an alphabetic character. Its properties +are thus represented by the binary number 00000 (0 in hexadecimal). +Japanese or Chinese alphabetic character properties are represented by the +binary number 00001 (1 in hexadecimal): they are alphabetic, but neither +upper nor lower case. + + +EXAMPLE (v3.02) +110 +NULL 0 NULL 0 +N 5 59,68,216,255,87,236,0,27,104,227 Latin 11 0 1 N +Y 5 59,68,216,255,91,205,0,47,91,223 Latin 33 0 2 Y +1 8 59,69,203,255,45,128,0,66,74,173 Common 3 2 3 1 +9 8 18,66,203,255,89,156,0,39,104,173 Common 4 2 4 9 +a 3 58,65,186,198,85,164,0,26,97,185 Latin 56 0 5 a +. . . + + +CAVEATS +Although the unicharset reader maintains the ability to read unicharsets +of older formats and will assign default values to missing fields, +the accuracy will be degraded. 
+Further, most other data files are indexed by the unicharset file, +so changing it without re-generating the others is likely to have dire +consequences. + + +HISTORY +The unicharset format first appeared with Tesseract 2.00, which was the +first version to support languages other than English. The unicharset file +contained only the first two fields, and the "ispunctuation" property was +absent (punctuation was regarded as "0", as "=" is in the above example. + + +SEE ALSO +tesseract(1), combine_tessdata(1), unicharset_extractor(1) +https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract + + +AUTHOR +The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present). + + diff --git a/doc/unicharset_extractor.1.asc b/doc/unicharset_extractor.1.asc index c972783a..bde21ab3 100644 --- a/doc/unicharset_extractor.1.asc +++ b/doc/unicharset_extractor.1.asc @@ -11,9 +11,9 @@ SYNOPSIS DESCRIPTION ----------- -Tesseract needs to know the set of possible characters it can output. -To generate the unicharset data file, use the unicharset_extractor -program on the same training pages bounding box files as used for +Tesseract needs to know the set of possible characters it can output. +To generate the unicharset data file, use the unicharset_extractor +program on the same training pages bounding box files as used for clustering: unicharset_extractor fontfile_1.box fontfile_2.box ... @@ -21,19 +21,19 @@ clustering: The unicharset will be put into the file 'dir/unicharset', or simply './unicharset' if no output directory is provided. -Tesseract also needs to have access to character properties isalpha, -isdigit, isupper, islower, ispunctuation. all of this auxilury data +Tesseract also needs to have access to character properties isalpha, +isdigit, isupper, islower, ispunctuation. all of this auxilury data and more is encoded in this file. 
(See unicharset(5)) -If your system supports the wctype functions, these values will be set -automatically by unicharset_extractor and there is no need to edit the -unicharset file. On some older systems (eg Windows 95), the unicharset +If your system supports the wctype functions, these values will be set +automatically by unicharset_extractor and there is no need to edit the +unicharset file. On some older systems (eg Windows 95), the unicharset file must be edited by hand to add these property description codes. -*NOTE* The unicharset file must be regenerated whenever inttemp, normproto -and pffmtable are generated (i.e. they must all be recreated when the box -file is changed) as they have to be in sync. This is made easier than in -previous versions by running unicharset_extractor before mftraining and +*NOTE* The unicharset file must be regenerated whenever inttemp, normproto +and pffmtable are generated (i.e. they must all be recreated when the box +file is changed) as they have to be in sync. This is made easier than in +previous versions by running unicharset_extractor before mftraining and cntraining, and giving the unicharset to mftraining. SEE ALSO diff --git a/doc/unicharset_extractor.1.html b/doc/unicharset_extractor.1.html index a6ac9e89..6fdeb5e9 100644 --- a/doc/unicharset_extractor.1.html +++ b/doc/unicharset_extractor.1.html @@ -1,815 +1,815 @@ - - - - - -UNICHARSET_EXTRACTOR(1) - - - - - -
-
-

SYNOPSIS

-
-

unicharset_extractor [-D dir] FILE

-
-
-
-

DESCRIPTION

-
-

Tesseract needs to know the set of possible characters it can output. -To generate the unicharset data file, use the unicharset_extractor -program on the same training pages bounding box files as used for -clustering:

-
-
-
unicharset_extractor fontfile_1.box fontfile_2.box ...
-
-

The unicharset will be put into the file dir/unicharset, or simply -./unicharset if no output directory is provided.

-

Tesseract also needs to have access to character properties isalpha, -isdigit, isupper, islower, ispunctuation. all of this auxilury data -and more is encoded in this file. (See unicharset(5))

-

If your system supports the wctype functions, these values will be set -automatically by unicharset_extractor and there is no need to edit the -unicharset file. On some older systems (eg Windows 95), the unicharset -file must be edited by hand to add these property description codes.

-

NOTE The unicharset file must be regenerated whenever inttemp, normproto -and pffmtable are generated (i.e. they must all be recreated when the box -file is changed) as they have to be in sync. This is made easier than in -previous versions by running unicharset_extractor before mftraining and -cntraining, and giving the unicharset to mftraining.

-
-
-
-

SEE ALSO

- -
-
-

HISTORY

-
-

unicharset_extractor first appeared in Tesseract 2.00.

-
-
-
-

COPYING

-
-

Copyright (C) 2006, Google Inc. -Licensed under the Apache License, Version 2.0

-
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - + + + + + +UNICHARSET_EXTRACTOR(1) + + + + + +
+
+

SYNOPSIS

+
+

unicharset_extractor [-D dir] FILE

+
+
+
+

DESCRIPTION

+
+

Tesseract needs to know the set of possible characters it can output. +To generate the unicharset data file, use the unicharset_extractor +program on the same training pages bounding box files as used for +clustering:

+
+
+
unicharset_extractor fontfile_1.box fontfile_2.box ...
+
+

The unicharset will be put into the file dir/unicharset, or simply +./unicharset if no output directory is provided.

+

Tesseract also needs to have access to character properties isalpha, +isdigit, isupper, islower, ispunctuation. all of this auxilury data +and more is encoded in this file. (See unicharset(5))

+

If your system supports the wctype functions, these values will be set +automatically by unicharset_extractor and there is no need to edit the +unicharset file. On some older systems (eg Windows 95), the unicharset +file must be edited by hand to add these property description codes.

+

NOTE The unicharset file must be regenerated whenever inttemp, normproto +and pffmtable are generated (i.e. they must all be recreated when the box +file is changed) as they have to be in sync. This is made easier than in +previous versions by running unicharset_extractor before mftraining and +cntraining, and giving the unicharset to mftraining.

+
+
+
+

SEE ALSO

+ +
+
+

HISTORY

+
+

unicharset_extractor first appeared in Tesseract 2.00.

+
+
+
+

COPYING

+
+

Copyright (C) 2006, Google Inc. +Licensed under the Apache License, Version 2.0

+
+
+
+

AUTHOR

+
+

The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present).

+
+
+
+

+ + + diff --git a/doc/unicharset_extractor.1.xml b/doc/unicharset_extractor.1.xml index bea4d1e1..45087a8c 100644 --- a/doc/unicharset_extractor.1.xml +++ b/doc/unicharset_extractor.1.xml @@ -1,63 +1,63 @@ - - - - - - - UNICHARSET_EXTRACTOR(1) - - -unicharset_extractor -1 -  -  - - - unicharset_extractor - extract unicharset from Tesseract boxfiles - - -unicharset_extractor [-D dir] FILE - - -DESCRIPTION -Tesseract needs to know the set of possible characters it can output. -To generate the unicharset data file, use the unicharset_extractor -program on the same training pages bounding box files as used for -clustering: -unicharset_extractor fontfile_1.box fontfile_2.box ... -The unicharset will be put into the file dir/unicharset, or simply -./unicharset if no output directory is provided. -Tesseract also needs to have access to character properties isalpha, -isdigit, isupper, islower, ispunctuation. all of this auxilury data -and more is encoded in this file. (See unicharset(5)) -If your system supports the wctype functions, these values will be set -automatically by unicharset_extractor and there is no need to edit the -unicharset file. On some older systems (eg Windows 95), the unicharset -file must be edited by hand to add these property description codes. -NOTE The unicharset file must be regenerated whenever inttemp, normproto -and pffmtable are generated (i.e. they must all be recreated when the box -file is changed) as they have to be in sync. This is made easier than in -previous versions by running unicharset_extractor before mftraining and -cntraining, and giving the unicharset to mftraining. - - -SEE ALSO -tesseract(1), unicharset(5) -https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract - - -HISTORY -unicharset_extractor first appeared in Tesseract 2.00. - - -COPYING -Copyright (C) 2006, Google Inc. 
-Licensed under the Apache License, Version 2.0 - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). - - + + + + + + + UNICHARSET_EXTRACTOR(1) + + +unicharset_extractor +1 +  +  + + + unicharset_extractor + extract unicharset from Tesseract boxfiles + + +unicharset_extractor [-D dir] FILE + + +DESCRIPTION +Tesseract needs to know the set of possible characters it can output. +To generate the unicharset data file, use the unicharset_extractor +program on the same training pages bounding box files as used for +clustering: +unicharset_extractor fontfile_1.box fontfile_2.box ... +The unicharset will be put into the file dir/unicharset, or simply +./unicharset if no output directory is provided. +Tesseract also needs to have access to character properties isalpha, +isdigit, isupper, islower, ispunctuation. all of this auxilury data +and more is encoded in this file. (See unicharset(5)) +If your system supports the wctype functions, these values will be set +automatically by unicharset_extractor and there is no need to edit the +unicharset file. On some older systems (eg Windows 95), the unicharset +file must be edited by hand to add these property description codes. +NOTE The unicharset file must be regenerated whenever inttemp, normproto +and pffmtable are generated (i.e. they must all be recreated when the box +file is changed) as they have to be in sync. This is made easier than in +previous versions by running unicharset_extractor before mftraining and +cntraining, and giving the unicharset to mftraining. + + +SEE ALSO +tesseract(1), unicharset(5) +https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract + + +HISTORY +unicharset_extractor first appeared in Tesseract 2.00. + + +COPYING +Copyright (C) 2006, Google Inc. 
+Licensed under the Apache License, Version 2.0 + + +AUTHOR +The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present). + + diff --git a/doc/wordlist2dawg.1.html b/doc/wordlist2dawg.1.html index 58e5cab4..73357051 100644 --- a/doc/wordlist2dawg.1.html +++ b/doc/wordlist2dawg.1.html @@ -1,820 +1,820 @@ - - - - - -WORDLIST2DAWG(1) - - - - - -
-
-

SYNOPSIS

-
-

wordlist2dawg WORDLIST DAWG lang.unicharset

-

wordlist2dawg -t WORDLIST DAWG lang.unicharset

-

wordlist2dawg -r 1 WORDLIST DAWG lang.unicharset

-

wordlist2dawg -r 2 WORDLIST DAWG lang.unicharset

-

wordlist2dawg -l <short> <long> WORDLIST DAWG lang.unicharset

-
-
-
-

DESCRIPTION

-
-

wordlist2dawg(1) converts a wordlist to a Directed Acyclic Word Graph -(DAWG) for use with Tesseract. A DAWG is a compressed, space and time -efficient representation of a word list.

-
-
-
-

OPTIONS

-
-

-t - Verify that a given dawg file is equivalent to a given wordlist.

-

-r 1 - Reverse a word if it contains an RTL character.

-

-r 2 - Reverse all words.

-

-l <short> <long> - Produce a file with several dawgs in it, one each for words - of length <short>, <short+1>,… <long>

-
-
-
-

ARGUMENTS

-
-

WORDLIST - A plain text file in UTF-8, one word per line.

-

DAWG - The output DAWG to write.

-

lang.unicharset - The unicharset of the language. This is the unicharset - generated by mftraining(1).

-
-
-
-

SEE ALSO

-
-

tesseract(1), combine_tessdata(1), dawg2wordlist(1)

- -
-
-
-

COPYING

-
-

Copyright (C) 2006 Google, Inc. -Licensed under the Apache License, Version 2.0

-
-
-
-

AUTHOR

-
-

The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present).

-
-
-
-

- - - + + + + + +WORDLIST2DAWG(1) + + + + + +
+
+

SYNOPSIS

+
+

wordlist2dawg WORDLIST DAWG lang.unicharset

+

wordlist2dawg -t WORDLIST DAWG lang.unicharset

+

wordlist2dawg -r 1 WORDLIST DAWG lang.unicharset

+

wordlist2dawg -r 2 WORDLIST DAWG lang.unicharset

+

wordlist2dawg -l <short> <long> WORDLIST DAWG lang.unicharset

+
+
+
+

DESCRIPTION

+
+

wordlist2dawg(1) converts a wordlist to a Directed Acyclic Word Graph +(DAWG) for use with Tesseract. A DAWG is a compressed, space and time +efficient representation of a word list.

+
+
+
+

OPTIONS

+
+

-t + Verify that a given dawg file is equivalent to a given wordlist.

+

-r 1 + Reverse a word if it contains an RTL character.

+

-r 2 + Reverse all words.

+

-l <short> <long> + Produce a file with several dawgs in it, one each for words + of length <short>, <short+1>,… <long>

+
+
+
+

ARGUMENTS

+
+

WORDLIST + A plain text file in UTF-8, one word per line.

+

DAWG + The output DAWG to write.

+

lang.unicharset + The unicharset of the language. This is the unicharset + generated by mftraining(1).

+
+
+
+

SEE ALSO

+
+

tesseract(1), combine_tessdata(1), dawg2wordlist(1)

+ +
+
+
+

COPYING

+
+

Copyright (C) 2006 Google, Inc. +Licensed under the Apache License, Version 2.0

+
+
+
+

AUTHOR

+
+

The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present).

+
+
+
+

+ + + diff --git a/doc/wordlist2dawg.1.xml b/doc/wordlist2dawg.1.xml index 907d3a57..bad256fe 100644 --- a/doc/wordlist2dawg.1.xml +++ b/doc/wordlist2dawg.1.xml @@ -1,69 +1,69 @@ - - - - - - - WORDLIST2DAWG(1) - - -wordlist2dawg -1 -  -  - - - wordlist2dawg - convert a wordlist to a DAWG for Tesseract - - -wordlist2dawg WORDLIST DAWG lang.unicharset -wordlist2dawg -t WORDLIST DAWG lang.unicharset -wordlist2dawg -r 1 WORDLIST DAWG lang.unicharset -wordlist2dawg -r 2 WORDLIST DAWG lang.unicharset -wordlist2dawg -l <short> <long> WORDLIST DAWG lang.unicharset - - -DESCRIPTION -wordlist2dawg(1) converts a wordlist to a Directed Acyclic Word Graph -(DAWG) for use with Tesseract. A DAWG is a compressed, space and time -efficient representation of a word list. - - -OPTIONS --t - Verify that a given dawg file is equivalent to a given wordlist. --r 1 - Reverse a word if it contains an RTL character. --r 2 - Reverse all words. --l <short> <long> - Produce a file with several dawgs in it, one each for words - of length <short>, <short+1>,… <long> - - -ARGUMENTS -WORDLIST - A plain text file in UTF-8, one word per line. -DAWG - The output DAWG to write. -lang.unicharset - The unicharset of the language. This is the unicharset - generated by mftraining(1). - - -SEE ALSO -tesseract(1), combine_tessdata(1), dawg2wordlist(1) -https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract - - -COPYING -Copyright (C) 2006 Google, Inc. -Licensed under the Apache License, Version 2.0 - - -AUTHOR -The Tesseract OCR engine was written by Ray Smith and his research groups -at Hewlett Packard (1985-1995) and Google (2006-present). 
- - + + + + + + + WORDLIST2DAWG(1) + + +wordlist2dawg +1 +  +  + + + wordlist2dawg + convert a wordlist to a DAWG for Tesseract + + +wordlist2dawg WORDLIST DAWG lang.unicharset +wordlist2dawg -t WORDLIST DAWG lang.unicharset +wordlist2dawg -r 1 WORDLIST DAWG lang.unicharset +wordlist2dawg -r 2 WORDLIST DAWG lang.unicharset +wordlist2dawg -l <short> <long> WORDLIST DAWG lang.unicharset + + +DESCRIPTION +wordlist2dawg(1) converts a wordlist to a Directed Acyclic Word Graph +(DAWG) for use with Tesseract. A DAWG is a compressed, space and time +efficient representation of a word list. + + +OPTIONS +-t + Verify that a given dawg file is equivalent to a given wordlist. +-r 1 + Reverse a word if it contains an RTL character. +-r 2 + Reverse all words. +-l <short> <long> + Produce a file with several dawgs in it, one each for words + of length <short>, <short+1>,… <long> + + +ARGUMENTS +WORDLIST + A plain text file in UTF-8, one word per line. +DAWG + The output DAWG to write. +lang.unicharset + The unicharset of the language. This is the unicharset + generated by mftraining(1). + + +SEE ALSO +tesseract(1), combine_tessdata(1), dawg2wordlist(1) +https://github.com/tesseract-ocr/tesseract/wiki/TrainingTesseract + + +COPYING +Copyright (C) 2006 Google, Inc. +Licensed under the Apache License, Version 2.0 + + +AUTHOR +The Tesseract OCR engine was written by Ray Smith and his research groups +at Hewlett Packard (1985-1995) and Google (2006-present). 
+ + diff --git a/java/Makefile.am b/java/Makefile.am index fddbc6f9..af3b1885 100644 --- a/java/Makefile.am +++ b/java/Makefile.am @@ -36,19 +36,20 @@ SCROLLVIEW_CLASSES = \ com/google/scrollview/ScrollView.class SCROLLVIEW_LIBS = \ - $(srcdir)/piccolo2d-core-3.0.jar \ - $(srcdir)/piccolo2d-extras-3.0.jar + piccolo2d-core-3.0.jar \ + piccolo2d-extras-3.0.jar -CLASSPATH = $(srcdir)/piccolo2d-core-3.0.jar:$(srcdir)/piccolo2d-extras-3.0.jar +CLASSPATH = piccolo2d-core-3.0.jar:piccolo2d-extras-3.0.jar ScrollView.jar : $(SCROLLVIEW_CLASSES) - $(JAR) cfm $@ Manifest.txt com/google/scrollview/*.class \ + $(JAR) cfm $@ $(srcdir)/Manifest.txt com/google/scrollview/*.class \ com/google/scrollview/events/*.class com/google/scrollview/ui/*.class -$(SCROLLVIEW_CLASSES) : $(SCROLLVIEW_FILES) +$(SCROLLVIEW_CLASSES) : $(SCROLLVIEW_FILES) $(SCROLLVIEW_LIBS) $(JAVAC) -encoding UTF8 -sourcepath $(srcdir) -classpath $(CLASSPATH) $(SCROLLVIEW_FILES) -d $(builddir) -fetch-jars : +.PHONY: fetch-jars +fetch-jars $(SCROLLVIEW_LIBS): curl -L http://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-core/3.0/piccolo2d-core-3.0.jar > piccolo2d-core-3.0.jar curl -L http://search.maven.org/remotecontent?filepath=org/piccolo2d/piccolo2d-extras/3.0/piccolo2d-extras-3.0.jar > piccolo2d-extras-3.0.jar @@ -64,7 +65,7 @@ uninstall: endif clean : - rm -f ScrollView.jar *.class $(srcdir)/*.class + rm -f ScrollView.jar $(SCROLLVIEW_CLASSES) # all-am does nothing, to make the java part optional. 
all all-am install : diff --git a/lstm/Makefile.am b/lstm/Makefile.am new file mode 100644 index 00000000..81e7a7b6 --- /dev/null +++ b/lstm/Makefile.am @@ -0,0 +1,39 @@ +AM_CPPFLAGS += \ + -I$(top_srcdir)/ccutil -I$(top_srcdir)/cutil -I$(top_srcdir)/ccstruct \ + -I$(top_srcdir)/arch -I$(top_srcdir)/viewer -I$(top_srcdir)/classify \ + -I$(top_srcdir)/dict -I$(top_srcdir)/lstm +AUTOMAKE_OPTIONS = subdir-objects +SUBDIRS = +AM_CXXFLAGS = $(OPENMP_CXXFLAGS) + +if !NO_TESSDATA_PREFIX +AM_CXXFLAGS += -DTESSDATA_PREFIX=@datadir@/ +endif + +if VISIBILITY +AM_CXXFLAGS += -fvisibility=hidden -fvisibility-inlines-hidden +AM_CPPFLAGS += -DTESS_EXPORTS +endif + +include_HEADERS = \ + convolve.h ctc.h fullyconnected.h functions.h input.h \ + lstm.h lstmrecognizer.h lstmtrainer.h maxpool.h \ + networkbuilder.h network.h networkio.h networkscratch.h \ + parallel.h plumbing.h recodebeam.h reconfig.h reversed.h \ + series.h static_shape.h stridemap.h tfnetwork.h weightmatrix.h + +noinst_HEADERS = + +if !USING_MULTIPLELIBS +noinst_LTLIBRARIES = libtesseract_lstm.la +else +lib_LTLIBRARIES = libtesseract_lstm.la +libtesseract_lstm_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION) +endif + +libtesseract_lstm_la_SOURCES = \ + convolve.cpp ctc.cpp fullyconnected.cpp functions.cpp input.cpp \ + lstm.cpp lstmrecognizer.cpp lstmtrainer.cpp maxpool.cpp \ + networkbuilder.cpp network.cpp networkio.cpp \ + parallel.cpp plumbing.cpp recodebeam.cpp reconfig.cpp reversed.cpp \ + series.cpp stridemap.cpp tfnetwork.cpp weightmatrix.cpp diff --git a/lstm/convolve.cpp b/lstm/convolve.cpp new file mode 100644 index 00000000..f89ca3ba --- /dev/null +++ b/lstm/convolve.cpp @@ -0,0 +1,124 @@ +/////////////////////////////////////////////////////////////////////// +// File: convolve.cpp +// Description: Convolutional layer that stacks the inputs over its rectangle +// and pulls in random data to fill out-of-input inputs. +// Output is therefore same size as its input, but deeper. 
+// Author: Ray Smith +// Created: Tue Mar 18 16:56:06 PST 2014 +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "convolve.h" + +#include "networkscratch.h" +#include "serialis.h" + +namespace tesseract { + +Convolve::Convolve(const STRING& name, int ni, int half_x, int half_y) + : Network(NT_CONVOLVE, name, ni, ni * (2*half_x + 1) * (2*half_y + 1)), + half_x_(half_x), half_y_(half_y) { +} + +Convolve::~Convolve() { +} + +// Writes to the given file. Returns false in case of error. +bool Convolve::Serialize(TFile* fp) const { + if (!Network::Serialize(fp)) return false; + if (fp->FWrite(&half_x_, sizeof(half_x_), 1) != 1) return false; + if (fp->FWrite(&half_y_, sizeof(half_y_), 1) != 1) return false; + return true; +} + +// Reads from the given file. Returns false in case of error. +// If swap is true, assumes a big/little-endian swap is needed. +bool Convolve::DeSerialize(bool swap, TFile* fp) { + if (fp->FRead(&half_x_, sizeof(half_x_), 1) != 1) return false; + if (fp->FRead(&half_y_, sizeof(half_y_), 1) != 1) return false; + if (swap) { + ReverseN(&half_x_, sizeof(half_x_)); + ReverseN(&half_y_, sizeof(half_y_)); + } + no_ = ni_ * (2*half_x_ + 1) * (2*half_y_ + 1); + return true; +} + +// Runs forward propagation of activations on the input line. +// See NetworkCpp for a detailed discussion of the arguments. 
+void Convolve::Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output) { + output->Resize(input, no_); + int y_scale = 2 * half_y_ + 1; + StrideMap::Index dest_index(output->stride_map()); + do { + // Stack x_scale groups of y_scale * ni_ inputs together. + int t = dest_index.t(); + int out_ix = 0; + for (int x = -half_x_; x <= half_x_; ++x, out_ix += y_scale * ni_) { + StrideMap::Index x_index(dest_index); + if (!x_index.AddOffset(x, FD_WIDTH)) { + // This x is outside the image. + output->Randomize(t, out_ix, y_scale * ni_, randomizer_); + } else { + int out_iy = out_ix; + for (int y = -half_y_; y <= half_y_; ++y, out_iy += ni_) { + StrideMap::Index y_index(x_index); + if (!y_index.AddOffset(y, FD_HEIGHT)) { + // This y is outside the image. + output->Randomize(t, out_iy, ni_, randomizer_); + } else { + output->CopyTimeStepGeneral(t, out_iy, ni_, input, y_index.t(), 0); + } + } + } + } + } while (dest_index.Increment()); + if (debug) DisplayForward(*output); +} + +// Runs backward propagation of errors on the deltas line. +// See NetworkCpp for a detailed discussion of the arguments. +bool Convolve::Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas) { + back_deltas->Resize(fwd_deltas, ni_); + NetworkScratch::IO delta_sum; + delta_sum.ResizeFloat(fwd_deltas, ni_, scratch); + delta_sum->Zero(); + int y_scale = 2 * half_y_ + 1; + StrideMap::Index src_index(fwd_deltas.stride_map()); + do { + // Stack x_scale groups of y_scale * ni_ inputs together. 
+ int t = src_index.t(); + int out_ix = 0; + for (int x = -half_x_; x <= half_x_; ++x, out_ix += y_scale * ni_) { + StrideMap::Index x_index(src_index); + if (x_index.AddOffset(x, FD_WIDTH)) { + int out_iy = out_ix; + for (int y = -half_y_; y <= half_y_; ++y, out_iy += ni_) { + StrideMap::Index y_index(x_index); + if (y_index.AddOffset(y, FD_HEIGHT)) { + fwd_deltas.AddTimeStepPart(t, out_iy, ni_, + delta_sum->f(y_index.t())); + } + } + } + } + } while (src_index.Increment()); + back_deltas->CopyWithNormalization(*delta_sum, fwd_deltas); + return true; +} + +} // namespace tesseract. diff --git a/lstm/convolve.h b/lstm/convolve.h new file mode 100644 index 00000000..a05dc1d8 --- /dev/null +++ b/lstm/convolve.h @@ -0,0 +1,74 @@ +/////////////////////////////////////////////////////////////////////// +// File: convolve.h +// Description: Convolutional layer that stacks the inputs over its rectangle +// and pulls in random data to fill out-of-input inputs. +// Output is therefore same size as its input, but deeper. +// Author: Ray Smith +// Created: Tue Mar 18 16:45:34 PST 2014 +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_LSTM_CONVOLVE_H_ +#define TESSERACT_LSTM_CONVOLVE_H_ + +#include "genericvector.h" +#include "matrix.h" +#include "network.h" + +namespace tesseract { + +// Makes each time-step deeper by stacking inputs over its rectangle. Does not +// affect the size of its input. Achieves this by bringing in random values in +// out-of-input areas. +class Convolve : public Network { + public: + // The area of convolution is 2*half_x + 1 by 2*half_y + 1, forcing it to + // always be odd, so the center is the current pixel. + Convolve(const STRING& name, int ni, int half_x, int half_y); + virtual ~Convolve(); + + virtual STRING spec() const { + STRING spec; + spec.add_str_int("C", half_x_ * 2 + 1); + spec.add_str_int(",", half_y_ * 2 + 1); + return spec; + } + + // Writes to the given file. Returns false in case of error. + virtual bool Serialize(TFile* fp) const; + // Reads from the given file. Returns false in case of error. + // If swap is true, assumes a big/little-endian swap is needed. + virtual bool DeSerialize(bool swap, TFile* fp); + + // Runs forward propagation of activations on the input line. + // See Network for a detailed discussion of the arguments. + virtual void Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output); + + // Runs backward propagation of errors on the deltas line. + // See Network for a detailed discussion of the arguments. + virtual bool Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas); + + protected: + // Serialized data. + inT32 half_x_; + inT32 half_y_; +}; + +} // namespace tesseract. 
+ + +#endif // TESSERACT_LSTM_SUBSAMPLE_H_ diff --git a/lstm/ctc.cpp b/lstm/ctc.cpp new file mode 100644 index 00000000..7a841088 --- /dev/null +++ b/lstm/ctc.cpp @@ -0,0 +1,412 @@ +/////////////////////////////////////////////////////////////////////// +// File: ctc.cpp +// Description: Slightly improved standard CTC to compute the targets. +// Author: Ray Smith +// Created: Wed Jul 13 15:50:06 PDT 2016 +// +// (C) Copyright 2016, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// +#include "ctc.h" + +#include + +#include "genericvector.h" +#include "host.h" +#include "matrix.h" +#include "networkio.h" + +#include "network.h" +#include "scrollview.h" + +namespace tesseract { + +// Magic constants that keep CTC stable. +// Minimum probability limit for softmax input to ctc_loss. +const float CTC::kMinProb_ = 1e-12; +// Maximum absolute argument to exp(). +const double CTC::kMaxExpArg_ = 80.0; +// Minimum probability for total prob in time normalization. +const double CTC::kMinTotalTimeProb_ = 1e-8; +// Minimum probability for total prob in final normalization. +const double CTC::kMinTotalFinalProb_ = 1e-6; + +// Builds a target using CTC. Slightly improved as follows: +// Includes normalizations and clipping for stability. +// labels should be pre-padded with nulls everywhere. 
+// labels can be longer than the time sequence, but the total number of
+// essential labels (non-null plus nulls between equal labels) must not exceed
+// the number of timesteps in outputs.
+// outputs is the output of the network, and should have already been
+// normalized with NormalizeProbs.
+// On return targets is filled with the computed targets.
+// Returns false if there is insufficient time for the labels.
+/* static */
+bool CTC::ComputeCTCTargets(const GenericVector<int>& labels, int null_char,
+                            const GENERIC_2D_ARRAY<float>& outputs,
+                            NetworkIO* targets) {
+  std::unique_ptr<CTC> ctc(new CTC(labels, null_char, outputs));
+  if (!ctc->ComputeLabelLimits()) {
+    return false;  // Not enough time.
+  }
+  // Generate simple targets purely from the truth labels by spreading them
+  // evenly over time.
+  GENERIC_2D_ARRAY<float> simple_targets;
+  ctc->ComputeSimpleTargets(&simple_targets);
+  // Add the simple targets as a starter bias to the network outputs.
+  float bias_fraction = ctc->CalculateBiasFraction();
+  simple_targets *= bias_fraction;
+  ctc->outputs_ += simple_targets;
+  NormalizeProbs(&ctc->outputs_);
+  // Run regular CTC on the biased outputs.
+  // Run forward and backward
+  GENERIC_2D_ARRAY<double> log_alphas, log_betas;
+  ctc->Forward(&log_alphas);
+  ctc->Backward(&log_betas);
+  // Normalize and come out of log space with a clipped softmax over time.
+  log_alphas += log_betas;
+  ctc->NormalizeSequence(&log_alphas);
+  ctc->LabelsToClasses(log_alphas, targets);
+  NormalizeProbs(targets);
+  return true;
+}
+
+CTC::CTC(const GenericVector<int>& labels, int null_char,
+         const GENERIC_2D_ARRAY<float>& outputs)
+    : labels_(labels), outputs_(outputs), null_char_(null_char) {
+  num_timesteps_ = outputs.dim1();
+  num_classes_ = outputs.dim2();
+  num_labels_ = labels_.size();
+}
+
+// Computes vectors of min and max label index for each timestep, based on
+// whether skippability of nulls makes it possible to complete a valid path.
+bool CTC::ComputeLabelLimits() {
+  min_labels_.init_to_size(num_timesteps_, 0);
+  max_labels_.init_to_size(num_timesteps_, 0);
+  int min_u = num_labels_ - 1;
+  if (labels_[min_u] == null_char_) --min_u;
+  for (int t = num_timesteps_ - 1; t >= 0; --t) {
+    min_labels_[t] = min_u;
+    if (min_u > 0) {
+      --min_u;
+      if (labels_[min_u] == null_char_ && min_u > 0 &&
+          labels_[min_u + 1] != labels_[min_u - 1]) {
+        --min_u;
+      }
+    }
+  }
+  int max_u = labels_[0] == null_char_;
+  for (int t = 0; t < num_timesteps_; ++t) {
+    max_labels_[t] = max_u;
+    if (max_labels_[t] < min_labels_[t]) return false;  // Not enough room.
+    if (max_u + 1 < num_labels_) {
+      ++max_u;
+      if (labels_[max_u] == null_char_ && max_u + 1 < num_labels_ &&
+          labels_[max_u + 1] != labels_[max_u - 1]) {
+        ++max_u;
+      }
+    }
+  }
+  return true;
+}
+
+// Computes targets based purely on the labels by spreading the labels evenly
+// over the available timesteps.
+void CTC::ComputeSimpleTargets(GENERIC_2D_ARRAY<float>* targets) const {
+  // Initialize all targets to zero.
+  targets->Resize(num_timesteps_, num_classes_, 0.0f);
+  GenericVector<float> half_widths;
+  GenericVector<int> means;
+  ComputeWidthsAndMeans(&half_widths, &means);
+  for (int l = 0; l < num_labels_; ++l) {
+    int label = labels_[l];
+    float left_half_width = half_widths[l];
+    float right_half_width = left_half_width;
+    int mean = means[l];
+    if (label == null_char_) {
+      if (!NeededNull(l)) {
+        if ((l > 0 && mean == means[l - 1]) ||
+            (l + 1 < num_labels_ && mean == means[l + 1])) {
+          continue;  // Drop overlapping null.
+        }
+      }
+      // Make sure that no space is left unoccupied and that non-nulls always
+      // peak at 1 by stretching nulls to meet their neighbors.
+      if (l > 0) left_half_width = mean - means[l - 1];
+      if (l + 1 < num_labels_) right_half_width = means[l + 1] - mean;
+    }
+    if (mean >= 0 && mean < num_timesteps_) targets->put(mean, label, 1.0f);
+    for (int offset = 1; offset < left_half_width && mean >= offset; ++offset) {
+      float prob = 1.0f - offset / left_half_width;
+      if (mean - offset < num_timesteps_ &&
+          prob > targets->get(mean - offset, label)) {
+        targets->put(mean - offset, label, prob);
+      }
+    }
+    for (int offset = 1;
+         offset < right_half_width && mean + offset < num_timesteps_;
+         ++offset) {
+      float prob = 1.0f - offset / right_half_width;
+      if (mean + offset >= 0 && prob > targets->get(mean + offset, label)) {
+        targets->put(mean + offset, label, prob);
+      }
+    }
+  }
+}
+
+// Computes mean positions and half widths of the simple targets by spreading
+// the labels evenly over the available timesteps.
+void CTC::ComputeWidthsAndMeans(GenericVector<float>* half_widths,
+                                GenericVector<int>* means) const {
+  // Count the number of labels of each type, in regexp terms, counts plus
+  // (non-null or necessary null, which must occur at least once) and star
+  // (optional null).
+  int num_plus = 0, num_star = 0;
+  for (int i = 0; i < num_labels_; ++i) {
+    if (labels_[i] != null_char_ || NeededNull(i))
+      ++num_plus;
+    else
+      ++num_star;
+  }
+  // Compute the size for each type. If there is enough space for everything
+  // to have size>=1, then all are equal, otherwise plus_size=1 and star gets
+  // whatever is left-over.
+  float plus_size = 1.0f, star_size = 0.0f;
+  float total_floating = num_plus + num_star;
+  if (total_floating <= num_timesteps_) {
+    plus_size = star_size = num_timesteps_ / total_floating;
+  } else if (num_star > 0) {
+    star_size = static_cast<float>(num_timesteps_ - num_plus) / num_star;
+  }
+  // Set the width and compute the mean of each.
+  float mean_pos = 0.0f;
+  for (int i = 0; i < num_labels_; ++i) {
+    float half_width;
+    if (labels_[i] != null_char_ || NeededNull(i)) {
+      half_width = plus_size / 2.0f;
+    } else {
+      half_width = star_size / 2.0f;
+    }
+    mean_pos += half_width;
+    means->push_back(static_cast<int>(mean_pos));
+    mean_pos += half_width;
+    half_widths->push_back(half_width);
+  }
+}
+
+// Helper returns the index of the highest probability label at timestep t.
+static int BestLabel(const GENERIC_2D_ARRAY<float>& outputs, int t) {
+  int result = 0;
+  int num_classes = outputs.dim2();
+  const float* outputs_t = outputs[t];
+  for (int c = 1; c < num_classes; ++c) {
+    if (outputs_t[c] > outputs_t[result]) result = c;
+  }
+  return result;
+}
+
+// Calculates and returns a suitable fraction of the simple targets to add
+// to the network outputs.
+float CTC::CalculateBiasFraction() {
+  // Compute output labels via basic decoding.
+  GenericVector<int> output_labels;
+  for (int t = 0; t < num_timesteps_; ++t) {
+    int label = BestLabel(outputs_, t);
+    while (t + 1 < num_timesteps_ && BestLabel(outputs_, t + 1) == label) ++t;
+    if (label != null_char_) output_labels.push_back(label);
+  }
+  // Simple bag of labels error calculation.
+  GenericVector<int> truth_counts(num_classes_, 0);
+  GenericVector<int> output_counts(num_classes_, 0);
+  for (int l = 0; l < num_labels_; ++l) {
+    ++truth_counts[labels_[l]];
+  }
+  for (int l = 0; l < output_labels.size(); ++l) {
+    ++output_counts[output_labels[l]];
+  }
+  // Count the number of true and false positive non-nulls and truth labels.
+  int true_pos = 0, false_pos = 0, total_labels = 0;
+  for (int c = 0; c < num_classes_; ++c) {
+    if (c == null_char_) continue;
+    int truth_count = truth_counts[c];
+    int ocr_count = output_counts[c];
+    if (truth_count > 0) {
+      total_labels += truth_count;
+      if (ocr_count > truth_count) {
+        true_pos += truth_count;
+        false_pos += ocr_count - truth_count;
+      } else {
+        true_pos += ocr_count;
+      }
+    }
+    // We don't need to count classes that don't exist in the truth as
+    // false positives, because they don't affect CTC at all.
+  }
+  if (total_labels == 0) return 0.0f;
+  return exp(MAX(true_pos - false_pos, 1) * log(kMinProb_) / total_labels);
+}
+
+// Given ln(x) and ln(y), returns ln(x + y), using:
+// ln(x + y) = ln(x) + ln(1 + exp(ln(y) - ln(x)), ensuring that ln(x) is the
+// bigger number to maximize precision.
+static double LogSumExp(double ln_x, double ln_y) {
+  if (ln_x >= ln_y) {
+    return ln_x + log1p(exp(ln_y - ln_x));
+  } else {
+    return ln_y + log1p(exp(ln_x - ln_y));
+  }
+}
+
+// Runs the forward CTC pass, filling in log_probs.
+void CTC::Forward(GENERIC_2D_ARRAY<double>* log_probs) const {
+  log_probs->Resize(num_timesteps_, num_labels_, -MAX_FLOAT32);
+  log_probs->put(0, 0, log(outputs_(0, labels_[0])));
+  if (labels_[0] == null_char_)
+    log_probs->put(0, 1, log(outputs_(0, labels_[1])));
+  for (int t = 1; t < num_timesteps_; ++t) {
+    const float* outputs_t = outputs_[t];
+    for (int u = min_labels_[t]; u <= max_labels_[t]; ++u) {
+      // Continuing the same label.
+      double log_sum = log_probs->get(t - 1, u);
+      // Change from previous label.
+      if (u > 0) {
+        log_sum = LogSumExp(log_sum, log_probs->get(t - 1, u - 1));
+      }
+      // Skip the null if allowed.
+      if (u >= 2 && labels_[u - 1] == null_char_ &&
+          labels_[u] != labels_[u - 2]) {
+        log_sum = LogSumExp(log_sum, log_probs->get(t - 1, u - 2));
+      }
+      // Add in the log prob of the current label.
+      double label_prob = outputs_t[labels_[u]];
+      log_sum += log(label_prob);
+      log_probs->put(t, u, log_sum);
+    }
+  }
+}
+
+// Runs the backward CTC pass, filling in log_probs.
+void CTC::Backward(GENERIC_2D_ARRAY<double>* log_probs) const {
+  log_probs->Resize(num_timesteps_, num_labels_, -MAX_FLOAT32);
+  log_probs->put(num_timesteps_ - 1, num_labels_ - 1, 0.0);
+  if (labels_[num_labels_ - 1] == null_char_)
+    log_probs->put(num_timesteps_ - 1, num_labels_ - 2, 0.0);
+  for (int t = num_timesteps_ - 2; t >= 0; --t) {
+    const float* outputs_tp1 = outputs_[t + 1];
+    for (int u = min_labels_[t]; u <= max_labels_[t]; ++u) {
+      // Continuing the same label.
+      double log_sum = log_probs->get(t + 1, u) + log(outputs_tp1[labels_[u]]);
+      // Change from previous label.
+      if (u + 1 < num_labels_) {
+        double prev_prob = outputs_tp1[labels_[u + 1]];
+        log_sum =
+            LogSumExp(log_sum, log_probs->get(t + 1, u + 1) + log(prev_prob));
+      }
+      // Skip the null if allowed.
+      if (u + 2 < num_labels_ && labels_[u + 1] == null_char_ &&
+          labels_[u] != labels_[u + 2]) {
+        double skip_prob = outputs_tp1[labels_[u + 2]];
+        log_sum =
+            LogSumExp(log_sum, log_probs->get(t + 1, u + 2) + log(skip_prob));
+      }
+      log_probs->put(t, u, log_sum);
+    }
+  }
+}
+
+// Normalizes and brings probs out of log space with a softmax over time.
+void CTC::NormalizeSequence(GENERIC_2D_ARRAY<double>* probs) const {
+  double max_logprob = probs->Max();
+  for (int u = 0; u < num_labels_; ++u) {
+    double total = 0.0;
+    for (int t = 0; t < num_timesteps_; ++t) {
+      // Separate impossible path from unlikely probs.
+      double prob = probs->get(t, u);
+      if (prob > -MAX_FLOAT32)
+        prob = ClippedExp(prob - max_logprob);
+      else
+        prob = 0.0;
+      total += prob;
+      probs->put(t, u, prob);
+    }
+    // Note that although this is a probability distribution over time and
+    // therefore should sum to 1, it is important to allow some labels to be
+    // all zero, (or at least tiny) as it is necessary to skip some blanks.
+    if (total < kMinTotalTimeProb_) total = kMinTotalTimeProb_;
+    for (int t = 0; t < num_timesteps_; ++t)
+      probs->put(t, u, probs->get(t, u) / total);
+  }
+}
+
+// For each timestep computes the max prob for each class over all
+// instances of the class in the labels_, and sets the targets to
+// the max observed prob.
+void CTC::LabelsToClasses(const GENERIC_2D_ARRAY<double>& probs,
+                          NetworkIO* targets) const {
+  // For each timestep compute the max prob for each class over all
+  // instances of the class in the labels_.
+  GenericVector<float> class_probs;
+  for (int t = 0; t < num_timesteps_; ++t) {
+    float* targets_t = targets->f(t);
+    class_probs.init_to_size(num_classes_, 0.0);
+    for (int u = 0; u < num_labels_; ++u) {
+      double prob = probs(t, u);
+      // Note that although Graves specifies sum over all labels of the same
+      // class, we need to allow skipped blanks to go to zero, so they don't
+      // interfere with the non-blanks, so max is better than sum.
+      if (prob > class_probs[labels_[u]]) class_probs[labels_[u]] = prob;
+      // class_probs[labels_[u]] += prob;
+    }
+    int best_class = 0;
+    for (int c = 0; c < num_classes_; ++c) {
+      targets_t[c] = class_probs[c];
+      if (class_probs[c] > class_probs[best_class]) best_class = c;
+    }
+  }
+}
+
+// Normalizes the probabilities such that no target has a prob below min_prob,
+// and, provided that the initial total is at least min_total_prob, then all
+// probs will sum to 1, otherwise to sum/min_total_prob. The maximum output
+// probability is thus 1 - (num_classes-1)*min_prob.
+/* static */
+void CTC::NormalizeProbs(GENERIC_2D_ARRAY<float>* probs) {
+  int num_timesteps = probs->dim1();
+  int num_classes = probs->dim2();
+  for (int t = 0; t < num_timesteps; ++t) {
+    float* probs_t = (*probs)[t];
+    // Compute the total and clip that to prevent amplification of noise.
+ double total = 0.0; + for (int c = 0; c < num_classes; ++c) total += probs_t[c]; + if (total < kMinTotalFinalProb_) total = kMinTotalFinalProb_; + // Compute the increased total as a result of clipping. + double increment = 0.0; + for (int c = 0; c < num_classes; ++c) { + double prob = probs_t[c] / total; + if (prob < kMinProb_) increment += kMinProb_ - prob; + } + // Now normalize with clipping. Any additional clipping is negligible. + total += increment; + for (int c = 0; c < num_classes; ++c) { + float prob = probs_t[c] / total; + probs_t[c] = MAX(prob, kMinProb_); + } + } +} + +// Returns true if the label at index is a needed null. +bool CTC::NeededNull(int index) const { + return labels_[index] == null_char_ && index > 0 && index + 1 < num_labels_ && + labels_[index + 1] == labels_[index - 1]; +} + +} // namespace tesseract diff --git a/lstm/ctc.h b/lstm/ctc.h new file mode 100644 index 00000000..47fba674 --- /dev/null +++ b/lstm/ctc.h @@ -0,0 +1,130 @@ +/////////////////////////////////////////////////////////////////////// +// File: ctc.h +// Description: Slightly improved standard CTC to compute the targets. +// Author: Ray Smith +// Created: Wed Jul 13 15:17:06 PDT 2016 +// +// (C) Copyright 2016, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_LSTM_CTC_H_
+#define TESSERACT_LSTM_CTC_H_
+
+#include "genericvector.h"
+#include "network.h"
+#include "networkio.h"
+#include "scrollview.h"
+
+namespace tesseract {
+
+// Class to encapsulate CTC and simple target generation.
+class CTC {
+ public:
+  // Normalizes the probabilities such that no target has a prob below min_prob,
+  // and, provided that the initial total is at least min_total_prob, then all
+  // probs will sum to 1, otherwise to sum/min_total_prob. The maximum output
+  // probability is thus 1 - (num_classes-1)*min_prob.
+  static void NormalizeProbs(NetworkIO* probs) {
+    NormalizeProbs(probs->mutable_float_array());
+  }
+
+  // Builds a target using CTC. Slightly improved as follows:
+  // Includes normalizations and clipping for stability.
+  // labels should be pre-padded with nulls wherever desired, but they don't
+  // have to be between all labels. Allows for multi-label codes with no
+  // nulls between.
+  // labels can be longer than the time sequence, but the total number of
+  // essential labels (non-null plus nulls between equal labels) must not exceed
+  // the number of timesteps in outputs.
+  // outputs is the output of the network, and should have already been
+  // normalized with NormalizeProbs.
+  // On return targets is filled with the computed targets.
+  // Returns false if there is insufficient time for the labels.
+  static bool ComputeCTCTargets(const GenericVector<int>& truth_labels,
+                                int null_char,
+                                const GENERIC_2D_ARRAY<float>& outputs,
+                                NetworkIO* targets);
+
+ private:
+  // Constructor is private as the instance only holds information specific to
+  // the current labels, outputs etc, and is built by the static function.
+  CTC(const GenericVector<int>& labels, int null_char,
+      const GENERIC_2D_ARRAY<float>& outputs);
+
+  // Computes vectors of min and max label index for each timestep, based on
+  // whether skippability of nulls makes it possible to complete a valid path.
+  bool ComputeLabelLimits();
+  // Computes targets based purely on the labels by spreading the labels evenly
+  // over the available timesteps.
+  void ComputeSimpleTargets(GENERIC_2D_ARRAY<float>* targets) const;
+  // Computes mean positions and half widths of the simple targets by spreading
+  // the labels even over the available timesteps.
+  void ComputeWidthsAndMeans(GenericVector<float>* half_widths,
+                             GenericVector<int>* means) const;
+  // Calculates and returns a suitable fraction of the simple targets to add
+  // to the network outputs.
+  float CalculateBiasFraction();
+  // Runs the forward CTC pass, filling in log_probs.
+  void Forward(GENERIC_2D_ARRAY<double>* log_probs) const;
+  // Runs the backward CTC pass, filling in log_probs.
+  void Backward(GENERIC_2D_ARRAY<double>* log_probs) const;
+  // Normalizes and brings probs out of log space with a softmax over time.
+  void NormalizeSequence(GENERIC_2D_ARRAY<double>* probs) const;
+  // For each timestep computes the max prob for each class over all
+  // instances of the class in the labels_, and sets the targets to
+  // the max observed prob.
+  void LabelsToClasses(const GENERIC_2D_ARRAY<double>& probs,
+                       NetworkIO* targets) const;
+  // Normalizes the probabilities such that no target has a prob below min_prob,
+  // and, provided that the initial total is at least min_total_prob, then all
+  // probs will sum to 1, otherwise to sum/min_total_prob. The maximum output
+  // probability is thus 1 - (num_classes-1)*min_prob.
+  static void NormalizeProbs(GENERIC_2D_ARRAY<float>* probs);
+  // Returns true if the label at index is a needed null.
+  bool NeededNull(int index) const;
+  // Returns exp(clipped(x)), clipping x to a reasonable range to prevent over/
+  // underflow.
+  static double ClippedExp(double x) {
+    if (x < -kMaxExpArg_) return exp(-kMaxExpArg_);
+    if (x > kMaxExpArg_) return exp(kMaxExpArg_);
+    return exp(x);
+  }
+
+  // Minimum probability limit for softmax input to ctc_loss.
+  static const float kMinProb_;
+  // Maximum absolute argument to exp().
+  static const double kMaxExpArg_;
+  // Minimum probability for total prob in time normalization.
+  static const double kMinTotalTimeProb_;
+  // Minimum probability for total prob in final normalization.
+  static const double kMinTotalFinalProb_;
+
+  // The truth label indices that are to be matched to outputs_.
+  const GenericVector<int>& labels_;
+  // The network outputs.
+  GENERIC_2D_ARRAY<float> outputs_;
+  // The null or "blank" label.
+  int null_char_;
+  // Number of timesteps in outputs_.
+  int num_timesteps_;
+  // Number of classes in outputs_.
+  int num_classes_;
+  // Number of labels in labels_.
+  int num_labels_;
+  // Min and max valid label indices for each timestep.
+  GenericVector<int> min_labels_;
+  GenericVector<int> max_labels_;
+};
+
+}  // namespace tesseract
+
+#endif  // TESSERACT_LSTM_CTC_H_
diff --git a/lstm/fullyconnected.cpp b/lstm/fullyconnected.cpp
new file mode 100644
index 00000000..c5b92768
--- /dev/null
+++ b/lstm/fullyconnected.cpp
@@ -0,0 +1,296 @@
+///////////////////////////////////////////////////////////////////////
+// File: fullyconnected.cpp
+// Description: Simple feed-forward layer with various non-linearities.
+// Author: Ray Smith
+// Created: Wed Feb 26 14:49:15 PST 2014
+//
+// (C) Copyright 2014, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#include "fullyconnected.h"
+
+#ifdef _OPENMP
+#include <omp.h>
+#endif
+#include <stdio.h>
+#include <stdlib.h>
+
+#include "functions.h"
+#include "networkscratch.h"
+
+// Number of threads to use for parallel calculation of Forward and Backward.
+const int kNumThreads = 4;
+
+namespace tesseract {
+
+FullyConnected::FullyConnected(const STRING& name, int ni, int no,
+                               NetworkType type)
+  : Network(type, name, ni, no), external_source_(NULL), int_mode_(false) {
+}
+
+FullyConnected::~FullyConnected() {
+}
+
+// Returns the shape output from the network given an input shape (which may
+// be partially unknown ie zero).
+StaticShape FullyConnected::OutputShape(const StaticShape& input_shape) const {
+  LossType loss_type = LT_NONE;
+  if (type_ == NT_SOFTMAX)
+    loss_type = LT_CTC;
+  else if (type_ == NT_SOFTMAX_NO_CTC)
+    loss_type = LT_SOFTMAX;
+  else if (type_ == NT_LOGISTIC)
+    loss_type = LT_LOGISTIC;
+  StaticShape result(input_shape);
+  result.set_depth(no_);
+  result.set_loss_type(loss_type);
+  return result;
+}
+
+// Suspends/Enables training by setting the training_ flag. Serialize and
+// DeSerialize only operate on the run-time data if state is false.
+void FullyConnected::SetEnableTraining(TrainingState state) {
+  if (state == TS_RE_ENABLE) {
+    if (training_ == TS_DISABLED) weights_.InitBackward(false);
+    training_ = TS_ENABLED;
+  } else {
+    training_ = state;
+  }
+}
+
+// Sets up the network for training. Initializes weights using weights of
+// scale `range` picked according to the random number generator `randomizer`.
+int FullyConnected::InitWeights(float range, TRand* randomizer) {
+  Network::SetRandomizer(randomizer);
+  num_weights_ = weights_.InitWeightsFloat(no_, ni_ + 1, TestFlag(NF_ADA_GRAD),
+                                           range, randomizer);
+  return num_weights_;
+}
+
+// Converts a float network to an int network.
+void FullyConnected::ConvertToInt() {
+  weights_.ConvertToInt();
+}
+
+// Provides debug output on the weights.
+void FullyConnected::DebugWeights() {
+  weights_.Debug2D(name_.string());
+}
+
+// Writes to the given file. Returns false in case of error.
+bool FullyConnected::Serialize(TFile* fp) const {
+  if (!Network::Serialize(fp)) return false;
+  if (!weights_.Serialize(IsTraining(), fp)) return false;
+  return true;
+}
+
+// Reads from the given file. Returns false in case of error.
+// If swap is true, assumes a big/little-endian swap is needed.
+bool FullyConnected::DeSerialize(bool swap, TFile* fp) {
+  if (!weights_.DeSerialize(IsTraining(), swap, fp)) return false;
+  return true;
+}
+
+// Runs forward propagation of activations on the input line.
+// See NetworkCpp for a detailed discussion of the arguments.
+void FullyConnected::Forward(bool debug, const NetworkIO& input,
+                             const TransposedArray* input_transpose,
+                             NetworkScratch* scratch, NetworkIO* output) {
+  int width = input.Width();
+  if (type_ == NT_SOFTMAX)
+    output->ResizeFloat(input, no_);
+  else
+    output->Resize(input, no_);
+  SetupForward(input, input_transpose);
+  GenericVector<NetworkScratch::FloatVec> temp_lines;
+  temp_lines.init_to_size(kNumThreads, NetworkScratch::FloatVec());
+  GenericVector<NetworkScratch::FloatVec> curr_input;
+  curr_input.init_to_size(kNumThreads, NetworkScratch::FloatVec());
+  for (int i = 0; i < temp_lines.size(); ++i) {
+    temp_lines[i].Init(no_, scratch);
+    curr_input[i].Init(ni_, scratch);
+  }
+#ifdef _OPENMP
+#pragma omp parallel for num_threads(kNumThreads)
+  for (int t = 0; t < width; ++t) {
+    // Thread-local pointer to temporary storage.
+    int thread_id = omp_get_thread_num();
+#else
+  for (int t = 0; t < width; ++t) {
+    // Thread-local pointer to temporary storage.
+ int thread_id = 0; +#endif + double* temp_line = temp_lines[thread_id]; + const double* d_input = NULL; + const inT8* i_input = NULL; + if (input.int_mode()) { + i_input = input.i(t); + } else { + input.ReadTimeStep(t, curr_input[thread_id]); + d_input = curr_input[thread_id]; + } + ForwardTimeStep(d_input, i_input, t, temp_line); + output->WriteTimeStep(t, temp_line); + if (IsTraining() && type_ != NT_SOFTMAX) { + acts_.CopyTimeStepFrom(t, *output, t); + } + } + // Zero all the elements that are in the padding around images that allows + // multiple different-sized images to exist in a single array. + // acts_ is only used if this is not a softmax op. + if (IsTraining() && type_ != NT_SOFTMAX) { + acts_.ZeroInvalidElements(); + } + output->ZeroInvalidElements(); +#if DEBUG_DETAIL > 0 + tprintf("F Output:%s\n", name_.string()); + output->Print(10); +#endif + if (debug) DisplayForward(*output); +} + +// Components of Forward so FullyConnected can be reused inside LSTM. +void FullyConnected::SetupForward(const NetworkIO& input, + const TransposedArray* input_transpose) { + // Softmax output is always float, so save the input type. + int_mode_ = input.int_mode(); + if (IsTraining()) { + acts_.Resize(input, no_); + // Source_ is a transposed copy of input. It isn't needed if provided. + external_source_ = input_transpose; + if (external_source_ == NULL) source_t_.ResizeNoInit(ni_, input.Width()); + } +} + +void FullyConnected::ForwardTimeStep(const double* d_input, const inT8* i_input, + int t, double* output_line) { + // input is copied to source_ line-by-line for cache coherency. 
+  if (IsTraining() && external_source_ == NULL && d_input != NULL)
+    source_t_.WriteStrided(t, d_input);
+  if (d_input != NULL)
+    weights_.MatrixDotVector(d_input, output_line);
+  else
+    weights_.MatrixDotVector(i_input, output_line);
+  if (type_ == NT_TANH) {
+    FuncInplace<GFunc>(no_, output_line);
+  } else if (type_ == NT_LOGISTIC) {
+    FuncInplace<FFunc>(no_, output_line);
+  } else if (type_ == NT_POSCLIP) {
+    FuncInplace<ClipFFunc>(no_, output_line);
+  } else if (type_ == NT_SYMCLIP) {
+    FuncInplace<ClipGFunc>(no_, output_line);
+  } else if (type_ == NT_RELU) {
+    FuncInplace<Relu>(no_, output_line);
+  } else if (type_ == NT_SOFTMAX || type_ == NT_SOFTMAX_NO_CTC) {
+    SoftmaxInPlace(no_, output_line);
+  } else if (type_ != NT_LINEAR) {
+    ASSERT_HOST("Invalid fully-connected type!" == NULL);
+  }
+}
+
+// Runs backward propagation of errors on the deltas line.
+// See NetworkCpp for a detailed discussion of the arguments.
+bool FullyConnected::Backward(bool debug, const NetworkIO& fwd_deltas,
+                              NetworkScratch* scratch,
+                              NetworkIO* back_deltas) {
+  if (debug) DisplayBackward(fwd_deltas);
+  back_deltas->Resize(fwd_deltas, ni_);
+  GenericVector<NetworkScratch::FloatVec> errors;
+  errors.init_to_size(kNumThreads, NetworkScratch::FloatVec());
+  for (int i = 0; i < errors.size(); ++i) errors[i].Init(no_, scratch);
+  GenericVector<NetworkScratch::FloatVec> temp_backprops;
+  if (needs_to_backprop_) {
+    temp_backprops.init_to_size(kNumThreads, NetworkScratch::FloatVec());
+    for (int i = 0; i < kNumThreads; ++i) temp_backprops[i].Init(ni_, scratch);
+  }
+  int width = fwd_deltas.Width();
+  NetworkScratch::GradientStore errors_t;
+  errors_t.Init(no_, width, scratch);
+#ifdef _OPENMP
+#pragma omp parallel for num_threads(kNumThreads)
+  for (int t = 0; t < width; ++t) {
+    int thread_id = omp_get_thread_num();
+#else
+  for (int t = 0; t < width; ++t) {
+    int thread_id = 0;
+#endif
+    double* backprop = NULL;
+    if (needs_to_backprop_) backprop = temp_backprops[thread_id];
+    double* curr_errors = errors[thread_id];
+    BackwardTimeStep(fwd_deltas, t, curr_errors,
+                     errors_t.get(), backprop);
+    if (backprop != NULL) {
+      back_deltas->WriteTimeStep(t, backprop);
+    }
+  }
+  FinishBackward(*errors_t.get());
+  if (needs_to_backprop_) {
+    back_deltas->ZeroInvalidElements();
+    back_deltas->CopyWithNormalization(*back_deltas, fwd_deltas);
+#if DEBUG_DETAIL > 0
+    tprintf("F Backprop:%s\n", name_.string());
+    back_deltas->Print(10);
+#endif
+    return true;
+  }
+  return false;  // No point going further back.
+}
+
+void FullyConnected::BackwardTimeStep(const NetworkIO& fwd_deltas, int t,
+                                      double* curr_errors,
+                                      TransposedArray* errors_t,
+                                      double* backprop) {
+  if (type_ == NT_TANH)
+    acts_.FuncMultiply<GPrime>(fwd_deltas, t, curr_errors);
+  else if (type_ == NT_LOGISTIC)
+    acts_.FuncMultiply<FPrime>(fwd_deltas, t, curr_errors);
+  else if (type_ == NT_POSCLIP)
+    acts_.FuncMultiply<ClipFPrime>(fwd_deltas, t, curr_errors);
+  else if (type_ == NT_SYMCLIP)
+    acts_.FuncMultiply<ClipGPrime>(fwd_deltas, t, curr_errors);
+  else if (type_ == NT_RELU)
+    acts_.FuncMultiply<ReluPrime>(fwd_deltas, t, curr_errors);
+  else if (type_ == NT_SOFTMAX || type_ == NT_SOFTMAX_NO_CTC ||
+           type_ == NT_LINEAR)
+    fwd_deltas.ReadTimeStep(t, curr_errors);  // fwd_deltas are the errors.
+  else
+    ASSERT_HOST("Invalid fully-connected type!" == NULL);
+  // Generate backprop only if needed by the lower layer.
+  if (backprop != NULL) weights_.VectorDotMatrix(curr_errors, backprop);
+  errors_t->WriteStrided(t, curr_errors);
+}
+
+void FullyConnected::FinishBackward(const TransposedArray& errors_t) {
+  if (external_source_ == NULL)
+    weights_.SumOuterTransposed(errors_t, source_t_, true);
+  else
+    weights_.SumOuterTransposed(errors_t, *external_source_, true);
+}
+
+// Updates the weights using the given learning rate and momentum.
+// num_samples is the quotient to be used in the adagrad computation iff
+// use_ada_grad_ is true.
+void FullyConnected::Update(float learning_rate, float momentum,
+                            int num_samples) {
+  weights_.Update(learning_rate, momentum, num_samples);
+}
+
+// Sums the products of weight updates in *this and other, splitting into
+// positive (same direction) in *same and negative (different direction) in
+// *changed.
+void FullyConnected::CountAlternators(const Network& other, double* same,
+                                      double* changed) const {
+  ASSERT_HOST(other.type() == type_);
+  const FullyConnected* fc = reinterpret_cast<const FullyConnected*>(&other);
+  weights_.CountAlternators(fc->weights_, same, changed);
+}
+
+}  // namespace tesseract.
diff --git a/lstm/fullyconnected.h b/lstm/fullyconnected.h
new file mode 100644
index 00000000..f5a59390
--- /dev/null
+++ b/lstm/fullyconnected.h
@@ -0,0 +1,134 @@
+///////////////////////////////////////////////////////////////////////
+// File: fullyconnected.h
+// Description: Simple feed-forward layer with various non-linearities.
+// Author: Ray Smith
+// Created: Wed Feb 26 14:46:06 PST 2014
+//
+// (C) Copyright 2014, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+///////////////////////////////////////////////////////////////////////
+
+#ifndef TESSERACT_LSTM_FULLYCONNECTED_H_
+#define TESSERACT_LSTM_FULLYCONNECTED_H_
+
+#include "network.h"
+#include "networkscratch.h"
+
+namespace tesseract {
+
+// C++ Implementation of the Softmax (output) class from lstm.py.
+class FullyConnected : public Network { + public: + FullyConnected(const STRING& name, int ni, int no, NetworkType type); + virtual ~FullyConnected(); + + // Returns the shape output from the network given an input shape (which may + // be partially unknown ie zero). + virtual StaticShape OutputShape(const StaticShape& input_shape) const; + + virtual STRING spec() const { + STRING spec; + if (type_ == NT_TANH) + spec.add_str_int("Ft", no_); + else if (type_ == NT_LOGISTIC) + spec.add_str_int("Fs", no_); + else if (type_ == NT_RELU) + spec.add_str_int("Fr", no_); + else if (type_ == NT_LINEAR) + spec.add_str_int("Fl", no_); + else if (type_ == NT_POSCLIP) + spec.add_str_int("Fp", no_); + else if (type_ == NT_SYMCLIP) + spec.add_str_int("Fs", no_); + else if (type_ == NT_SOFTMAX) + spec.add_str_int("Fc", no_); + else + spec.add_str_int("Fm", no_); + return spec; + } + + // Changes the type to the given type. Used to commute a softmax to a + // non-output type for adding on other networks. + void ChangeType(NetworkType type) { + type_ = type; + } + + // Suspends/Enables training by setting the training_ flag. Serialize and + // DeSerialize only operate on the run-time data if state is false. + virtual void SetEnableTraining(TrainingState state); + + // Sets up the network for training. Initializes weights using weights of + // scale `range` picked according to the random number generator `randomizer`. + virtual int InitWeights(float range, TRand* randomizer); + + // Converts a float network to an int network. + virtual void ConvertToInt(); + + // Provides debug output on the weights. + virtual void DebugWeights(); + + // Writes to the given file. Returns false in case of error. + virtual bool Serialize(TFile* fp) const; + // Reads from the given file. Returns false in case of error. + // If swap is true, assumes a big/little-endian swap is needed. + virtual bool DeSerialize(bool swap, TFile* fp); + + // Runs forward propagation of activations on the input line. 
+ // See Network for a detailed discussion of the arguments. + virtual void Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output); + // Components of Forward so FullyConnected can be reused inside LSTM. + void SetupForward(const NetworkIO& input, + const TransposedArray* input_transpose); + void ForwardTimeStep(const double* d_input, const inT8* i_input, int t, + double* output_line); + + // Runs backward propagation of errors on the deltas line. + // See Network for a detailed discussion of the arguments. + virtual bool Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas); + // Components of Backward so FullyConnected can be reused inside LSTM. + void BackwardTimeStep(const NetworkIO& fwd_deltas, int t, double* curr_errors, + TransposedArray* errors_t, double* backprop); + void FinishBackward(const TransposedArray& errors_t); + + // Updates the weights using the given learning rate and momentum. + // num_samples is the quotient to be used in the adagrad computation iff + // use_ada_grad_ is true. + virtual void Update(float learning_rate, float momentum, int num_samples); + // Sums the products of weight updates in *this and other, splitting into + // positive (same direction) in *same and negative (different direction) in + // *changed. + virtual void CountAlternators(const Network& other, double* same, + double* changed) const; + + protected: + // Weight arrays of size [no, ni + 1]. + WeightMatrix weights_; + // Transposed copy of input used during training of size [ni, width]. + TransposedArray source_t_; + // Pointer to transposed input stored elsewhere. If not null, this is used + // in preference to calculating the transpose and storing it in source_t_. + const TransposedArray* external_source_; + // Activations from forward pass of size [width, no]. 
+ NetworkIO acts_; + // Memory of the integer mode input to forward as softmax always outputs + // float, so the information is otherwise lost. + bool int_mode_; +}; + +} // namespace tesseract. + + + +#endif // TESSERACT_LSTM_FULLYCONNECTED_H_ diff --git a/lstm/functions.cpp b/lstm/functions.cpp new file mode 100644 index 00000000..644530c3 --- /dev/null +++ b/lstm/functions.cpp @@ -0,0 +1,26 @@ +/////////////////////////////////////////////////////////////////////// +// File: functions.cpp +// Description: Static initialize-on-first-use non-linearity functions. +// Author: Ray Smith +// Created: Tue Jul 17 14:02:59 PST 2014 +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "functions.h" + +namespace tesseract { + +double TanhTable[kTableSize]; +double LogisticTable[kTableSize]; + +} // namespace tesseract. diff --git a/lstm/functions.h b/lstm/functions.h new file mode 100644 index 00000000..d633e6bf --- /dev/null +++ b/lstm/functions.h @@ -0,0 +1,249 @@ +/////////////////////////////////////////////////////////////////////// +// File: functions.h +// Description: Collection of function-objects used by the network layers. +// Author: Ray Smith +// Created: Fri Jun 20 10:45:37 PST 2014 +// +// (C) Copyright 2014, Google Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_LSTM_FUNCTIONS_H_ +#define TESSERACT_LSTM_FUNCTIONS_H_ + +#include +#include "helpers.h" +#include "tprintf.h" + +// Setting this to 1 or more causes massive dumps of debug data: weights, +// updates, internal calculations etc, and reduces the number of test iterations +// to a small number, so outputs can be diffed. +#define DEBUG_DETAIL 0 +#if DEBUG_DETAIL > 0 +#undef _OPENMP // Disable open mp to get the outputs in sync. +#endif + +namespace tesseract { + +// Size of static tables. +const int kTableSize = 4096; +// Scale factor for float arg to int index. +const double kScaleFactor = 256.0; + +extern double TanhTable[]; +extern double LogisticTable[]; + +// Non-linearity (sigmoid) functions with cache tables and clipping. +inline double Tanh(double x) { + if (x < 0.0) return -Tanh(-x); + if (x >= (kTableSize - 1) / kScaleFactor) return 1.0; + x *= kScaleFactor; + int index = static_cast(floor(x)); + if (TanhTable[index] == 0.0 && index > 0) { + // Generate the entry. + TanhTable[index] = tanh(index / kScaleFactor); + } + if (index == kTableSize - 1) return TanhTable[kTableSize - 1]; + if (TanhTable[index + 1] == 0.0) { + // Generate the entry. 
+ TanhTable[index + 1] = tanh((index + 1) / kScaleFactor); + } + double offset = x - index; + return TanhTable[index] * (1.0 - offset) + TanhTable[index + 1] * offset; +} + +inline double Logistic(double x) { + if (x < 0.0) return 1.0 - Logistic(-x); + if (x >= (kTableSize - 1) / kScaleFactor) return 1.0; + x *= kScaleFactor; + int index = static_cast(floor(x)); + if (LogisticTable[index] == 0.0) { + // Generate the entry. + LogisticTable[index] = 1.0 / (1.0 + exp(-index / kScaleFactor)); + } + if (index == kTableSize - 1) return LogisticTable[kTableSize - 1]; + if (LogisticTable[index + 1] == 0.0) { + // Generate the entry. + LogisticTable[index + 1] = 1.0 / (1.0 + exp(-(index + 1) / kScaleFactor)); + } + double offset = x - index; + return LogisticTable[index] * (1.0 - offset) + + LogisticTable[index + 1] * offset; +} + +// Non-linearity (sigmoid) functions and their derivatives. +struct FFunc { + inline double operator()(double x) const { return Logistic(x); } +}; +struct FPrime { + inline double operator()(double y) const { return y * (1.0 - y); } +}; +struct ClipFFunc { + inline double operator()(double x) const { + if (x <= 0.0) return 0.0; + if (x >= 1.0) return 1.0; + return x; + } +}; +struct ClipFPrime { + inline double operator()(double y) const { + return 0.0 < y && y < 1.0 ? 1.0 : 0.0; + } +}; +struct Relu { + inline double operator()(double x) const { + if (x <= 0.0) return 0.0; + return x; + } +}; +struct ReluPrime { + inline double operator()(double y) const { return 0.0 < y ? 1.0 : 0.0; } +}; +struct GFunc { + inline double operator()(double x) const { return Tanh(x); } +}; +struct GPrime { + inline double operator()(double y) const { return 1.0 - y * y; } +}; +struct ClipGFunc { + inline double operator()(double x) const { + if (x <= -1.0) return -1.0; + if (x >= 1.0) return 1.0; + return x; + } +}; +struct ClipGPrime { + inline double operator()(double y) const { + return -1.0 < y && y < 1.0 ? 
1.0 : 0.0; + } +}; +struct HFunc { + inline double operator()(double x) const { return Tanh(x); } +}; +struct HPrime { + inline double operator()(double y) const { + double u = Tanh(y); + return 1.0 - u * u; + } +}; +struct UnityFunc { + inline double operator()(double x) const { return 1.0; } +}; +struct IdentityFunc { + inline double operator()(double x) const { return x; } +}; + +// Applies Func in-place to inout, of size n. +template +inline void FuncInplace(int n, double* inout) { + Func f; + for (int i = 0; i < n; ++i) { + inout[i] = f(inout[i]); + } +} +// Applies Func to u and multiplies the result by v component-wise, +// putting the product in out, all of size n. +template +inline void FuncMultiply(const double* u, const double* v, int n, double* out) { + Func f; + for (int i = 0; i < n; ++i) { + out[i] = f(u[i]) * v[i]; + } +} +// Applies the Softmax function in-place to inout, of size n. +template +inline void SoftmaxInPlace(int n, T* inout) { + if (n <= 0) return; + // A limit on the negative range input to exp to guarantee non-zero output. + const T kMaxSoftmaxActivation = 86.0f; + + T max_output = inout[0]; + for (int i = 1; i < n; i++) { + T output = inout[i]; + if (output > max_output) max_output = output; + } + T prob_total = 0.0; + for (int i = 0; i < n; i++) { + T prob = inout[i] - max_output; + prob = exp(ClipToRange(prob, -kMaxSoftmaxActivation, static_cast(0))); + prob_total += prob; + inout[i] = prob; + } + if (prob_total > 0.0) { + for (int i = 0; i < n; i++) inout[i] /= prob_total; + } +} + +// Copies n values of the given src vector to dest. +inline void CopyVector(int n, const double* src, double* dest) { + memcpy(dest, src, n * sizeof(dest[0])); +} + +// Adds n values of the given src vector to dest. +inline void AccumulateVector(int n, const double* src, double* dest) { + for (int i = 0; i < n; ++i) dest[i] += src[i]; +} + +// Multiplies n values of inout in-place element-wise by the given src vector. 
+inline void MultiplyVectorsInPlace(int n, const double* src, double* inout) { + for (int i = 0; i < n; ++i) inout[i] *= src[i]; +} + +// Multiplies n values of u by v, element-wise, accumulating to out. +inline void MultiplyAccumulate(int n, const double* u, const double* v, + double* out) { + for (int i = 0; i < n; i++) { + out[i] += u[i] * v[i]; + } +} + +// Sums the given 5 n-vectors putting the result into sum. +inline void SumVectors(int n, const double* v1, const double* v2, + const double* v3, const double* v4, const double* v5, + double* sum) { + for (int i = 0; i < n; ++i) { + sum[i] = v1[i] + v2[i] + v3[i] + v4[i] + v5[i]; + } +} + +// Sets the given n-vector vec to 0. +template +inline void ZeroVector(int n, T* vec) { + memset(vec, 0, n * sizeof(*vec)); +} + +// Clips the given vector vec, of size n to [lower, upper]. +template +inline void ClipVector(int n, T lower, T upper, T* vec) { + for (int i = 0; i < n; ++i) vec[i] = ClipToRange(vec[i], lower, upper); +} + +// Converts the given n-vector to a binary encoding of the maximum value, +// encoded as vector of nf binary values. +inline void CodeInBinary(int n, int nf, double* vec) { + if (nf <= 0 || n < nf) return; + int index = 0; + double best_score = vec[0]; + for (int i = 1; i < n; ++i) { + if (vec[i] > best_score) { + best_score = vec[i]; + index = i; + } + } + int mask = 1; + for (int i = 0; i < nf; ++i, mask *= 2) { + vec[i] = (index & mask) ? 1.0 : 0.0; + } +} + +} // namespace tesseract. + +#endif // TESSERACT_LSTM_FUNCTIONS_H_ diff --git a/lstm/input.cpp b/lstm/input.cpp new file mode 100644 index 00000000..1bcf367e --- /dev/null +++ b/lstm/input.cpp @@ -0,0 +1,156 @@ +/////////////////////////////////////////////////////////////////////// +// File: input.cpp +// Description: Input layer class for neural network implementations. +// Author: Ray Smith +// Created: Thu Mar 13 09:10:34 PDT 2014 +// +// (C) Copyright 2014, Google Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "input.h" + +#include "allheaders.h" +#include "imagedata.h" +#include "pageres.h" +#include "scrollview.h" + +namespace tesseract { + +// Max height for variable height inputs before scaling anyway. +const int kMaxInputHeight = 48; + +Input::Input(const STRING& name, int ni, int no) + : Network(NT_INPUT, name, ni, no), cached_x_scale_(1) {} +Input::Input(const STRING& name, const StaticShape& shape) + : Network(NT_INPUT, name, shape.height(), shape.depth()), + shape_(shape), + cached_x_scale_(1) { + if (shape.height() == 1) ni_ = shape.depth(); +} + +Input::~Input() { +} + +// Writes to the given file. Returns false in case of error. +bool Input::Serialize(TFile* fp) const { + if (!Network::Serialize(fp)) return false; + if (fp->FWrite(&shape_, sizeof(shape_), 1) != 1) return false; + return true; +} + +// Reads from the given file. Returns false in case of error. +// If swap is true, assumes a big/little-endian swap is needed. +bool Input::DeSerialize(bool swap, TFile* fp) { + if (fp->FRead(&shape_, sizeof(shape_), 1) != 1) return false; + // TODO(rays) swaps! + return true; +} + +// Returns an integer reduction factor that the network applies to the +// time sequence. Assumes that any 2-d is already eliminated. Used for +// scaling bounding boxes of truth data. 
+int Input::XScaleFactor() const { + return 1; +} + +// Provides the (minimum) x scale factor to the network (of interest only to +// input units) so they can determine how to scale bounding boxes. +void Input::CacheXScaleFactor(int factor) { + cached_x_scale_ = factor; +} + +// Runs forward propagation of activations on the input line. +// See Network for a detailed discussion of the arguments. +void Input::Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output) { + *output = input; +} + +// Runs backward propagation of errors on the deltas line. +// See NetworkCpp for a detailed discussion of the arguments. +bool Input::Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas) { + tprintf("Input::Backward should not be called!!\n"); + return false; +} + +// Creates and returns a Pix of appropriate size for the network from the +// image_data. If non-null, *image_scale returns the image scale factor used. +// Returns nullptr on error. +/* static */ +Pix* Input::PrepareLSTMInputs(const ImageData& image_data, + const Network* network, int min_width, + TRand* randomizer, float* image_scale) { + // Note that NumInputs() is defined as input image height. + int target_height = network->NumInputs(); + int width, height; + Pix* pix = image_data.PreScale(target_height, kMaxInputHeight, image_scale, + &width, &height, nullptr); + if (pix == nullptr) { + tprintf("Bad pix from ImageData!\n"); + return nullptr; + } + if (width <= min_width || height < min_width) { + tprintf("Image too small to scale!! (%dx%d vs min width of %d)\n", width, + height, min_width); + pixDestroy(&pix); + return nullptr; + } + return pix; +} + +// Converts the given pix to a NetworkIO of height and depth appropriate to the +// given StaticShape: +// If depth == 3, convert to 24 bit color, otherwise normalized grey. 
+// Scale to target height, if the shape's height is > 1, or its depth if the +// height == 1. If height == 0 then no scaling. +// NOTE: It isn't safe for multiple threads to call this on the same pix. +/* static */ +void Input::PreparePixInput(const StaticShape& shape, const Pix* pix, + TRand* randomizer, NetworkIO* input) { + bool color = shape.depth() == 3; + Pix* var_pix = const_cast(pix); + int depth = pixGetDepth(var_pix); + Pix* normed_pix = nullptr; + // On input to BaseAPI, an image is forced to be 1, 8 or 24 bit, without + // colormap, so we just have to deal with depth conversion here. + if (color) { + // Force RGB. + if (depth == 32) + normed_pix = pixClone(var_pix); + else + normed_pix = pixConvertTo32(var_pix); + } else { + // Convert non-8-bit images to 8 bit. + if (depth == 8) + normed_pix = pixClone(var_pix); + else + normed_pix = pixConvertTo8(var_pix, false); + } + int height = pixGetHeight(normed_pix); + int target_height = shape.height(); + if (target_height == 1) target_height = shape.depth(); + if (target_height == 0) target_height = height; + float im_factor = static_cast(target_height) / height; + if (im_factor != 1.0f) { + // Get the scaled image. + Pix* scaled_pix = pixScale(normed_pix, im_factor, im_factor); + pixDestroy(&normed_pix); + normed_pix = scaled_pix; + } + input->FromPix(shape, normed_pix, randomizer); + pixDestroy(&normed_pix); +} + +} // namespace tesseract. diff --git a/lstm/input.h b/lstm/input.h new file mode 100644 index 00000000..7a750a56 --- /dev/null +++ b/lstm/input.h @@ -0,0 +1,107 @@ +/////////////////////////////////////////////////////////////////////// +// File: input.h +// Description: Input layer class for neural network implementations. +// Author: Ray Smith +// Created: Thu Mar 13 08:56:26 PDT 2014 +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_LSTM_INPUT_H_ +#define TESSERACT_LSTM_INPUT_H_ + +#include "network.h" + +class ScrollView; + +namespace tesseract { + +class Input : public Network { + public: + Input(const STRING& name, int ni, int no); + Input(const STRING& name, const StaticShape& shape); + virtual ~Input(); + + virtual STRING spec() const { + STRING spec; + spec.add_str_int("", shape_.batch()); + spec.add_str_int(",", shape_.height()); + spec.add_str_int(",", shape_.width()); + spec.add_str_int(",", shape_.depth()); + return spec; + } + + // Returns the required shape input to the network. + virtual StaticShape InputShape() const { return shape_; } + // Returns the shape output from the network given an input shape (which may + // be partially unknown ie zero). + virtual StaticShape OutputShape(const StaticShape& input_shape) const { + return shape_; + } + // Writes to the given file. Returns false in case of error. + // Should be overridden by subclasses, but called by their Serialize. + virtual bool Serialize(TFile* fp) const; + // Reads from the given file. Returns false in case of error. + // If swap is true, assumes a big/little-endian swap is needed. + // Should be overridden by subclasses, but NOT called by their DeSerialize. + virtual bool DeSerialize(bool swap, TFile* fp); + + // Returns an integer reduction factor that the network applies to the + // time sequence. Assumes that any 2-d is already eliminated. Used for + // scaling bounding boxes of truth data. 
+ // WARNING: if GlobalMinimax is used to vary the scale, this will return + // the last used scale factor. Call it before any forward, and it will return + // the minimum scale factor of the paths through the GlobalMinimax. + virtual int XScaleFactor() const; + + // Provides the (minimum) x scale factor to the network (of interest only to + // input units) so they can determine how to scale bounding boxes. + virtual void CacheXScaleFactor(int factor); + + // Runs forward propagation of activations on the input line. + // See Network for a detailed discussion of the arguments. + virtual void Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output); + + // Runs backward propagation of errors on the deltas line. + // See Network for a detailed discussion of the arguments. + virtual bool Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas); + // Creates and returns a Pix of appropriate size for the network from the + // image_data. If non-null, *image_scale returns the image scale factor used. + // Returns nullptr on error. + /* static */ + static Pix* PrepareLSTMInputs(const ImageData& image_data, + const Network* network, int min_width, + TRand* randomizer, float* image_scale); + // Converts the given pix to a NetworkIO of height and depth appropriate to + // the given StaticShape: + // If depth == 3, convert to 24 bit color, otherwise normalized grey. + // Scale to target height, if the shape's height is > 1, or its depth if the + // height == 1. If height == 0 then no scaling. + // NOTE: It isn't safe for multiple threads to call this on the same pix. + static void PreparePixInput(const StaticShape& shape, const Pix* pix, + TRand* randomizer, NetworkIO* input); + + private: + // Input shape determines how images are dealt with. + StaticShape shape_; + // Cached total network x scale factor for scaling bounding boxes. 
+ int cached_x_scale_; +}; + +} // namespace tesseract. + +#endif // TESSERACT_LSTM_INPUT_H_ + diff --git a/lstm/lstm.cpp b/lstm/lstm.cpp new file mode 100644 index 00000000..9fe16cf8 --- /dev/null +++ b/lstm/lstm.cpp @@ -0,0 +1,728 @@ +/////////////////////////////////////////////////////////////////////// +// File: lstm.cpp +// Description: Long-term-short-term-memory Recurrent neural network. +// Author: Ray Smith +// Created: Wed May 01 17:43:06 PST 2013 +// +// (C) Copyright 2013, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "lstm.h" + +#ifdef _OPENMP +#include +#endif +#include +#include + +#include "fullyconnected.h" +#include "functions.h" +#include "networkscratch.h" +#include "tprintf.h" + +// Macros for openmp code if it is available, otherwise empty macros. +#ifdef _OPENMP +#define PARALLEL_IF_OPENMP(__num_threads) \ + PRAGMA(omp parallel if (__num_threads > 1) num_threads(__num_threads)) { \ + PRAGMA(omp sections nowait) { \ + PRAGMA(omp section) { +#define SECTION_IF_OPENMP \ + } \ + PRAGMA(omp section) \ + { + +#define END_PARALLEL_IF_OPENMP \ + } \ + } /* end of sections */ \ + } /* end of parallel section */ + +// Define the portable PRAGMA macro. 
+#ifdef _MSC_VER // Different _Pragma +#define PRAGMA(x) __pragma(x) +#else +#define PRAGMA(x) _Pragma(#x) +#endif // _MSC_VER + +#else // _OPENMP +#define PARALLEL_IF_OPENMP(__num_threads) +#define SECTION_IF_OPENMP +#define END_PARALLEL_IF_OPENMP +#endif // _OPENMP + + +namespace tesseract { + +// Max absolute value of state_. It is reasonably high to enable the state +// to count things. +const double kStateClip = 100.0; +// Max absolute value of gate_errors (the gradients). +const double kErrClip = 1.0f; + +LSTM::LSTM(const STRING& name, int ni, int ns, int no, bool two_dimensional, + NetworkType type) + : Network(type, name, ni, no), + na_(ni + ns), + ns_(ns), + nf_(0), + is_2d_(two_dimensional), + softmax_(NULL), + input_width_(0) { + if (two_dimensional) na_ += ns_; + if (type_ == NT_LSTM || type_ == NT_LSTM_SUMMARY) { + nf_ = 0; + // networkbuilder ensures this is always true. + ASSERT_HOST(no == ns); + } else if (type_ == NT_LSTM_SOFTMAX || type_ == NT_LSTM_SOFTMAX_ENCODED) { + nf_ = type_ == NT_LSTM_SOFTMAX ? no_ : IntCastRounded(ceil(log2(no_))); + softmax_ = new FullyConnected("LSTM Softmax", ns_, no_, NT_SOFTMAX); + } else { + tprintf("%d is invalid type of LSTM!\n", type); + ASSERT_HOST(false); + } + na_ += nf_; +} + +LSTM::~LSTM() { delete softmax_; } + +// Returns the shape output from the network given an input shape (which may +// be partially unknown ie zero). +StaticShape LSTM::OutputShape(const StaticShape& input_shape) const { + StaticShape result = input_shape; + result.set_depth(no_); + if (type_ == NT_LSTM_SUMMARY) result.set_width(1); + if (softmax_ != NULL) return softmax_->OutputShape(result); + return result; +} + +// Suspends/Enables training by setting the training_ flag. Serialize and +// DeSerialize only operate on the run-time data if state is false. 
+void LSTM::SetEnableTraining(TrainingState state) { + if (state == TS_RE_ENABLE) { + if (training_ == TS_DISABLED) { + for (int w = 0; w < WT_COUNT; ++w) { + if (w == GFS && !Is2D()) continue; + gate_weights_[w].InitBackward(false); + } + } + training_ = TS_ENABLED; + } else { + training_ = state; + } + if (softmax_ != NULL) softmax_->SetEnableTraining(state); +} + +// Sets up the network for training. Initializes weights using weights of +// scale `range` picked according to the random number generator `randomizer`. +int LSTM::InitWeights(float range, TRand* randomizer) { + Network::SetRandomizer(randomizer); + num_weights_ = 0; + for (int w = 0; w < WT_COUNT; ++w) { + if (w == GFS && !Is2D()) continue; + num_weights_ += gate_weights_[w].InitWeightsFloat( + ns_, na_ + 1, TestFlag(NF_ADA_GRAD), range, randomizer); + } + if (softmax_ != NULL) { + num_weights_ += softmax_->InitWeights(range, randomizer); + } + return num_weights_; +} + +// Converts a float network to an int network. +void LSTM::ConvertToInt() { + for (int w = 0; w < WT_COUNT; ++w) { + if (w == GFS && !Is2D()) continue; + gate_weights_[w].ConvertToInt(); + } + if (softmax_ != NULL) { + softmax_->ConvertToInt(); + } +} + +// Sets up the network for training using the given weight_range. +void LSTM::DebugWeights() { + for (int w = 0; w < WT_COUNT; ++w) { + if (w == GFS && !Is2D()) continue; + STRING msg = name_; + msg.add_str_int(" Gate weights ", w); + gate_weights_[w].Debug2D(msg.string()); + } + if (softmax_ != NULL) { + softmax_->DebugWeights(); + } +} + +// Writes to the given file. Returns false in case of error. 
+bool LSTM::Serialize(TFile* fp) const { + if (!Network::Serialize(fp)) return false; + if (fp->FWrite(&na_, sizeof(na_), 1) != 1) return false; + for (int w = 0; w < WT_COUNT; ++w) { + if (w == GFS && !Is2D()) continue; + if (!gate_weights_[w].Serialize(IsTraining(), fp)) return false; + } + if (softmax_ != NULL && !softmax_->Serialize(fp)) return false; + return true; +} + +// Reads from the given file. Returns false in case of error. +// If swap is true, assumes a big/little-endian swap is needed. +bool LSTM::DeSerialize(bool swap, TFile* fp) { + if (fp->FRead(&na_, sizeof(na_), 1) != 1) return false; + if (swap) ReverseN(&na_, sizeof(na_)); + if (type_ == NT_LSTM_SOFTMAX) { + nf_ = no_; + } else if (type_ == NT_LSTM_SOFTMAX_ENCODED) { + nf_ = IntCastRounded(ceil(log2(no_))); + } else { + nf_ = 0; + } + is_2d_ = false; + for (int w = 0; w < WT_COUNT; ++w) { + if (w == GFS && !Is2D()) continue; + if (!gate_weights_[w].DeSerialize(IsTraining(), swap, fp)) return false; + if (w == CI) { + ns_ = gate_weights_[CI].NumOutputs(); + is_2d_ = na_ - nf_ == ni_ + 2 * ns_; + } + } + delete softmax_; + if (type_ == NT_LSTM_SOFTMAX || type_ == NT_LSTM_SOFTMAX_ENCODED) { + softmax_ = + reinterpret_cast(Network::CreateFromFile(swap, fp)); + if (softmax_ == NULL) return false; + } else { + softmax_ = NULL; + } + return true; +} + +// Runs forward propagation of activations on the input line. +// See NetworkCpp for a detailed discussion of the arguments. +void LSTM::Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output) { + input_map_ = input.stride_map(); + input_width_ = input.Width(); + if (softmax_ != NULL) + output->ResizeFloat(input, no_); + else if (type_ == NT_LSTM_SUMMARY) + output->ResizeXTo1(input, no_); + else + output->Resize(input, no_); + ResizeForward(input); + // Temporary storage of forward computation for each gate. 
+ NetworkScratch::FloatVec temp_lines[WT_COUNT]; + for (int i = 0; i < WT_COUNT; ++i) temp_lines[i].Init(ns_, scratch); + // Single timestep buffers for the current/recurrent output and state. + NetworkScratch::FloatVec curr_state, curr_output; + curr_state.Init(ns_, scratch); + ZeroVector(ns_, curr_state); + curr_output.Init(ns_, scratch); + ZeroVector(ns_, curr_output); + // Rotating buffers of width buf_width allow storage of the state and output + // for the other dimension, used only when working in true 2D mode. The width + // is enough to hold an entire strip of the major direction. + int buf_width = Is2D() ? input_map_.Size(FD_WIDTH) : 1; + GenericVector states, outputs; + if (Is2D()) { + states.init_to_size(buf_width, NetworkScratch::FloatVec()); + outputs.init_to_size(buf_width, NetworkScratch::FloatVec()); + for (int i = 0; i < buf_width; ++i) { + states[i].Init(ns_, scratch); + ZeroVector(ns_, states[i]); + outputs[i].Init(ns_, scratch); + ZeroVector(ns_, outputs[i]); + } + } + // Used only if a softmax LSTM. + NetworkScratch::FloatVec softmax_output; + NetworkScratch::IO int_output; + if (softmax_ != NULL) { + softmax_output.Init(no_, scratch); + ZeroVector(no_, softmax_output); + if (input.int_mode()) int_output.Resize2d(true, 1, ns_, scratch); + softmax_->SetupForward(input, NULL); + } + NetworkScratch::FloatVec curr_input; + curr_input.Init(na_, scratch); + StrideMap::Index src_index(input_map_); + // Used only by NT_LSTM_SUMMARY. + StrideMap::Index dest_index(output->stride_map()); + do { + int t = src_index.t(); + // True if there is a valid old state for the 2nd dimension. + bool valid_2d = Is2D(); + if (valid_2d) { + StrideMap::Index dim_index(src_index); + if (!dim_index.AddOffset(-1, FD_HEIGHT)) valid_2d = false; + } + // Index of the 2-D revolving buffers (outputs, states). + int mod_t = Modulo(t, buf_width); // Current timestep. + // Setup the padded input in source. 
+ source_.CopyTimeStepGeneral(t, 0, ni_, input, t, 0); + if (softmax_ != NULL) { + source_.WriteTimeStepPart(t, ni_, nf_, softmax_output); + } + source_.WriteTimeStepPart(t, ni_ + nf_, ns_, curr_output); + if (Is2D()) + source_.WriteTimeStepPart(t, ni_ + nf_ + ns_, ns_, outputs[mod_t]); + if (!source_.int_mode()) source_.ReadTimeStep(t, curr_input); + // Matrix multiply the inputs with the source. + PARALLEL_IF_OPENMP(GFS) + // It looks inefficient to create the threads on each t iteration, but the + // alternative of putting the parallel outside the t loop, a single around + // the t-loop and then tasks in place of the sections is a *lot* slower. + // Cell inputs. + if (source_.int_mode()) + gate_weights_[CI].MatrixDotVector(source_.i(t), temp_lines[CI]); + else + gate_weights_[CI].MatrixDotVector(curr_input, temp_lines[CI]); + FuncInplace(ns_, temp_lines[CI]); + + SECTION_IF_OPENMP + // Input Gates. + if (source_.int_mode()) + gate_weights_[GI].MatrixDotVector(source_.i(t), temp_lines[GI]); + else + gate_weights_[GI].MatrixDotVector(curr_input, temp_lines[GI]); + FuncInplace(ns_, temp_lines[GI]); + + SECTION_IF_OPENMP + // 1-D forget gates. + if (source_.int_mode()) + gate_weights_[GF1].MatrixDotVector(source_.i(t), temp_lines[GF1]); + else + gate_weights_[GF1].MatrixDotVector(curr_input, temp_lines[GF1]); + FuncInplace(ns_, temp_lines[GF1]); + + // 2-D forget gates. + if (Is2D()) { + if (source_.int_mode()) + gate_weights_[GFS].MatrixDotVector(source_.i(t), temp_lines[GFS]); + else + gate_weights_[GFS].MatrixDotVector(curr_input, temp_lines[GFS]); + FuncInplace(ns_, temp_lines[GFS]); + } + + SECTION_IF_OPENMP + // Output gates. + if (source_.int_mode()) + gate_weights_[GO].MatrixDotVector(source_.i(t), temp_lines[GO]); + else + gate_weights_[GO].MatrixDotVector(curr_input, temp_lines[GO]); + FuncInplace(ns_, temp_lines[GO]); + END_PARALLEL_IF_OPENMP + + // Apply forget gate to state. 
+ MultiplyVectorsInPlace(ns_, temp_lines[GF1], curr_state); + if (Is2D()) { + // Max-pool the forget gates (in 2-d) instead of blindly adding. + inT8* which_fg_col = which_fg_[t]; + memset(which_fg_col, 1, ns_ * sizeof(which_fg_col[0])); + if (valid_2d) { + const double* stepped_state = states[mod_t]; + for (int i = 0; i < ns_; ++i) { + if (temp_lines[GF1][i] < temp_lines[GFS][i]) { + curr_state[i] = temp_lines[GFS][i] * stepped_state[i]; + which_fg_col[i] = 2; + } + } + } + } + MultiplyAccumulate(ns_, temp_lines[CI], temp_lines[GI], curr_state); + // Clip curr_state to a sane range. + ClipVector(ns_, -kStateClip, kStateClip, curr_state); + if (IsTraining()) { + // Save the gate node values. + node_values_[CI].WriteTimeStep(t, temp_lines[CI]); + node_values_[GI].WriteTimeStep(t, temp_lines[GI]); + node_values_[GF1].WriteTimeStep(t, temp_lines[GF1]); + node_values_[GO].WriteTimeStep(t, temp_lines[GO]); + if (Is2D()) node_values_[GFS].WriteTimeStep(t, temp_lines[GFS]); + } + FuncMultiply(curr_state, temp_lines[GO], ns_, curr_output); + if (IsTraining()) state_.WriteTimeStep(t, curr_state); + if (softmax_ != NULL) { + if (input.int_mode()) { + int_output->WriteTimeStep(0, curr_output); + softmax_->ForwardTimeStep(NULL, int_output->i(0), t, softmax_output); + } else { + softmax_->ForwardTimeStep(curr_output, NULL, t, softmax_output); + } + output->WriteTimeStep(t, softmax_output); + if (type_ == NT_LSTM_SOFTMAX_ENCODED) { + CodeInBinary(no_, nf_, softmax_output); + } + } else if (type_ == NT_LSTM_SUMMARY) { + // Output only at the end of a row. + if (src_index.IsLast(FD_WIDTH)) { + output->WriteTimeStep(dest_index.t(), curr_output); + dest_index.Increment(); + } + } else { + output->WriteTimeStep(t, curr_output); + } + // Save states for use by the 2nd dimension only if needed. 
+ if (Is2D()) { + CopyVector(ns_, curr_state, states[mod_t]); + CopyVector(ns_, curr_output, outputs[mod_t]); + } + // Always zero the states at the end of every row, but only for the major + // direction. The 2-D state remains intact. + if (src_index.IsLast(FD_WIDTH)) { + ZeroVector(ns_, curr_state); + ZeroVector(ns_, curr_output); + } + } while (src_index.Increment()); +#if DEBUG_DETAIL > 0 + tprintf("Source:%s\n", name_.string()); + source_.Print(10); + tprintf("State:%s\n", name_.string()); + state_.Print(10); + tprintf("Output:%s\n", name_.string()); + output->Print(10); +#endif + if (debug) DisplayForward(*output); +} + +// Runs backward propagation of errors on the deltas line. +// See NetworkCpp for a detailed discussion of the arguments. +bool LSTM::Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas) { + if (debug) DisplayBackward(fwd_deltas); + back_deltas->ResizeToMap(fwd_deltas.int_mode(), input_map_, ni_); + // ======Scratch space.====== + // Output errors from deltas with recurrence from sourceerr. + NetworkScratch::FloatVec outputerr; + outputerr.Init(ns_, scratch); + // Recurrent error in the state/source. + NetworkScratch::FloatVec curr_stateerr, curr_sourceerr; + curr_stateerr.Init(ns_, scratch); + curr_sourceerr.Init(na_, scratch); + ZeroVector(ns_, curr_stateerr); + ZeroVector(na_, curr_sourceerr); + // Errors in the gates. + NetworkScratch::FloatVec gate_errors[WT_COUNT]; + for (int g = 0; g < WT_COUNT; ++g) gate_errors[g].Init(ns_, scratch); + // Rotating buffers of width buf_width allow storage of the recurrent time- + // steps used only for true 2-D. Stores one full strip of the major direction. + int buf_width = Is2D() ? 
input_map_.Size(FD_WIDTH) : 1; + GenericVector stateerr, sourceerr; + if (Is2D()) { + stateerr.init_to_size(buf_width, NetworkScratch::FloatVec()); + sourceerr.init_to_size(buf_width, NetworkScratch::FloatVec()); + for (int t = 0; t < buf_width; ++t) { + stateerr[t].Init(ns_, scratch); + sourceerr[t].Init(na_, scratch); + ZeroVector(ns_, stateerr[t]); + ZeroVector(na_, sourceerr[t]); + } + } + // Parallel-generated sourceerr from each of the gates. + NetworkScratch::FloatVec sourceerr_temps[WT_COUNT]; + for (int w = 0; w < WT_COUNT; ++w) + sourceerr_temps[w].Init(na_, scratch); + int width = input_width_; + // Transposed gate errors stored over all timesteps for sum outer. + NetworkScratch::GradientStore gate_errors_t[WT_COUNT]; + for (int w = 0; w < WT_COUNT; ++w) { + gate_errors_t[w].Init(ns_, width, scratch); + } + // Used only if softmax_ != NULL. + NetworkScratch::FloatVec softmax_errors; + NetworkScratch::GradientStore softmax_errors_t; + if (softmax_ != NULL) { + softmax_errors.Init(no_, scratch); + softmax_errors_t.Init(no_, width, scratch); + } + double state_clip = Is2D() ? 9.0 : 4.0; +#if DEBUG_DETAIL > 1 + tprintf("fwd_deltas:%s\n", name_.string()); + fwd_deltas.Print(10); +#endif + StrideMap::Index dest_index(input_map_); + dest_index.InitToLast(); + // Used only by NT_LSTM_SUMMARY. + StrideMap::Index src_index(fwd_deltas.stride_map()); + src_index.InitToLast(); + do { + int t = dest_index.t(); + bool at_last_x = dest_index.IsLast(FD_WIDTH); + // up_pos is the 2-D back step, down_pos is the 2-D fwd step, and are only + // valid if >= 0, which is true if 2d and not on the top/bottom. 
+ int up_pos = -1; + int down_pos = -1; + if (Is2D()) { + if (dest_index.index(FD_HEIGHT) > 0) { + StrideMap::Index up_index(dest_index); + if (up_index.AddOffset(-1, FD_HEIGHT)) up_pos = up_index.t(); + } + if (!dest_index.IsLast(FD_HEIGHT)) { + StrideMap::Index down_index(dest_index); + if (down_index.AddOffset(1, FD_HEIGHT)) down_pos = down_index.t(); + } + } + // Index of the 2-D revolving buffers (sourceerr, stateerr). + int mod_t = Modulo(t, buf_width); // Current timestep. + // Zero the state in the major direction only at the end of every row. + if (at_last_x) { + ZeroVector(na_, curr_sourceerr); + ZeroVector(ns_, curr_stateerr); + } + // Setup the outputerr. + if (type_ == NT_LSTM_SUMMARY) { + if (dest_index.IsLast(FD_WIDTH)) { + fwd_deltas.ReadTimeStep(src_index.t(), outputerr); + src_index.Decrement(); + } else { + ZeroVector(ns_, outputerr); + } + } else if (softmax_ == NULL) { + fwd_deltas.ReadTimeStep(t, outputerr); + } else { + softmax_->BackwardTimeStep(fwd_deltas, t, softmax_errors, + softmax_errors_t.get(), outputerr); + } + if (!at_last_x) + AccumulateVector(ns_, curr_sourceerr + ni_ + nf_, outputerr); + if (down_pos >= 0) + AccumulateVector(ns_, sourceerr[mod_t] + ni_ + nf_ + ns_, outputerr); + // Apply the 1-d forget gates. + if (!at_last_x) { + const float* next_node_gf1 = node_values_[GF1].f(t + 1); + for (int i = 0; i < ns_; ++i) { + curr_stateerr[i] *= next_node_gf1[i]; + } + } + if (Is2D() && t + 1 < width) { + for (int i = 0; i < ns_; ++i) { + if (which_fg_[t + 1][i] != 1) curr_stateerr[i] = 0.0; + } + if (down_pos >= 0) { + const float* right_node_gfs = node_values_[GFS].f(down_pos); + const double* right_stateerr = stateerr[mod_t]; + for (int i = 0; i < ns_; ++i) { + if (which_fg_[down_pos][i] == 2) { + curr_stateerr[i] += right_stateerr[i] * right_node_gfs[i]; + } + } + } + } + state_.FuncMultiply3Add(node_values_[GO], t, outputerr, + curr_stateerr); + // Clip stateerr_ to a sane range. 
+ ClipVector(ns_, -state_clip, state_clip, curr_stateerr); +#if DEBUG_DETAIL > 1 + if (t + 10 > width) { + tprintf("t=%d, stateerr=", t); + for (int i = 0; i < ns_; ++i) + tprintf(" %g,%g,%g", curr_stateerr[i], outputerr[i], + curr_sourceerr[ni_ + nf_ + i]); + tprintf("\n"); + } +#endif + // Matrix multiply to get the source errors. + PARALLEL_IF_OPENMP(GFS) + + // Cell inputs. + node_values_[CI].FuncMultiply3(t, node_values_[GI], t, + curr_stateerr, gate_errors[CI]); + ClipVector(ns_, -kErrClip, kErrClip, gate_errors[CI].get()); + gate_weights_[CI].VectorDotMatrix(gate_errors[CI], sourceerr_temps[CI]); + gate_errors_t[CI].get()->WriteStrided(t, gate_errors[CI]); + + SECTION_IF_OPENMP + // Input Gates. + node_values_[GI].FuncMultiply3(t, node_values_[CI], t, + curr_stateerr, gate_errors[GI]); + ClipVector(ns_, -kErrClip, kErrClip, gate_errors[GI].get()); + gate_weights_[GI].VectorDotMatrix(gate_errors[GI], sourceerr_temps[GI]); + gate_errors_t[GI].get()->WriteStrided(t, gate_errors[GI]); + + SECTION_IF_OPENMP + // 1-D forget Gates. + if (t > 0) { + node_values_[GF1].FuncMultiply3(t, state_, t - 1, curr_stateerr, + gate_errors[GF1]); + ClipVector(ns_, -kErrClip, kErrClip, gate_errors[GF1].get()); + gate_weights_[GF1].VectorDotMatrix(gate_errors[GF1], + sourceerr_temps[GF1]); + } else { + memset(gate_errors[GF1], 0, ns_ * sizeof(gate_errors[GF1][0])); + memset(sourceerr_temps[GF1], 0, na_ * sizeof(*sourceerr_temps[GF1])); + } + gate_errors_t[GF1].get()->WriteStrided(t, gate_errors[GF1]); + + // 2-D forget Gates. 
+ if (up_pos >= 0) { + node_values_[GFS].FuncMultiply3(t, state_, up_pos, curr_stateerr, + gate_errors[GFS]); + ClipVector(ns_, -kErrClip, kErrClip, gate_errors[GFS].get()); + gate_weights_[GFS].VectorDotMatrix(gate_errors[GFS], + sourceerr_temps[GFS]); + } else { + memset(gate_errors[GFS], 0, ns_ * sizeof(gate_errors[GFS][0])); + memset(sourceerr_temps[GFS], 0, na_ * sizeof(*sourceerr_temps[GFS])); + } + if (Is2D()) gate_errors_t[GFS].get()->WriteStrided(t, gate_errors[GFS]); + + SECTION_IF_OPENMP + // Output gates. + state_.Func2Multiply3(node_values_[GO], t, outputerr, + gate_errors[GO]); + ClipVector(ns_, -kErrClip, kErrClip, gate_errors[GO].get()); + gate_weights_[GO].VectorDotMatrix(gate_errors[GO], sourceerr_temps[GO]); + gate_errors_t[GO].get()->WriteStrided(t, gate_errors[GO]); + END_PARALLEL_IF_OPENMP + + SumVectors(na_, sourceerr_temps[CI], sourceerr_temps[GI], + sourceerr_temps[GF1], sourceerr_temps[GO], sourceerr_temps[GFS], + curr_sourceerr); + back_deltas->WriteTimeStep(t, curr_sourceerr); + // Save states for use by the 2nd dimension only if needed. + if (Is2D()) { + CopyVector(ns_, curr_stateerr, stateerr[mod_t]); + CopyVector(na_, curr_sourceerr, sourceerr[mod_t]); + } + } while (dest_index.Decrement()); +#if DEBUG_DETAIL > 2 + for (int w = 0; w < WT_COUNT; ++w) { + tprintf("%s gate errors[%d]\n", name_.string(), w); + gate_errors_t[w].get()->PrintUnTransposed(10); + } +#endif + // Transposed source_ used to speed-up SumOuter. 
+ NetworkScratch::GradientStore source_t, state_t; + source_t.Init(na_, width, scratch); + source_.Transpose(source_t.get()); + state_t.Init(ns_, width, scratch); + state_.Transpose(state_t.get()); +#ifdef _OPENMP +#pragma omp parallel for num_threads(GFS) if (!Is2D()) +#endif + for (int w = 0; w < WT_COUNT; ++w) { + if (w == GFS && !Is2D()) continue; + gate_weights_[w].SumOuterTransposed(*gate_errors_t[w], *source_t, false); + } + if (softmax_ != NULL) { + softmax_->FinishBackward(*softmax_errors_t); + } + if (needs_to_backprop_) { + // Normalize the inputerr in back_deltas. + back_deltas->CopyWithNormalization(*back_deltas, fwd_deltas); + return true; + } + return false; +} + +// Updates the weights using the given learning rate and momentum. +// num_samples is the quotient to be used in the adagrad computation iff +// use_ada_grad_ is true. +void LSTM::Update(float learning_rate, float momentum, int num_samples) { +#if DEBUG_DETAIL > 3 + PrintW(); +#endif + for (int w = 0; w < WT_COUNT; ++w) { + if (w == GFS && !Is2D()) continue; + gate_weights_[w].Update(learning_rate, momentum, num_samples); + } + if (softmax_ != NULL) { + softmax_->Update(learning_rate, momentum, num_samples); + } +#if DEBUG_DETAIL > 3 + PrintDW(); +#endif +} + +// Sums the products of weight updates in *this and other, splitting into +// positive (same direction) in *same and negative (different direction) in +// *changed. +void LSTM::CountAlternators(const Network& other, double* same, + double* changed) const { + ASSERT_HOST(other.type() == type_); + const LSTM* lstm = reinterpret_cast(&other); + for (int w = 0; w < WT_COUNT; ++w) { + if (w == GFS && !Is2D()) continue; + gate_weights_[w].CountAlternators(lstm->gate_weights_[w], same, changed); + } + if (softmax_ != NULL) { + softmax_->CountAlternators(*lstm->softmax_, same, changed); + } +} + +// Prints the weights for debug purposes. 
+void LSTM::PrintW() { + tprintf("Weight state:%s\n", name_.string()); + for (int w = 0; w < WT_COUNT; ++w) { + if (w == GFS && !Is2D()) continue; + tprintf("Gate %d, inputs\n", w); + for (int i = 0; i < ni_; ++i) { + tprintf("Row %d:", i); + for (int s = 0; s < ns_; ++s) + tprintf(" %g", gate_weights_[w].GetWeights(s)[i]); + tprintf("\n"); + } + tprintf("Gate %d, outputs\n", w); + for (int i = ni_; i < ni_ + ns_; ++i) { + tprintf("Row %d:", i - ni_); + for (int s = 0; s < ns_; ++s) + tprintf(" %g", gate_weights_[w].GetWeights(s)[i]); + tprintf("\n"); + } + tprintf("Gate %d, bias\n", w); + for (int s = 0; s < ns_; ++s) + tprintf(" %g", gate_weights_[w].GetWeights(s)[na_]); + tprintf("\n"); + } +} + +// Prints the weight deltas for debug purposes. +void LSTM::PrintDW() { + tprintf("Delta state:%s\n", name_.string()); + for (int w = 0; w < WT_COUNT; ++w) { + if (w == GFS && !Is2D()) continue; + tprintf("Gate %d, inputs\n", w); + for (int i = 0; i < ni_; ++i) { + tprintf("Row %d:", i); + for (int s = 0; s < ns_; ++s) + tprintf(" %g", gate_weights_[w].GetDW(s, i)); + tprintf("\n"); + } + tprintf("Gate %d, outputs\n", w); + for (int i = ni_; i < ni_ + ns_; ++i) { + tprintf("Row %d:", i - ni_); + for (int s = 0; s < ns_; ++s) + tprintf(" %g", gate_weights_[w].GetDW(s, i)); + tprintf("\n"); + } + tprintf("Gate %d, bias\n", w); + for (int s = 0; s < ns_; ++s) + tprintf(" %g", gate_weights_[w].GetDW(s, na_)); + tprintf("\n"); + } +} + +// Resizes forward data to cope with an input image of the given width. +void LSTM::ResizeForward(const NetworkIO& input) { + source_.Resize(input, na_); + which_fg_.ResizeNoInit(input.Width(), ns_); + if (IsTraining()) { + state_.ResizeFloat(input, ns_); + for (int w = 0; w < WT_COUNT; ++w) { + if (w == GFS && !Is2D()) continue; + node_values_[w].ResizeFloat(input, ns_); + } + } +} + + +} // namespace tesseract. 
diff --git a/lstm/lstm.h b/lstm/lstm.h new file mode 100644 index 00000000..f87fa681 --- /dev/null +++ b/lstm/lstm.h @@ -0,0 +1,161 @@ +/////////////////////////////////////////////////////////////////////// +// File: lstm.h +// Description: Long-term-short-term-memory Recurrent neural network. +// Author: Ray Smith +// Created: Wed May 01 17:33:06 PST 2013 +// +// (C) Copyright 2013, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_LSTM_LSTM_H_ +#define TESSERACT_LSTM_LSTM_H_ + +#include "network.h" +#include "fullyconnected.h" + +namespace tesseract { + +// C++ Implementation of the LSTM class from lstm.py. +class LSTM : public Network { + public: + // Enum for the different weights in LSTM, to reduce some of the I/O and + // setup code to loops. The elements of the enum correspond to elements of an + // array of WeightMatrix or a corresponding array of NetworkIO. + enum WeightType { + CI, // Cell Inputs. + GI, // Gate at the input. + GF1, // Forget gate at the memory (1-d or looking back 1 timestep). + GO, // Gate at the output. + GFS, // Forget gate at the memory, looking back in the other dimension. + + WT_COUNT // Number of WeightTypes. 
+ }; + + // Constructor for NT_LSTM (regular 1 or 2-d LSTM), NT_LSTM_SOFTMAX (LSTM with + // additional softmax layer included and fed back into the input at the next + // timestep), or NT_LSTM_SOFTMAX_ENCODED (as LSTM_SOFTMAX, but the feedback + // is binary encoded instead of categorical) only. + // 2-d and bidi softmax LSTMs are not rejected, but are impossible to build + // in the conventional way because the output feedback both forwards and + // backwards in time does become impossible. + LSTM(const STRING& name, int num_inputs, int num_states, int num_outputs, + bool two_dimensional, NetworkType type); + virtual ~LSTM(); + + // Returns the shape output from the network given an input shape (which may + // be partially unknown ie zero). + virtual StaticShape OutputShape(const StaticShape& input_shape) const; + + virtual STRING spec() const { + STRING spec; + if (type_ == NT_LSTM) + spec.add_str_int("Lfx", ns_); + else if (type_ == NT_LSTM_SUMMARY) + spec.add_str_int("Lfxs", ns_); + else if (type_ == NT_LSTM_SOFTMAX) + spec.add_str_int("LS", ns_); + else if (type_ == NT_LSTM_SOFTMAX_ENCODED) + spec.add_str_int("LE", ns_); + if (softmax_ != NULL) spec += softmax_->spec(); + return spec; + } + + // Suspends/Enables training by setting the training_ flag. Serialize and + // DeSerialize only operate on the run-time data if state is false. + virtual void SetEnableTraining(TrainingState state); + + // Sets up the network for training. Initializes weights using weights of + // scale `range` picked according to the random number generator `randomizer`. + virtual int InitWeights(float range, TRand* randomizer); + + // Converts a float network to an int network. + virtual void ConvertToInt(); + + // Provides debug output on the weights. + virtual void DebugWeights(); + + // Writes to the given file. Returns false in case of error. + virtual bool Serialize(TFile* fp) const; + // Reads from the given file. Returns false in case of error. 
+ // If swap is true, assumes a big/little-endian swap is needed. + virtual bool DeSerialize(bool swap, TFile* fp); + + // Runs forward propagation of activations on the input line. + // See Network for a detailed discussion of the arguments. + virtual void Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output); + + // Runs backward propagation of errors on the deltas line. + // See Network for a detailed discussion of the arguments. + virtual bool Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas); + // Updates the weights using the given learning rate and momentum. + // num_samples is the quotient to be used in the adagrad computation iff + // use_ada_grad_ is true. + virtual void Update(float learning_rate, float momentum, int num_samples); + // Sums the products of weight updates in *this and other, splitting into + // positive (same direction) in *same and negative (different direction) in + // *changed. + virtual void CountAlternators(const Network& other, double* same, + double* changed) const; + // Prints the weights for debug purposes. + void PrintW(); + // Prints the weight deltas for debug purposes. + void PrintDW(); + + // Returns true if this is a 2-d lstm. + bool Is2D() const { + return is_2d_; + } + + private: + // Resizes forward data to cope with an input image of the given width. + void ResizeForward(const NetworkIO& input); + + private: + // Size of padded input to weight matrices = ni_ + no_ for 1-D operation + // and ni_ + 2 * no_ for 2-D operation. Note that there is a phantom 1 input + // for the bias that makes the weight matrices of size [na + 1][no]. + inT32 na_; + // Number of internal states. Equal to no_ except for a softmax LSTM. + // ns_ is NOT serialized, but is calculated from gate_weights_. + inT32 ns_; + // Number of additional feedback states. 
The softmax types feed back + // additional output information on top of the ns_ internal states. + // In the case of a binary-coded (EMBEDDED) softmax, nf_ < no_. + inT32 nf_; + // Flag indicating 2-D operation. + bool is_2d_; + + // Gate weight arrays of size [na + 1, no]. + WeightMatrix gate_weights_[WT_COUNT]; + // Used only if this is a softmax LSTM. + FullyConnected* softmax_; + // Input padded with previous output of size [width, na]. + NetworkIO source_; + // Internal state used during forward operation, of size [width, ns]. + NetworkIO state_; + // State of the 2-d maxpool, generated during forward, used during backward. + GENERIC_2D_ARRAY which_fg_; + // Internal state saved from forward, but used only during backward. + NetworkIO node_values_[WT_COUNT]; + // Preserved input stride_map used for Backward when NT_LSTM_SQUASHED. + StrideMap input_map_; + int input_width_; +}; + +} // namespace tesseract. + + +#endif // TESSERACT_LSTM_LSTM_H_ diff --git a/lstm/lstmrecognizer.cpp b/lstm/lstmrecognizer.cpp new file mode 100644 index 00000000..e4013aec --- /dev/null +++ b/lstm/lstmrecognizer.cpp @@ -0,0 +1,816 @@ +/////////////////////////////////////////////////////////////////////// +// File: lstmrecognizer.cpp +// Description: Top-level line recognizer class for LSTM-based networks. +// Author: Ray Smith +// Created: Thu May 02 10:59:06 PST 2013 +// +// (C) Copyright 2013, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+/////////////////////////////////////////////////////////////////////// + +// Include automatically generated configuration file if running autoconf. +#ifdef HAVE_CONFIG_H +#include "config_auto.h" +#endif + +#include "lstmrecognizer.h" + +#include "allheaders.h" +#include "callcpp.h" +#include "dict.h" +#include "genericheap.h" +#include "helpers.h" +#include "imagedata.h" +#include "input.h" +#include "lstm.h" +#include "normalis.h" +#include "pageres.h" +#include "ratngs.h" +#include "recodebeam.h" +#include "scrollview.h" +#include "shapetable.h" +#include "statistc.h" +#include "tprintf.h" + +namespace tesseract { + +// Max number of blob choices to return in any given position. +const int kMaxChoices = 4; +// Default ratio between dict and non-dict words. +const double kDictRatio = 2.25; +// Default certainty offset to give the dictionary a chance. +const double kCertOffset = -0.085; + +LSTMRecognizer::LSTMRecognizer() + : network_(NULL), + training_flags_(0), + training_iteration_(0), + sample_iteration_(0), + null_char_(UNICHAR_BROKEN), + weight_range_(0.0f), + learning_rate_(0.0f), + momentum_(0.0f), + dict_(NULL), + search_(NULL), + debug_win_(NULL) {} + +LSTMRecognizer::~LSTMRecognizer() { + delete network_; + delete dict_; + delete search_; +} + +// Writes to the given file. Returns false in case of error. 
+bool LSTMRecognizer::Serialize(TFile* fp) const { + if (!network_->Serialize(fp)) return false; + if (!GetUnicharset().save_to_file(fp)) return false; + if (!network_str_.Serialize(fp)) return false; + if (fp->FWrite(&training_flags_, sizeof(training_flags_), 1) != 1) + return false; + if (fp->FWrite(&training_iteration_, sizeof(training_iteration_), 1) != 1) + return false; + if (fp->FWrite(&sample_iteration_, sizeof(sample_iteration_), 1) != 1) + return false; + if (fp->FWrite(&null_char_, sizeof(null_char_), 1) != 1) return false; + if (fp->FWrite(&weight_range_, sizeof(weight_range_), 1) != 1) return false; + if (fp->FWrite(&learning_rate_, sizeof(learning_rate_), 1) != 1) return false; + if (fp->FWrite(&momentum_, sizeof(momentum_), 1) != 1) return false; + if (IsRecoding() && !recoder_.Serialize(fp)) return false; + return true; +} + +// Reads from the given file. Returns false in case of error. +// If swap is true, assumes a big/little-endian swap is needed. +bool LSTMRecognizer::DeSerialize(bool swap, TFile* fp) { + delete network_; + network_ = Network::CreateFromFile(swap, fp); + if (network_ == NULL) return false; + if (!ccutil_.unicharset.load_from_file(fp, false)) return false; + if (!network_str_.DeSerialize(swap, fp)) return false; + if (fp->FRead(&training_flags_, sizeof(training_flags_), 1) != 1) + return false; + if (fp->FRead(&training_iteration_, sizeof(training_iteration_), 1) != 1) + return false; + if (fp->FRead(&sample_iteration_, sizeof(sample_iteration_), 1) != 1) + return false; + if (fp->FRead(&null_char_, sizeof(null_char_), 1) != 1) return false; + if (fp->FRead(&weight_range_, sizeof(weight_range_), 1) != 1) return false; + if (fp->FRead(&learning_rate_, sizeof(learning_rate_), 1) != 1) return false; + if (fp->FRead(&momentum_, sizeof(momentum_), 1) != 1) return false; + if (IsRecoding()) { + if (!recoder_.DeSerialize(swap, fp)) return false; + RecodedCharID code; + recoder_.EncodeUnichar(UNICHAR_SPACE, &code); + if (code(0) != 
UNICHAR_SPACE) { + tprintf("Space was garbled in recoding!!\n"); + return false; + } + } + // TODO(rays) swaps! + network_->SetRandomizer(&randomizer_); + network_->CacheXScaleFactor(network_->XScaleFactor()); + return true; +} + +// Loads the dictionary if possible from the traineddata file. +// Prints a warning message, and returns false but otherwise fails silently +// and continues to work without it if loading fails. +// Note that dictionary load is independent from DeSerialize, but dependent +// on the unicharset matching. This enables training to deserialize a model +// from checkpoint or restore without having to go back and reload the +// dictionary. +bool LSTMRecognizer::LoadDictionary(const char* lang, TessdataManager* mgr) { + delete dict_; + dict_ = new Dict(&ccutil_); + dict_->SetupForLoad(Dict::GlobalDawgCache()); + dict_->LoadLSTM(lang, mgr); + if (dict_->FinishLoad()) return true; // Success. + tprintf("Failed to load any lstm-specific dictionaries for lang %s!!\n", + lang); + delete dict_; + dict_ = NULL; + return false; +} + +// Recognizes the line image, contained within image_data, returning the +// ratings matrix and matching box_word for each WERD_RES in the output. +void LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert, + bool debug, double worst_dict_cert, + bool use_alternates, + const UNICHARSET* target_unicharset, + const TBOX& line_box, float score_ratio, + bool one_word, + PointerVector* words) { + NetworkIO outputs; + float label_threshold = use_alternates ? 
0.75f : 0.0f; + float scale_factor; + NetworkIO inputs; + if (!RecognizeLine(image_data, invert, debug, false, label_threshold, + &scale_factor, &inputs, &outputs)) + return; + if (IsRecoding()) { + if (search_ == NULL) { + search_ = + new RecodeBeamSearch(recoder_, null_char_, SimpleTextOutput(), dict_); + } + search_->Decode(outputs, kDictRatio, kCertOffset, worst_dict_cert, NULL); + search_->ExtractBestPathAsWords(line_box, scale_factor, debug, + &GetUnicharset(), words); + } else { + GenericVector label_coords; + GenericVector labels; + LabelsFromOutputs(outputs, label_threshold, &labels, &label_coords); + WordsFromOutputs(outputs, labels, label_coords, line_box, debug, + use_alternates, one_word, score_ratio, scale_factor, + target_unicharset, words); + } +} + +// Builds a set of tesseract-compatible WERD_RESs aligned to line_box, +// corresponding to the network output in outputs, labels, label_coords. +// one_word generates a single word output, that may include spaces inside. +// use_alternates generates alternative BLOB_CHOICEs and segmentation paths. +// If not NULL, we attempt to translate the output to target_unicharset, but do +// not guarantee success, due to mismatches. In that case the output words are +// marked with our UNICHARSET, not the caller's. +void LSTMRecognizer::WordsFromOutputs( + const NetworkIO& outputs, const GenericVector& labels, + const GenericVector label_coords, const TBOX& line_box, bool debug, + bool use_alternates, bool one_word, float score_ratio, float scale_factor, + const UNICHARSET* target_unicharset, PointerVector* words) { + // Convert labels to unichar-ids. 
+ int word_end = 0; + float prev_space_cert = 0.0f; + for (int i = 0; i < labels.size(); i = word_end) { + word_end = i + 1; + if (labels[i] == null_char_ || labels[i] == UNICHAR_SPACE) { + continue; + } + float space_cert = 0.0f; + if (one_word) { + word_end = labels.size(); + } else { + // Find the end of the word at the first null_char_ that leads to the + // first UNICHAR_SPACE. + while (word_end < labels.size() && labels[word_end] != UNICHAR_SPACE) + ++word_end; + if (word_end < labels.size()) { + float rating; + outputs.ScoresOverRange(label_coords[word_end], + label_coords[word_end] + 1, UNICHAR_SPACE, + null_char_, &rating, &space_cert); + } + while (word_end > i && labels[word_end - 1] == null_char_) --word_end; + } + ASSERT_HOST(word_end > i); + // Create a WERD_RES for the output word. + if (debug) + tprintf("Creating word from outputs over [%d,%d)\n", i, word_end); + WERD_RES* word = + WordFromOutput(line_box, outputs, i, word_end, score_ratio, + MIN(prev_space_cert, space_cert), debug, + use_alternates && !SimpleTextOutput(), target_unicharset, + labels, label_coords, scale_factor); + if (word == NULL && target_unicharset != NULL) { + // Unicharset translation failed - use decoder_ instead, and disable + // the segmentation search on output, as it won't understand the encoding. + word = WordFromOutput(line_box, outputs, i, word_end, score_ratio, + MIN(prev_space_cert, space_cert), debug, false, + NULL, labels, label_coords, scale_factor); + } + prev_space_cert = space_cert; + words->push_back(word); + } +} + +// Helper computes min and mean best results in the output. 
+void LSTMRecognizer::OutputStats(const NetworkIO& outputs, float* min_output, + float* mean_output, float* sd) { + const int kOutputScale = MAX_INT8; + STATS stats(0, kOutputScale + 1); + for (int t = 0; t < outputs.Width(); ++t) { + int best_label = outputs.BestLabel(t, NULL); + if (best_label != null_char_ || t == 0) { + float best_output = outputs.f(t)[best_label]; + stats.add(static_cast(kOutputScale * best_output), 1); + } + } + *min_output = static_cast(stats.min_bucket()) / kOutputScale; + *mean_output = stats.mean() / kOutputScale; + *sd = stats.sd() / kOutputScale; +} + +// Recognizes the image_data, returning the labels, +// scores, and corresponding pairs of start, end x-coords in coords. +// If label_threshold is positive, uses it for making the labels, otherwise +// uses standard ctc. +bool LSTMRecognizer::RecognizeLine(const ImageData& image_data, bool invert, + bool debug, bool re_invert, + float label_threshold, float* scale_factor, + NetworkIO* inputs, NetworkIO* outputs) { + // Maximum width of image to train on. + const int kMaxImageWidth = 2560; + // This ensures consistent recognition results. + SetRandomSeed(); + int min_width = network_->XScaleFactor(); + Pix* pix = Input::PrepareLSTMInputs(image_data, network_, min_width, + &randomizer_, scale_factor); + if (pix == NULL) { + tprintf("Line cannot be recognized!!\n"); + return false; + } + if (network_->IsTraining() && pixGetWidth(pix) > kMaxImageWidth) { + tprintf("Image too large to learn!! Size = %dx%d\n", pixGetWidth(pix), + pixGetHeight(pix)); + pixDestroy(&pix); + return false; + } + // Reduction factor from image to coords. + *scale_factor = min_width / *scale_factor; + inputs->set_int_mode(IsIntMode()); + SetRandomSeed(); + Input::PreparePixInput(network_->InputShape(), pix, &randomizer_, inputs); + network_->Forward(debug, *inputs, NULL, &scratch_space_, outputs); + // Check for auto inversion. 
+ float pos_min, pos_mean, pos_sd; + OutputStats(*outputs, &pos_min, &pos_mean, &pos_sd); + if (invert && pos_min < 0.5) { + // Run again inverted and see if it is any better. + NetworkIO inv_inputs, inv_outputs; + inv_inputs.set_int_mode(IsIntMode()); + SetRandomSeed(); + pixInvert(pix, pix); + Input::PreparePixInput(network_->InputShape(), pix, &randomizer_, + &inv_inputs); + network_->Forward(debug, inv_inputs, NULL, &scratch_space_, &inv_outputs); + float inv_min, inv_mean, inv_sd; + OutputStats(inv_outputs, &inv_min, &inv_mean, &inv_sd); + if (inv_min > pos_min && inv_mean > pos_mean && inv_sd < pos_sd) { + // Inverted did better. Use inverted data. + if (debug) { + tprintf("Inverting image: old min=%g, mean=%g, sd=%g, inv %g,%g,%g\n", + pos_min, pos_mean, pos_sd, inv_min, inv_mean, inv_sd); + } + *outputs = inv_outputs; + *inputs = inv_inputs; + } else if (re_invert) { + // Inverting was not an improvement, so undo and run again, so the + // outputs match the best forward result. + SetRandomSeed(); + network_->Forward(debug, *inputs, NULL, &scratch_space_, outputs); + } + } + pixDestroy(&pix); + if (debug) { + GenericVector labels, coords; + LabelsFromOutputs(*outputs, label_threshold, &labels, &coords); + DisplayForward(*inputs, labels, coords, "LSTMForward", &debug_win_); + DebugActivationPath(*outputs, labels, coords); + } + return true; +} + +// Returns a tesseract-compatible WERD_RES from the line recognizer outputs. +// line_box should be the bounding box of the line image in the main image, +// outputs the output of the network, +// [word_start, word_end) the interval over which to convert, +// score_ratio for choosing alternate classifier choices, +// use_alternates to control generation of alternative segmentations, +// labels, label_coords, scale_factor from RecognizeLine above. 
+// If target_unicharset is not NULL, attempts to translate the internal +// unichar_ids to the target_unicharset, but falls back to untranslated ids +// if the translation should fail. +WERD_RES* LSTMRecognizer::WordFromOutput( + const TBOX& line_box, const NetworkIO& outputs, int word_start, + int word_end, float score_ratio, float space_certainty, bool debug, + bool use_alternates, const UNICHARSET* target_unicharset, + const GenericVector& labels, const GenericVector& label_coords, + float scale_factor) { + WERD_RES* word_res = InitializeWord( + line_box, word_start, word_end, space_certainty, use_alternates, + target_unicharset, labels, label_coords, scale_factor); + int max_blob_run = word_res->ratings->bandwidth(); + for (int width = 1; width <= max_blob_run; ++width) { + int col = 0; + for (int i = word_start; i + width <= word_end; ++i) { + if (labels[i] != null_char_) { + // Starting at i, use width labels, but stop at the next null_char_. + // This forms all combinations of blobs between regions of null_char_. + int j = i + 1; + while (j - i < width && labels[j] != null_char_) ++j; + if (j - i == width) { + // Make the blob choices. + int end_coord = label_coords[j]; + if (j < word_end && labels[j] == null_char_) + end_coord = label_coords[j + 1]; + BLOB_CHOICE_LIST* choices = GetBlobChoices( + col, col + width - 1, debug, outputs, target_unicharset, + label_coords[i], end_coord, score_ratio); + if (choices == NULL) { + delete word_res; + return NULL; + } + word_res->ratings->put(col, col + width - 1, choices); + } + ++col; + } + } + } + if (use_alternates) { + // Merge adjacent single results over null_char boundaries. 
+ int col = 0; + for (int i = word_start; i + 2 < word_end; ++i) { + if (labels[i] != null_char_ && labels[i + 1] == null_char_ && + labels[i + 2] != null_char_ && + (i == word_start || labels[i - 1] == null_char_) && + (i + 3 == word_end || labels[i + 3] == null_char_)) { + int end_coord = label_coords[i + 3]; + if (i + 3 < word_end && labels[i + 3] == null_char_) + end_coord = label_coords[i + 4]; + BLOB_CHOICE_LIST* choices = + GetBlobChoices(col, col + 1, debug, outputs, target_unicharset, + label_coords[i], end_coord, score_ratio); + if (choices == NULL) { + delete word_res; + return NULL; + } + word_res->ratings->put(col, col + 1, choices); + } + if (labels[i] != null_char_) ++col; + } + } else { + word_res->FakeWordFromRatings(TOP_CHOICE_PERM); + } + return word_res; +} + +// Sets up a word with the ratings matrix and fake blobs with boxes in the +// right places. +WERD_RES* LSTMRecognizer::InitializeWord(const TBOX& line_box, int word_start, + int word_end, float space_certainty, + bool use_alternates, + const UNICHARSET* target_unicharset, + const GenericVector& labels, + const GenericVector& label_coords, + float scale_factor) { + // Make a fake blob for each non-zero label. + C_BLOB_LIST blobs; + C_BLOB_IT b_it(&blobs); + // num_blobs is the length of the diagonal of the ratings matrix. + int num_blobs = 0; + // max_blob_run is the diagonal width of the ratings matrix + int max_blob_run = 0; + int blob_run = 0; + for (int i = word_start; i < word_end; ++i) { + if (IsRecoding() && !recoder_.IsValidFirstCode(labels[i])) continue; + if (labels[i] != null_char_) { + // Make a fake blob. 
+ TBOX box(label_coords[i], 0, label_coords[i + 1], line_box.height()); + box.scale(scale_factor); + box.move(ICOORD(line_box.left(), line_box.bottom())); + box.set_top(line_box.top()); + b_it.add_after_then_move(C_BLOB::FakeBlob(box)); + ++num_blobs; + ++blob_run; + } + if (labels[i] == null_char_ || i + 1 == word_end) { + if (blob_run > max_blob_run) + max_blob_run = blob_run; + } + } + if (!use_alternates) max_blob_run = 1; + ASSERT_HOST(label_coords.size() >= word_end); + // Make a fake word from the blobs. + WERD* word = new WERD(&blobs, word_start > 1 ? 1 : 0, NULL); + // Make a WERD_RES from the word. + WERD_RES* word_res = new WERD_RES(word); + word_res->uch_set = + target_unicharset != NULL ? target_unicharset : &GetUnicharset(); + word_res->combination = true; // Give it ownership of the word. + word_res->space_certainty = space_certainty; + word_res->ratings = new MATRIX(num_blobs, max_blob_run); + return word_res; +} + +// Converts an array of labels to utf-8, whether or not the labels are +// augmented with character boundaries. +STRING LSTMRecognizer::DecodeLabels(const GenericVector& labels) { + STRING result; + int end = 1; + for (int start = 0; start < labels.size(); start = end) { + if (labels[start] == null_char_) { + end = start + 1; + } else { + result += DecodeLabel(labels, start, &end, NULL); + } + } + return result; +} + +// Displays the forward results in a window with the characters and +// boundaries as determined by the labels and label_coords. 
+void LSTMRecognizer::DisplayForward(const NetworkIO& inputs, + const GenericVector& labels, + const GenericVector& label_coords, + const char* window_name, + ScrollView** window) { +#ifndef GRAPHICS_DISABLED // do nothing if there's no graphics + Pix* input_pix = inputs.ToPix(); + Network::ClearWindow(false, window_name, pixGetWidth(input_pix), + pixGetHeight(input_pix), window); + int line_height = Network::DisplayImage(input_pix, *window); + DisplayLSTMOutput(labels, label_coords, line_height, *window); +#endif // GRAPHICS_DISABLED +} + +// Displays the labels and cuts at the corresponding xcoords. +// Size of labels should match xcoords. +void LSTMRecognizer::DisplayLSTMOutput(const GenericVector& labels, + const GenericVector& xcoords, + int height, ScrollView* window) { +#ifndef GRAPHICS_DISABLED // do nothing if there's no graphics + int x_scale = network_->XScaleFactor(); + window->TextAttributes("Arial", height / 4, false, false, false); + int end = 1; + for (int start = 0; start < labels.size(); start = end) { + int xpos = xcoords[start] * x_scale; + if (labels[start] == null_char_) { + end = start + 1; + window->Pen(ScrollView::RED); + } else { + window->Pen(ScrollView::GREEN); + const char* str = DecodeLabel(labels, start, &end, NULL); + if (*str == '\\') str = "\\\\"; + xpos = xcoords[(start + end) / 2] * x_scale; + window->Text(xpos, height, str); + } + window->Line(xpos, 0, xpos, height * 3 / 2); + } + window->Update(); +#endif // GRAPHICS_DISABLED +} + +// Prints debug output detailing the activation path that is implied by the +// label_coords. 
+void LSTMRecognizer::DebugActivationPath(const NetworkIO& outputs, + const GenericVector& labels, + const GenericVector& xcoords) { + if (xcoords[0] > 0) + DebugActivationRange(outputs, "", null_char_, 0, xcoords[0]); + int end = 1; + for (int start = 0; start < labels.size(); start = end) { + if (labels[start] == null_char_) { + end = start + 1; + DebugActivationRange(outputs, "", null_char_, xcoords[start], + xcoords[end]); + continue; + } else { + int decoded; + const char* label = DecodeLabel(labels, start, &end, &decoded); + DebugActivationRange(outputs, label, labels[start], xcoords[start], + xcoords[start + 1]); + for (int i = start + 1; i < end; ++i) { + DebugActivationRange(outputs, DecodeSingleLabel(labels[i]), labels[i], + xcoords[i], xcoords[i + 1]); + } + } + } +} + +// Prints debug output detailing activations and 2nd choice over a range +// of positions. +void LSTMRecognizer::DebugActivationRange(const NetworkIO& outputs, + const char* label, int best_choice, + int x_start, int x_end) { + tprintf("%s=%d On [%d, %d), scores=", label, best_choice, x_start, x_end); + double max_score = 0.0; + double mean_score = 0.0; + int width = x_end - x_start; + for (int x = x_start; x < x_end; ++x) { + const float* line = outputs.f(x); + double score = line[best_choice] * 100.0; + if (score > max_score) max_score = score; + mean_score += score / width; + int best_c = 0; + double best_score = 0.0; + for (int c = 0; c < outputs.NumFeatures(); ++c) { + if (c != best_choice && line[c] > best_score) { + best_c = c; + best_score = line[c]; + } + } + tprintf(" %.3g(%s=%d=%.3g)", score, DecodeSingleLabel(best_c), best_c, + best_score * 100.0); + } + tprintf(", Mean=%g, max=%g\n", mean_score, max_score); +} + +// Helper returns true if the null_char is the winner at t, and it beats the +// null_threshold, or the next choice is space, in which case we will use the +// null anyway. 
+static bool NullIsBest(const NetworkIO& output, float null_thr, + int null_char, int t) { + if (output.f(t)[null_char] >= null_thr) return true; + if (output.BestLabel(t, null_char, null_char, NULL) != UNICHAR_SPACE) + return false; + return output.f(t)[null_char] > output.f(t)[UNICHAR_SPACE]; +} + +// Converts the network output to a sequence of labels. Outputs labels, scores +// and start xcoords of each char, and each null_char_, with an additional +// final xcoord for the end of the output. +// The conversion method is determined by internal state. +void LSTMRecognizer::LabelsFromOutputs(const NetworkIO& outputs, float null_thr, + GenericVector* labels, + GenericVector* xcoords) { + if (SimpleTextOutput()) { + LabelsViaSimpleText(outputs, labels, xcoords); + } else if (IsRecoding()) { + LabelsViaReEncode(outputs, labels, xcoords); + } else if (null_thr <= 0.0) { + LabelsViaCTC(outputs, labels, xcoords); + } else { + LabelsViaThreshold(outputs, null_thr, labels, xcoords); + } +} + +// Converts the network output to a sequence of labels, using a threshold +// on the null_char_ to determine character boundaries. Outputs labels, scores +// and start xcoords of each char, and each null_char_, with an additional +// final xcoord for the end of the output. +// The label output is the one with the highest score in the interval between +// null_chars_. +void LSTMRecognizer::LabelsViaThreshold(const NetworkIO& output, + float null_thr, + GenericVector* labels, + GenericVector* xcoords) { + labels->truncate(0); + xcoords->truncate(0); + int width = output.Width(); + int t = 0; + // Skip any initial non-char. 
+ while (t < width && NullIsBest(output, null_thr, null_char_, t)) { + ++t; + } + while (t < width) { + ASSERT_HOST(!std::isnan(output.f(t)[null_char_])); + int label = output.BestLabel(t, null_char_, null_char_, NULL); + int char_start = t++; + while (t < width && !NullIsBest(output, null_thr, null_char_, t) && + label == output.BestLabel(t, null_char_, null_char_, NULL)) { + ++t; + } + int char_end = t; + labels->push_back(label); + xcoords->push_back(char_start); + // Find the end of the non-char, and compute its score. + while (t < width && NullIsBest(output, null_thr, null_char_, t)) { + ++t; + } + if (t > char_end) { + labels->push_back(null_char_); + xcoords->push_back(char_end); + } + } + xcoords->push_back(width); +} + +// Converts the network output to a sequence of labels, with scores and +// start x-coords of the character labels. Retains the null_char_ as the +// end x-coord, where already present, otherwise the start of the next +// character is the end. +// The number of labels, scores, and xcoords is always matched, except that +// there is always an additional xcoord for the last end position. +void LSTMRecognizer::LabelsViaCTC(const NetworkIO& output, + GenericVector* labels, + GenericVector* xcoords) { + labels->truncate(0); + xcoords->truncate(0); + int width = output.Width(); + int t = 0; + while (t < width) { + float score = 0.0f; + int label = output.BestLabel(t, &score); + labels->push_back(label); + xcoords->push_back(t); + while (++t < width && output.BestLabel(t, NULL) == label) { + } + } + xcoords->push_back(width); +} + +// As LabelsViaCTC except that this function constructs the best path that +// contains only legal sequences of subcodes for CJK. 
+void LSTMRecognizer::LabelsViaReEncode(const NetworkIO& output, + GenericVector* labels, + GenericVector* xcoords) { + if (search_ == NULL) { + search_ = + new RecodeBeamSearch(recoder_, null_char_, SimpleTextOutput(), dict_); + } + search_->Decode(output, 1.0, 0.0, RecodeBeamSearch::kMinCertainty, NULL); + search_->ExtractBestPathAsLabels(labels, xcoords); +} + +// Converts the network output to a sequence of labels, with scores, using +// the simple character model (each position is a char, and the null_char_ is +// mainly intended for tail padding.) +void LSTMRecognizer::LabelsViaSimpleText(const NetworkIO& output, + GenericVector* labels, + GenericVector* xcoords) { + labels->truncate(0); + xcoords->truncate(0); + int width = output.Width(); + for (int t = 0; t < width; ++t) { + float score = 0.0f; + int label = output.BestLabel(t, &score); + if (label != null_char_) { + labels->push_back(label); + xcoords->push_back(t); + } + } + xcoords->push_back(width); +} + +// Helper returns a BLOB_CHOICE_LIST for the choices in a given x-range. +// Handles either LSTM labels or direct unichar-ids. +// Score ratio determines the worst ratio between top choice and remainder. +// If target_unicharset is not NULL, attempts to translate to the target +// unicharset, returning NULL on failure. +BLOB_CHOICE_LIST* LSTMRecognizer::GetBlobChoices( + int col, int row, bool debug, const NetworkIO& output, + const UNICHARSET* target_unicharset, int x_start, int x_end, + float score_ratio) { + float rating = 0.0f, certainty = 0.0f; + int label = output.BestChoiceOverRange(x_start, x_end, UNICHAR_SPACE, + null_char_, &rating, &certainty); + int unichar_id = label == null_char_ ? 
UNICHAR_SPACE : label; + if (debug) { + tprintf("Best choice over range %d,%d=unichar%d=%s r = %g, cert=%g\n", + x_start, x_end, unichar_id, DecodeSingleLabel(label), rating, + certainty); + } + BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST; + BLOB_CHOICE_IT bc_it(choices); + if (!AddBlobChoices(unichar_id, rating, certainty, col, row, + target_unicharset, &bc_it)) { + delete choices; + return NULL; + } + // Get the other choices. + double best_cert = certainty; + for (int c = 0; c < output.NumFeatures(); ++c) { + if (c == label || c == UNICHAR_SPACE || c == null_char_) continue; + // Compute the score over the range. + output.ScoresOverRange(x_start, x_end, c, null_char_, &rating, &certainty); + int unichar_id = c == null_char_ ? UNICHAR_SPACE : c; + if (certainty >= best_cert - score_ratio && + !AddBlobChoices(unichar_id, rating, certainty, col, row, + target_unicharset, &bc_it)) { + delete choices; + return NULL; + } + } + choices->sort(&BLOB_CHOICE::SortByRating); + if (bc_it.length() > kMaxChoices) { + bc_it.move_to_first(); + for (int i = 0; i < kMaxChoices; ++i) + bc_it.forward(); + while (!bc_it.at_first()) { + delete bc_it.extract(); + bc_it.forward(); + } + } + return choices; +} + +// Adds to the given iterator, the blob choices for the target_unicharset +// that correspond to the given LSTM unichar_id. +// Returns false if unicharset translation failed. 
+bool LSTMRecognizer::AddBlobChoices(int unichar_id, float rating, + float certainty, int col, int row, + const UNICHARSET* target_unicharset, + BLOB_CHOICE_IT* bc_it) { + int target_id = unichar_id; + if (target_unicharset != NULL) { + const char* utf8 = GetUnicharset().id_to_unichar(unichar_id); + if (target_unicharset->contains_unichar(utf8)) { + target_id = target_unicharset->unichar_to_id(utf8); + } else { + return false; + } + } + BLOB_CHOICE* choice = new BLOB_CHOICE(target_id, rating, certainty, -1, 1.0f, + static_cast(MAX_INT16), 0.0f, + BCC_STATIC_CLASSIFIER); + choice->set_matrix_cell(col, row); + bc_it->add_after_then_move(choice); + return true; +} + +// Returns a string corresponding to the label starting at start. Sets *end +// to the next start and if non-null, *decoded to the unichar id. +const char* LSTMRecognizer::DecodeLabel(const GenericVector& labels, + int start, int* end, int* decoded) { + *end = start + 1; + if (IsRecoding()) { + // Decode labels via recoder_. + RecodedCharID code; + if (labels[start] == null_char_) { + if (decoded != NULL) { + code.Set(0, null_char_); + *decoded = recoder_.DecodeUnichar(code); + } + return ""; + } + int index = start; + while (index < labels.size() && + code.length() < RecodedCharID::kMaxCodeLen) { + code.Set(code.length(), labels[index++]); + while (index < labels.size() && labels[index] == null_char_) ++index; + int uni_id = recoder_.DecodeUnichar(code); + // If the next label isn't a valid first code, then we need to continue + // extending even if we have a valid uni_id from this prefix. 
+ if (uni_id != INVALID_UNICHAR_ID && + (index == labels.size() || + code.length() == RecodedCharID::kMaxCodeLen || + recoder_.IsValidFirstCode(labels[index]))) { + *end = index; + if (decoded != NULL) *decoded = uni_id; + if (uni_id == UNICHAR_SPACE) return " "; + return GetUnicharset().get_normed_unichar(uni_id); + } + } + return ""; + } else { + if (decoded != NULL) *decoded = labels[start]; + if (labels[start] == null_char_) return ""; + if (labels[start] == UNICHAR_SPACE) return " "; + return GetUnicharset().get_normed_unichar(labels[start]); + } +} + +// Returns a string corresponding to a given single label id, falling back to +// a default of ".." for part of a multi-label unichar-id. +const char* LSTMRecognizer::DecodeSingleLabel(int label) { + if (label == null_char_) return ""; + if (IsRecoding()) { + // Decode label via recoder_. + RecodedCharID code; + code.Set(0, label); + label = recoder_.DecodeUnichar(code); + if (label == INVALID_UNICHAR_ID) return ".."; // Part of a bigger code. + } + if (label == UNICHAR_SPACE) return " "; + return GetUnicharset().get_normed_unichar(label); +} + +} // namespace tesseract. diff --git a/lstm/lstmrecognizer.h b/lstm/lstmrecognizer.h new file mode 100644 index 00000000..87dc135d --- /dev/null +++ b/lstm/lstmrecognizer.h @@ -0,0 +1,394 @@ +/////////////////////////////////////////////////////////////////////// +// File: lstmrecognizer.h +// Description: Top-level line recognizer class for LSTM-based networks. +// Author: Ray Smith +// Created: Thu May 02 08:57:06 PST 2013 +// +// (C) Copyright 2013, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_LSTM_LSTMRECOGNIZER_H_ +#define TESSERACT_LSTM_LSTMRECOGNIZER_H_ + +#include "ccutil.h" +#include "helpers.h" +#include "imagedata.h" +#include "matrix.h" +#include "network.h" +#include "networkscratch.h" +#include "recodebeam.h" +#include "series.h" +#include "strngs.h" +#include "unicharcompress.h" + +class BLOB_CHOICE_IT; +struct Pix; +class ROW_RES; +class ScrollView; +class TBOX; +class WERD_RES; + +namespace tesseract { + +class Dict; +class ImageData; + +// Enum indicating training mode control flags. +enum TrainingFlags { + TF_INT_MODE = 1, + TF_AUTO_HARDEN = 2, + TF_ROUND_ROBIN_TRAINING = 16, + TF_COMPRESS_UNICHARSET = 64, +}; + +// Top-level line recognizer class for LSTM-based networks. +// Note that a sub-class, LSTMTrainer is used for training. 
+class LSTMRecognizer { + public: + LSTMRecognizer(); + ~LSTMRecognizer(); + + int NumOutputs() const { + return network_->NumOutputs(); + } + int training_iteration() const { + return training_iteration_; + } + int sample_iteration() const { + return sample_iteration_; + } + double learning_rate() const { + return learning_rate_; + } + bool IsHardening() const { + return (training_flags_ & TF_AUTO_HARDEN) != 0; + } + LossType OutputLossType() const { + if (network_ == nullptr) return LT_NONE; + StaticShape shape; + shape = network_->OutputShape(shape); + return shape.loss_type(); + } + bool SimpleTextOutput() const { return OutputLossType() == LT_SOFTMAX; } + bool IsIntMode() const { return (training_flags_ & TF_INT_MODE) != 0; } + // True if recoder_ is active to re-encode text to a smaller space. + bool IsRecoding() const { + return (training_flags_ & TF_COMPRESS_UNICHARSET) != 0; + } + // Returns the cache strategy for the DocumentCache. + CachingStrategy CacheStrategy() const { + return training_flags_ & TF_ROUND_ROBIN_TRAINING ? CS_ROUND_ROBIN + : CS_SEQUENTIAL; + } + // Returns true if the network is a TensorFlow network. + bool IsTensorFlow() const { return network_->type() == NT_TENSORFLOW; } + // Returns a vector of layer ids that can be passed to other layer functions + // to access a specific layer. + GenericVector EnumerateLayers() const { + ASSERT_HOST(network_ != NULL && network_->type() == NT_SERIES); + Series* series = reinterpret_cast(network_); + GenericVector layers; + series->EnumerateLayers(NULL, &layers); + return layers; + } + // Returns a specific layer from its id (from EnumerateLayers). + Network* GetLayer(const STRING& id) const { + ASSERT_HOST(network_ != NULL && network_->type() == NT_SERIES); + ASSERT_HOST(id.length() > 1 && id[0] == ':'); + Series* series = reinterpret_cast(network_); + return series->GetLayer(&id[1]); + } + // Returns the learning rate of the layer from its id. 
+ float GetLayerLearningRate(const STRING& id) const { + ASSERT_HOST(network_ != NULL && network_->type() == NT_SERIES); + if (network_->TestFlag(NF_LAYER_SPECIFIC_LR)) { + ASSERT_HOST(id.length() > 1 && id[0] == ':'); + Series* series = reinterpret_cast(network_); + return series->LayerLearningRate(&id[1]); + } else { + return learning_rate_; + } + } + // Multiplies the all the learning rate(s) by the given factor. + void ScaleLearningRate(double factor) { + ASSERT_HOST(network_ != NULL && network_->type() == NT_SERIES); + learning_rate_ *= factor; + if (network_->TestFlag(NF_LAYER_SPECIFIC_LR)) { + GenericVector layers = EnumerateLayers(); + for (int i = 0; i < layers.size(); ++i) { + ScaleLayerLearningRate(layers[i], factor); + } + } + } + // Multiplies the learning rate of the layer with id, by the given factor. + void ScaleLayerLearningRate(const STRING& id, double factor) { + ASSERT_HOST(network_ != NULL && network_->type() == NT_SERIES); + ASSERT_HOST(id.length() > 1 && id[0] == ':'); + Series* series = reinterpret_cast(network_); + series->ScaleLayerLearningRate(&id[1], factor); + } + + // True if the network is using adagrad to train. + bool IsUsingAdaGrad() const { return network_->TestFlag(NF_ADA_GRAD); } + // Provides access to the UNICHARSET that this classifier works with. + const UNICHARSET& GetUnicharset() const { return ccutil_.unicharset; } + // Provides access to the Dict that this classifier works with. + const Dict* GetDict() const { return dict_; } + // Sets the sample iteration to the given value. The sample_iteration_ + // determines the seed for the random number generator. The training + // iteration is incremented only by a successful training iteration. + void SetIteration(int iteration) { + sample_iteration_ = iteration; + } + // Accessors for textline image normalization. + int NumInputs() const { + return network_->NumInputs(); + } + int null_char() const { return null_char_; } + + // Writes to the given file. 
Returns false in case of error. + bool Serialize(TFile* fp) const; + // Reads from the given file. Returns false in case of error. + // If swap is true, assumes a big/little-endian swap is needed. + bool DeSerialize(bool swap, TFile* fp); + // Loads the dictionary if possible from the traineddata file. + // Prints a warning message, and returns false but otherwise fails silently + // and continues to work without it if loading fails. + // Note that dictionary load is independent from DeSerialize, but dependent + // on the unicharset matching. This enables training to deserialize a model + // from checkpoint or restore without having to go back and reload the + // dictionary. + bool LoadDictionary(const char* lang, TessdataManager* mgr); + + // Recognizes the line image, contained within image_data, returning the + // ratings matrix and matching box_word for each WERD_RES in the output. + // If invert, tries inverted as well if the normal interpretation doesn't + // produce a good enough result. If use_alternates, the ratings matrix is + // filled with segmentation and classifier alternatives that may be searched + // using the standard beam search, otherwise, just a diagonal and prebuilt + // best_choice. The line_box is used for computing the box_word in the + // output words. Score_ratio is used to determine the classifier alternates. + // If one_word, then a single WERD_RES is formed, regardless of the spaces + // found during recognition. + // If not NULL, we attempt to translate the output to target_unicharset, but + // do not guarantee success, due to mismatches. In that case the output words + // are marked with our UNICHARSET, not the caller's. 
+ void RecognizeLine(const ImageData& image_data, bool invert, bool debug, + double worst_dict_cert, bool use_alternates, + const UNICHARSET* target_unicharset, const TBOX& line_box, + float score_ratio, bool one_word, + PointerVector* words); + // Builds a set of tesseract-compatible WERD_RESs aligned to line_box, + // corresponding to the network output in outputs, labels, label_coords. + // one_word generates a single word output, that may include spaces inside. + // use_alternates generates alternative BLOB_CHOICEs and segmentation paths, + // with cut-offs determined by scale_factor. + // If not NULL, we attempt to translate the output to target_unicharset, but + // do not guarantee success, due to mismatches. In that case the output words + // are marked with our UNICHARSET, not the caller's. + void WordsFromOutputs(const NetworkIO& outputs, + const GenericVector& labels, + const GenericVector label_coords, + const TBOX& line_box, bool debug, bool use_alternates, + bool one_word, float score_ratio, float scale_factor, + const UNICHARSET* target_unicharset, + PointerVector* words); + + // Helper computes min and mean best results in the output. + void OutputStats(const NetworkIO& outputs, + float* min_output, float* mean_output, float* sd); + // Recognizes the image_data, returning the labels, + // scores, and corresponding pairs of start, end x-coords in coords. + // If label_threshold is positive, uses it for making the labels, otherwise + // uses standard ctc. Returned in scale_factor is the reduction factor + // between the image and the output coords, for computing bounding boxes. + // If re_invert is true, the input is inverted back to its original + // photometric interpretation if inversion is attempted but fails to + // improve the results. This ensures that outputs contains the correct + // forward outputs for the best photometric interpretation. 
+ // inputs is filled with the used inputs to the network, and if not null, + // target boxes is filled with scaled truth boxes if present in image_data. + bool RecognizeLine(const ImageData& image_data, bool invert, bool debug, + bool re_invert, float label_threshold, float* scale_factor, + NetworkIO* inputs, NetworkIO* outputs); + // Returns a tesseract-compatible WERD_RES from the line recognizer outputs. + // line_box should be the bounding box of the line image in the main image, + // outputs the output of the network, + // [word_start, word_end) the interval over which to convert, + // score_ratio for choosing alternate classifier choices, + // use_alternates to control generation of alternative segmentations, + // labels, label_coords, scale_factor from RecognizeLine above. + // If target_unicharset is not NULL, attempts to translate the internal + // unichar_ids to the target_unicharset, but falls back to untranslated ids + // if the translation should fail. + WERD_RES* WordFromOutput(const TBOX& line_box, const NetworkIO& outputs, + int word_start, int word_end, float score_ratio, + float space_certainty, bool debug, + bool use_alternates, + const UNICHARSET* target_unicharset, + const GenericVector& labels, + const GenericVector& label_coords, + float scale_factor); + // Sets up a word with the ratings matrix and fake blobs with boxes in the + // right places. + WERD_RES* InitializeWord(const TBOX& line_box, int word_start, int word_end, + float space_certainty, bool use_alternates, + const UNICHARSET* target_unicharset, + const GenericVector& labels, + const GenericVector& label_coords, + float scale_factor); + + // Converts an array of labels to utf-8, whether or not the labels are + // augmented with character boundaries. + STRING DecodeLabels(const GenericVector& labels); + + // Displays the forward results in a window with the characters and + // boundaries as determined by the labels and label_coords. 
+ void DisplayForward(const NetworkIO& inputs, + const GenericVector& labels, + const GenericVector& label_coords, + const char* window_name, + ScrollView** window); + + protected: + // Sets the random seed from the sample_iteration_; + void SetRandomSeed() { + inT64 seed = static_cast(sample_iteration_) * 0x10000001; + randomizer_.set_seed(seed); + randomizer_.IntRand(); + } + + // Displays the labels and cuts at the corresponding xcoords. + // Size of labels should match xcoords. + void DisplayLSTMOutput(const GenericVector& labels, + const GenericVector& xcoords, + int height, ScrollView* window); + + // Prints debug output detailing the activation path that is implied by the + // xcoords. + void DebugActivationPath(const NetworkIO& outputs, + const GenericVector& labels, + const GenericVector& xcoords); + + // Prints debug output detailing activations and 2nd choice over a range + // of positions. + void DebugActivationRange(const NetworkIO& outputs, const char* label, + int best_choice, int x_start, int x_end); + + // Converts the network output to a sequence of labels. Outputs labels, scores + // and start xcoords of each char, and each null_char_, with an additional + // final xcoord for the end of the output. + // The conversion method is determined by internal state. + void LabelsFromOutputs(const NetworkIO& outputs, float null_thr, + GenericVector* labels, + GenericVector* xcoords); + // Converts the network output to a sequence of labels, using a threshold + // on the null_char_ to determine character boundaries. Outputs labels, scores + // and start xcoords of each char, and each null_char_, with an additional + // final xcoord for the end of the output. + // The label output is the one with the highest score in the interval between + // null_chars_. 
+ void LabelsViaThreshold(const NetworkIO& output, + float null_threshold, + GenericVector* labels, + GenericVector* xcoords); + // Converts the network output to a sequence of labels, with scores and + // start x-coords of the character labels. Retains the null_char_ character as + // the end x-coord, where already present, otherwise the start of the next + // character is the end. + // The number of labels, scores, and xcoords is always matched, except that + // there is always an additional xcoord for the last end position. + void LabelsViaCTC(const NetworkIO& output, + GenericVector* labels, + GenericVector* xcoords); + // As LabelsViaCTC except that this function constructs the best path that + // contains only legal sequences of subcodes for recoder_. + void LabelsViaReEncode(const NetworkIO& output, GenericVector* labels, + GenericVector* xcoords); + // Converts the network output to a sequence of labels, with scores, using + // the simple character model (each position is a char, and the null_char_ is + // mainly intended for tail padding.) + void LabelsViaSimpleText(const NetworkIO& output, + GenericVector* labels, + GenericVector* xcoords); + + // Helper returns a BLOB_CHOICE_LIST for the choices in a given x-range. + // Handles either LSTM labels or direct unichar-ids. + // Score ratio determines the worst ratio between top choice and remainder. + // If target_unicharset is not NULL, attempts to translate to the target + // unicharset, returning NULL on failure. + BLOB_CHOICE_LIST* GetBlobChoices(int col, int row, bool debug, + const NetworkIO& output, + const UNICHARSET* target_unicharset, + int x_start, int x_end, float score_ratio); + + // Adds to the given iterator, the blob choices for the target_unicharset + // that correspond to the given LSTM unichar_id. + // Returns false if unicharset translation failed. 
+ bool AddBlobChoices(int unichar_id, float rating, float certainty, int col, + int row, const UNICHARSET* target_unicharset, + BLOB_CHOICE_IT* bc_it); + + // Returns a string corresponding to the label starting at start. Sets *end + // to the next start and if non-null, *decoded to the unichar id. + const char* DecodeLabel(const GenericVector& labels, int start, int* end, + int* decoded); + + // Returns a string corresponding to a given single label id, falling back to + // a default of ".." for part of a multi-label unichar-id. + const char* DecodeSingleLabel(int label); + + protected: + // The network hierarchy. + Network* network_; + // The unicharset. Only the unicharset element is serialized. + // Has to be a CCUtil, so Dict can point to it. + CCUtil ccutil_; + // For backward compatibility, recoder_ is serialized iff + // training_flags_ & TF_COMPRESS_UNICHARSET. + // Further encode/decode ccutil_.unicharset's ids to simplify the unicharset. + UnicharCompress recoder_; + + // ==Training parameters that are serialized to provide a record of them.== + STRING network_str_; + // Flags used to determine the training method of the network. + // See enum TrainingFlags above. + inT32 training_flags_; + // Number of actual backward training steps used. + inT32 training_iteration_; + // Index into training sample set. sample_iteration >= training_iteration_. + inT32 sample_iteration_; + // Index in softmax of null character. May take the value UNICHAR_BROKEN or + // ccutil_.unicharset.size(). + inT32 null_char_; + // Range used for the initial random numbers in the weights. + float weight_range_; + // Learning rate and momentum multipliers of deltas in backprop. + float learning_rate_; + float momentum_; + + // === NOT SERIALIZED. + TRand randomizer_; + NetworkScratch scratch_space_; + // Language model (optional) to use with the beam search. + Dict* dict_; + // Beam search held between uses to optimize memory allocation/use. 
+ RecodeBeamSearch* search_; + + // == Debugging parameters.== + // Recognition debug display window. + ScrollView* debug_win_; +}; + +} // namespace tesseract. + +#endif // TESSERACT_LSTM_LSTMRECOGNIZER_H_ diff --git a/lstm/lstmtrainer.cpp b/lstm/lstmtrainer.cpp new file mode 100644 index 00000000..3539a71d --- /dev/null +++ b/lstm/lstmtrainer.cpp @@ -0,0 +1,1345 @@ +/////////////////////////////////////////////////////////////////////// +// File: lstmtrainer.cpp +// Description: Top-level line trainer class for LSTM-based networks. +// Author: Ray Smith +// Created: Fir May 03 09:14:06 PST 2013 +// +// (C) Copyright 2013, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +// Include automatically generated configuration file if running autoconf. +#ifdef HAVE_CONFIG_H +#include "config_auto.h" +#endif + +#include "lstmtrainer.h" +#include + +#include "allheaders.h" +#include "boxread.h" +#include "ctc.h" +#include "imagedata.h" +#include "input.h" +#include "networkbuilder.h" +#include "ratngs.h" +#include "recodebeam.h" +#ifdef INCLUDE_TENSORFLOW +#include "tfnetwork.h" +#endif +#include "tprintf.h" + +#include "callcpp.h" + +namespace tesseract { + +// Min actual error rate increase to constitute divergence. +const double kMinDivergenceRate = 50.0; +// Min iterations since last best before acting on a stall. 
+const int kMinStallIterations = 10000; +// Fraction of current char error rate that sub_trainer_ has to be ahead +// before we declare the sub_trainer_ a success and switch to it. +const double kSubTrainerMarginFraction = 3.0 / 128; +// Factor to reduce learning rate on divergence. +const double kLearningRateDecay = sqrt(0.5); +// LR adjustment iterations. +const int kNumAdjustmentIterations = 100; +// How often to add data to the error_graph_. +const int kErrorGraphInterval = 1000; +// Number of training images to train between calls to MaintainCheckpoints. +const int kNumPagesPerBatch = 100; +// Min percent error rate to consider start-up phase over. +const int kMinStartedErrorRate = 75; +// Error rate at which to transition to stage 1. +const double kStageTransitionThreshold = 10.0; +// Confidence beyond which the truth is more likely wrong than the recognizer. +const double kHighConfidence = 0.9375; // 15/16. +// Fraction of weight sign-changing total to constitute a definite improvement. +const double kImprovementFraction = 15.0 / 16.0; +// Fraction of last written best to make it worth writing another. +const double kBestCheckpointFraction = 31.0 / 32.0; +// Scale factor for display of target activations of CTC. 
+const int kTargetXScale = 5; +const int kTargetYScale = 100; + +LSTMTrainer::LSTMTrainer() + : training_data_(0), + file_reader_(LoadDataFromFile), + file_writer_(SaveDataToFile), + checkpoint_reader_( + NewPermanentTessCallback(this, &LSTMTrainer::ReadTrainingDump)), + checkpoint_writer_( + NewPermanentTessCallback(this, &LSTMTrainer::SaveTrainingDump)), + sub_trainer_(NULL) { + EmptyConstructor(); + debug_interval_ = 0; +} + +LSTMTrainer::LSTMTrainer(FileReader file_reader, FileWriter file_writer, + CheckPointReader checkpoint_reader, + CheckPointWriter checkpoint_writer, + const char* model_base, const char* checkpoint_name, + int debug_interval, inT64 max_memory) + : training_data_(max_memory), + file_reader_(file_reader), + file_writer_(file_writer), + checkpoint_reader_(checkpoint_reader), + checkpoint_writer_(checkpoint_writer), + sub_trainer_(NULL) { + EmptyConstructor(); + if (file_reader_ == NULL) file_reader_ = LoadDataFromFile; + if (file_writer_ == NULL) file_writer_ = SaveDataToFile; + if (checkpoint_reader_ == NULL) { + checkpoint_reader_ = + NewPermanentTessCallback(this, &LSTMTrainer::ReadTrainingDump); + } + if (checkpoint_writer_ == NULL) { + checkpoint_writer_ = + NewPermanentTessCallback(this, &LSTMTrainer::SaveTrainingDump); + } + debug_interval_ = debug_interval; + model_base_ = model_base; + checkpoint_name_ = checkpoint_name; +} + +LSTMTrainer::~LSTMTrainer() { + delete align_win_; + delete target_win_; + delete ctc_win_; + delete recon_win_; + delete checkpoint_reader_; + delete checkpoint_writer_; + delete sub_trainer_; +} + +// Tries to deserialize a trainer from the given file and silently returns +// false in case of failure. +bool LSTMTrainer::TryLoadingCheckpoint(const char* filename) { + GenericVector data; + if (!(*file_reader_)(filename, &data)) return false; + tprintf("Loaded file %s, unpacking...\n", filename); + return checkpoint_reader_->Run(data, this); +} + +// Initializes the character set encode/decode mechanism. 
+// train_flags control training behavior according to the TrainingFlags +// enum, including character set encoding. +// script_dir is required for TF_COMPRESS_UNICHARSET, and, if provided, +// fully initializes the unicharset from the universal unicharsets. +// Note: Call before InitNetwork! +void LSTMTrainer::InitCharSet(const UNICHARSET& unicharset, + const STRING& script_dir, int train_flags) { + EmptyConstructor(); + training_flags_ = train_flags; + ccutil_.unicharset.CopyFrom(unicharset); + null_char_ = GetUnicharset().has_special_codes() ? UNICHAR_BROKEN + : GetUnicharset().size(); + SetUnicharsetProperties(script_dir); +} + +// Initializes the character set encode/decode mechanism directly from a +// previously setup UNICHARSET and UnicharCompress. +// ctc_mode controls how the truth text is mapped to the network targets. +// Note: Call before InitNetwork! +void LSTMTrainer::InitCharSet(const UNICHARSET& unicharset, + const UnicharCompress& recoder) { + EmptyConstructor(); + int flags = TF_COMPRESS_UNICHARSET; + training_flags_ = static_cast(flags); + ccutil_.unicharset.CopyFrom(unicharset); + recoder_ = recoder; + null_char_ = GetUnicharset().has_special_codes() ? UNICHAR_BROKEN + : GetUnicharset().size(); + RecodedCharID code; + recoder_.EncodeUnichar(null_char_, &code); + null_char_ = code(0); + // Space should encode as itself. + recoder_.EncodeUnichar(UNICHAR_SPACE, &code); + ASSERT_HOST(code(0) == UNICHAR_SPACE); +} + +// Initializes the trainer with a network_spec in the network description +// net_flags control network behavior according to the NetworkFlags enum. +// There isn't really much difference between them - only where the effects +// are implemented. +// For other args see NetworkBuilder::InitNetwork. +// Note: Be sure to call InitCharSet before InitNetwork! +bool LSTMTrainer::InitNetwork(const STRING& network_spec, int append_index, + int net_flags, float weight_range, + float learning_rate, float momentum) { + // Call after InitCharSet. 
+ ASSERT_HOST(GetUnicharset().size() > SPECIAL_UNICHAR_CODES_COUNT);
+ weight_range_ = weight_range;
+ learning_rate_ = learning_rate;
+ momentum_ = momentum;
+ int num_outputs = null_char_ == GetUnicharset().size()
+ ? null_char_ + 1
+ : GetUnicharset().size();
+ if (IsRecoding()) num_outputs = recoder_.code_range();
+ if (!NetworkBuilder::InitNetwork(num_outputs, network_spec, append_index,
+ net_flags, weight_range, &randomizer_,
+ &network_)) {
+ return false;
+ }
+ network_str_ += network_spec;
+ tprintf("Built network:%s from request %s\n",
+ network_->spec().string(), network_spec.string());
+ tprintf("Training parameters:\n Debug interval = %d,"
+ " weights = %g, learning rate = %g, momentum=%g\n",
+ debug_interval_, weight_range_, learning_rate_, momentum_);
+ return true;
+}
+
+// Initializes a trainer from a serialized TFNetworkModel proto.
+// Returns the global step of TensorFlow graph or 0 if failed.
+int LSTMTrainer::InitTensorFlowNetwork(const std::string& tf_proto) {
+#ifdef INCLUDE_TENSORFLOW
+ delete network_;
+ TFNetwork* tf_net = new TFNetwork("TensorFlow");
+ training_iteration_ = tf_net->InitFromProtoStr(tf_proto);
+ if (training_iteration_ == 0) {
+ tprintf("InitFromProtoStr failed!!\n");
+ return 0;
+ }
+ network_ = tf_net;
+ ASSERT_HOST(recoder_.code_range() == tf_net->num_classes());
+ return training_iteration_;
+#else
+ tprintf("TensorFlow not compiled in! -DINCLUDE_TENSORFLOW\n");
+ return 0;
+#endif
+}
+
+// Resets all the iteration counters for fine tuning or training a head,
+// where we want the error reporting to reset.
+void LSTMTrainer::InitIterations() { + sample_iteration_ = 0; + training_iteration_ = 0; + learning_iteration_ = 0; + prev_sample_iteration_ = 0; + best_error_rate_ = 100.0; + best_iteration_ = 0; + worst_error_rate_ = 0.0; + worst_iteration_ = 0; + stall_iteration_ = kMinStallIterations; + improvement_steps_ = kMinStallIterations; + perfect_delay_ = 0; + last_perfect_training_iteration_ = 0; + for (int i = 0; i < ET_COUNT; ++i) { + best_error_rates_[i] = 100.0; + worst_error_rates_[i] = 0.0; + error_buffers_[i].init_to_size(kRollingBufferSize_, 0.0); + error_rates_[i] = 100.0; + } + error_rate_of_last_saved_best_ = kMinStartedErrorRate; +} + +// If the training sample is usable, grid searches for the optimal +// dict_ratio/cert_offset, and returns the results in a string of space- +// separated triplets of ratio,offset=worderr. +Trainability LSTMTrainer::GridSearchDictParams( + const ImageData* trainingdata, int iteration, double min_dict_ratio, + double dict_ratio_step, double max_dict_ratio, double min_cert_offset, + double cert_offset_step, double max_cert_offset, STRING* results) { + sample_iteration_ = iteration; + NetworkIO fwd_outputs, targets; + Trainability result = + PrepareForBackward(trainingdata, &fwd_outputs, &targets); + if (result == UNENCODABLE || result == HI_PRECISION_ERR || dict_ == NULL) + return result; + + // Encode/decode the truth to get the normalization. + GenericVector truth_labels, ocr_labels, xcoords; + ASSERT_HOST(EncodeString(trainingdata->transcription(), &truth_labels)); + // NO-dict error. 
+ RecodeBeamSearch base_search(recoder_, null_char_, SimpleTextOutput(), NULL); + base_search.Decode(fwd_outputs, 1.0, 0.0, RecodeBeamSearch::kMinCertainty, + NULL); + base_search.ExtractBestPathAsLabels(&ocr_labels, &xcoords); + STRING truth_text = DecodeLabels(truth_labels); + STRING ocr_text = DecodeLabels(ocr_labels); + double baseline_error = ComputeWordError(&truth_text, &ocr_text); + results->add_str_double("0,0=", baseline_error); + + RecodeBeamSearch search(recoder_, null_char_, SimpleTextOutput(), dict_); + for (double r = min_dict_ratio; r < max_dict_ratio; r += dict_ratio_step) { + for (double c = min_cert_offset; c < max_cert_offset; + c += cert_offset_step) { + search.Decode(fwd_outputs, r, c, RecodeBeamSearch::kMinCertainty, NULL); + search.ExtractBestPathAsLabels(&ocr_labels, &xcoords); + truth_text = DecodeLabels(truth_labels); + ocr_text = DecodeLabels(ocr_labels); + // This is destructive on both strings. + double word_error = ComputeWordError(&truth_text, &ocr_text); + if ((r == min_dict_ratio && c == min_cert_offset) || + !std::isfinite(word_error)) { + STRING t = DecodeLabels(truth_labels); + STRING o = DecodeLabels(ocr_labels); + tprintf("r=%g, c=%g, truth=%s, ocr=%s, wderr=%g, truth[0]=%d\n", r, c, + t.string(), o.string(), word_error, truth_labels[0]); + } + results->add_str_double(" ", r); + results->add_str_double(",", c); + results->add_str_double("=", word_error); + } + } + return result; +} + +// Provides output on the distribution of weight values. +void LSTMTrainer::DebugNetwork() { + network_->DebugWeights(); +} + +// Loads a set of lstmf files that were created using the lstm.train config to +// tesseract into memory ready for training. Returns false if nothing was +// loaded. 
+bool LSTMTrainer::LoadAllTrainingData(const GenericVector& filenames) { + training_data_.Clear(); + return training_data_.LoadDocuments(filenames, "eng", CacheStrategy(), + file_reader_); +} + +// Keeps track of best and locally worst char error_rate and launches tests +// using tester, when a new min or max is reached. +// Writes checkpoints at appropriate times and builds and returns a log message +// to indicate progress. Returns false if nothing interesting happened. +bool LSTMTrainer::MaintainCheckpoints(TestCallback tester, STRING* log_msg) { + PrepareLogMsg(log_msg); + double error_rate = CharError(); + int iteration = learning_iteration(); + if (iteration >= stall_iteration_ && + error_rate > best_error_rate_ * (1.0 + kSubTrainerMarginFraction) && + best_error_rate_ < kMinStartedErrorRate && !best_trainer_.empty()) { + // It hasn't got any better in a long while, and is a margin worse than the + // best, so go back to the best model and try a different learning rate. + StartSubtrainer(log_msg); + } + SubTrainerResult sub_trainer_result = STR_NONE; + if (sub_trainer_ != NULL) { + sub_trainer_result = UpdateSubtrainer(log_msg); + if (sub_trainer_result == STR_REPLACED) { + // Reset the inputs, as we have overwritten *this. + error_rate = CharError(); + iteration = learning_iteration(); + PrepareLogMsg(log_msg); + } + } + bool result = true; // Something interesting happened. + GenericVector rec_model_data; + if (error_rate < best_error_rate_) { + SaveRecognitionDump(&rec_model_data); + log_msg->add_str_double(" New best char error = ", error_rate); + *log_msg += UpdateErrorGraph(iteration, error_rate, rec_model_data, tester); + // If sub_trainer_ is not NULL, either *this beat it to a new best, or it + // just overwrote *this. In either case, we have finished with it. 
+ delete sub_trainer_; + sub_trainer_ = NULL; + stall_iteration_ = learning_iteration() + kMinStallIterations; + if (TransitionTrainingStage(kStageTransitionThreshold)) { + log_msg->add_str_int(" Transitioned to stage ", CurrentTrainingStage()); + } + checkpoint_writer_->Run(NO_BEST_TRAINER, this, &best_trainer_); + if (error_rate < error_rate_of_last_saved_best_ * kBestCheckpointFraction) { + STRING best_model_name = DumpFilename(); + if (!(*file_writer_)(best_trainer_, best_model_name)) { + *log_msg += " failed to write best model:"; + } else { + *log_msg += " wrote best model:"; + error_rate_of_last_saved_best_ = best_error_rate_; + } + *log_msg += best_model_name; + } + } else if (error_rate > worst_error_rate_) { + SaveRecognitionDump(&rec_model_data); + log_msg->add_str_double(" New worst char error = ", error_rate); + *log_msg += UpdateErrorGraph(iteration, error_rate, rec_model_data, tester); + if (worst_error_rate_ > best_error_rate_ + kMinDivergenceRate && + best_error_rate_ < kMinStartedErrorRate && !best_trainer_.empty()) { + // Error rate has ballooned. Go back to the best model. + *log_msg += "\nDivergence! "; + // Copy best_trainer_ before reading it, as it will get overwritten. + GenericVector revert_data(best_trainer_); + if (checkpoint_reader_->Run(revert_data, this)) { + LogIterations("Reverted to", log_msg); + ReduceLearningRates(this, log_msg); + } else { + LogIterations("Failed to Revert at", log_msg); + } + // If it fails again, we will wait twice as long before reverting again. + stall_iteration_ = iteration + 2 * (iteration - learning_iteration()); + // Re-save the best trainer with the new learning rates and stall + // iteration. + checkpoint_writer_->Run(NO_BEST_TRAINER, this, &best_trainer_); + } + } else { + // Something interesting happened only if the sub_trainer_ was trained. 
+ result = sub_trainer_result != STR_NONE; + } + if (checkpoint_writer_ != NULL && file_writer_ != NULL && + checkpoint_name_.length() > 0) { + // Write a current checkpoint. + GenericVector checkpoint; + if (!checkpoint_writer_->Run(FULL, this, &checkpoint) || + !(*file_writer_)(checkpoint, checkpoint_name_)) { + *log_msg += " failed to write checkpoint."; + } else { + *log_msg += " wrote checkpoint."; + } + } + *log_msg += "\n"; + return result; +} + +// Builds a string containing a progress message with current error rates. +void LSTMTrainer::PrepareLogMsg(STRING* log_msg) const { + LogIterations("At", log_msg); + log_msg->add_str_double(", Mean rms=", error_rates_[ET_RMS]); + log_msg->add_str_double("%, delta=", error_rates_[ET_DELTA]); + log_msg->add_str_double("%, char train=", error_rates_[ET_CHAR_ERROR]); + log_msg->add_str_double("%, word train=", error_rates_[ET_WORD_RECERR]); + log_msg->add_str_double("%, skip ratio=", error_rates_[ET_SKIP_RATIO]); + *log_msg += "%, "; +} + +// Appends iteration learning_iteration()/training_iteration()/ +// sample_iteration() to the log_msg. +void LSTMTrainer::LogIterations(const char* intro_str, STRING* log_msg) const { + *log_msg += intro_str; + log_msg->add_str_int(" iteration ", learning_iteration()); + log_msg->add_str_int("/", training_iteration()); + log_msg->add_str_int("/", sample_iteration()); +} + +// Returns true and increments the training_stage_ if the error rate has just +// passed through the given threshold for the first time. +bool LSTMTrainer::TransitionTrainingStage(float error_threshold) { + if (best_error_rate_ < error_threshold && + training_stage_ + 1 < num_training_stages_) { + ++training_stage_; + return true; + } + return false; +} + +// Writes to the given file. Returns false in case of error. 
+bool LSTMTrainer::Serialize(TFile* fp) const { + if (!LSTMRecognizer::Serialize(fp)) return false; + if (fp->FWrite(&learning_iteration_, sizeof(learning_iteration_), 1) != 1) + return false; + if (fp->FWrite(&prev_sample_iteration_, sizeof(prev_sample_iteration_), 1) != + 1) + return false; + if (fp->FWrite(&perfect_delay_, sizeof(perfect_delay_), 1) != 1) return false; + if (fp->FWrite(&last_perfect_training_iteration_, + sizeof(last_perfect_training_iteration_), 1) != 1) + return false; + for (int i = 0; i < ET_COUNT; ++i) { + if (!error_buffers_[i].Serialize(fp)) return false; + } + if (fp->FWrite(&error_rates_, sizeof(error_rates_), 1) != 1) return false; + if (fp->FWrite(&training_stage_, sizeof(training_stage_), 1) != 1) + return false; + uinT8 amount = serialize_amount_; + if (fp->FWrite(&amount, sizeof(amount), 1) != 1) return false; + if (amount == LIGHT) return true; // We are done. + if (fp->FWrite(&best_error_rate_, sizeof(best_error_rate_), 1) != 1) + return false; + if (fp->FWrite(&best_error_rates_, sizeof(best_error_rates_), 1) != 1) + return false; + if (fp->FWrite(&best_iteration_, sizeof(best_iteration_), 1) != 1) + return false; + if (fp->FWrite(&worst_error_rate_, sizeof(worst_error_rate_), 1) != 1) + return false; + if (fp->FWrite(&worst_error_rates_, sizeof(worst_error_rates_), 1) != 1) + return false; + if (fp->FWrite(&worst_iteration_, sizeof(worst_iteration_), 1) != 1) + return false; + if (fp->FWrite(&stall_iteration_, sizeof(stall_iteration_), 1) != 1) + return false; + if (!best_model_data_.Serialize(fp)) return false; + if (!worst_model_data_.Serialize(fp)) return false; + if (amount != NO_BEST_TRAINER && !best_trainer_.Serialize(fp)) return false; + GenericVector sub_data; + if (sub_trainer_ != NULL && !SaveTrainingDump(LIGHT, sub_trainer_, &sub_data)) + return false; + if (!sub_data.Serialize(fp)) return false; + if (!best_error_history_.Serialize(fp)) return false; + if (!best_error_iterations_.Serialize(fp)) return false; + if 
(fp->FWrite(&improvement_steps_, sizeof(improvement_steps_), 1) != 1) + return false; + return true; +} + +// Reads from the given file. Returns false in case of error. +// If swap is true, assumes a big/little-endian swap is needed. +bool LSTMTrainer::DeSerialize(bool swap, TFile* fp) { + if (!LSTMRecognizer::DeSerialize(swap, fp)) return false; + if (fp->FRead(&learning_iteration_, sizeof(learning_iteration_), 1) != 1) { + // Special case. If we successfully decoded the recognizer, but fail here + // then it means we were just given a recognizer, so issue a warning and + // allow it. + tprintf("Warning: LSTMTrainer deserialized an LSTMRecognizer!\n"); + learning_iteration_ = 0; + network_->SetEnableTraining(TS_RE_ENABLE); + return true; + } + if (fp->FRead(&prev_sample_iteration_, sizeof(prev_sample_iteration_), 1) != + 1) + return false; + if (fp->FRead(&perfect_delay_, sizeof(perfect_delay_), 1) != 1) return false; + if (fp->FRead(&last_perfect_training_iteration_, + sizeof(last_perfect_training_iteration_), 1) != 1) + return false; + for (int i = 0; i < ET_COUNT; ++i) { + if (!error_buffers_[i].DeSerialize(swap, fp)) return false; + } + if (fp->FRead(&error_rates_, sizeof(error_rates_), 1) != 1) return false; + if (fp->FRead(&training_stage_, sizeof(training_stage_), 1) != 1) + return false; + uinT8 amount; + if (fp->FRead(&amount, sizeof(amount), 1) != 1) return false; + if (amount == LIGHT) return true; // Don't read the rest. 
+ if (fp->FRead(&best_error_rate_, sizeof(best_error_rate_), 1) != 1) + return false; + if (fp->FRead(&best_error_rates_, sizeof(best_error_rates_), 1) != 1) + return false; + if (fp->FRead(&best_iteration_, sizeof(best_iteration_), 1) != 1) + return false; + if (fp->FRead(&worst_error_rate_, sizeof(worst_error_rate_), 1) != 1) + return false; + if (fp->FRead(&worst_error_rates_, sizeof(worst_error_rates_), 1) != 1) + return false; + if (fp->FRead(&worst_iteration_, sizeof(worst_iteration_), 1) != 1) + return false; + if (fp->FRead(&stall_iteration_, sizeof(stall_iteration_), 1) != 1) + return false; + if (!best_model_data_.DeSerialize(swap, fp)) return false; + if (!worst_model_data_.DeSerialize(swap, fp)) return false; + if (amount != NO_BEST_TRAINER && !best_trainer_.DeSerialize(swap, fp)) + return false; + GenericVector sub_data; + if (!sub_data.DeSerialize(swap, fp)) return false; + delete sub_trainer_; + if (sub_data.empty()) { + sub_trainer_ = NULL; + } else { + sub_trainer_ = new LSTMTrainer(); + if (!ReadTrainingDump(sub_data, sub_trainer_)) return false; + } + if (!best_error_history_.DeSerialize(swap, fp)) return false; + if (!best_error_iterations_.DeSerialize(swap, fp)) return false; + if (fp->FRead(&improvement_steps_, sizeof(improvement_steps_), 1) != 1) + return false; + return true; +} + +// De-serializes the saved best_trainer_ into sub_trainer_, and adjusts the +// learning rates (by scaling reduction, or layer specific, according to +// NF_LAYER_SPECIFIC_LR). +void LSTMTrainer::StartSubtrainer(STRING* log_msg) { + delete sub_trainer_; + sub_trainer_ = new LSTMTrainer(); + if (!checkpoint_reader_->Run(best_trainer_, sub_trainer_)) { + *log_msg += " Failed to revert to previous best for trial!"; + delete sub_trainer_; + sub_trainer_ = NULL; + } else { + log_msg->add_str_int(" Trial sub_trainer_ from iteration ", + sub_trainer_->training_iteration()); + // Reduce learning rate so it doesn't diverge this time. 
+ sub_trainer_->ReduceLearningRates(this, log_msg); + // If it fails again, we will wait twice as long before reverting again. + int stall_offset = + learning_iteration() - sub_trainer_->learning_iteration(); + stall_iteration_ = learning_iteration() + 2 * stall_offset; + sub_trainer_->stall_iteration_ = stall_iteration_; + // Re-save the best trainer with the new learning rates and stall iteration. + checkpoint_writer_->Run(NO_BEST_TRAINER, sub_trainer_, &best_trainer_); + } +} + +// While the sub_trainer_ is behind the current training iteration and its +// training error is at least kSubTrainerMarginFraction better than the +// current training error, trains the sub_trainer_, and returns STR_UPDATED if +// it did anything. If it catches up, and has a better error rate than the +// current best, as well as a margin over the current error rate, then the +// trainer in *this is replaced with sub_trainer_, and STR_REPLACED is +// returned. STR_NONE is returned if the subtrainer wasn't good enough to +// receive any training iterations. +SubTrainerResult LSTMTrainer::UpdateSubtrainer(STRING* log_msg) { + double training_error = CharError(); + double sub_error = sub_trainer_->CharError(); + double sub_margin = (training_error - sub_error) / sub_error; + if (sub_margin >= kSubTrainerMarginFraction) { + log_msg->add_str_double(" sub_trainer=", sub_error); + log_msg->add_str_double(" margin=", 100.0 * sub_margin); + *log_msg += "\n"; + // Catch up to current iteration. 
+ int end_iteration = training_iteration(); + while (sub_trainer_->training_iteration() < end_iteration && + sub_margin >= kSubTrainerMarginFraction) { + int target_iteration = + sub_trainer_->training_iteration() + kNumPagesPerBatch; + while (sub_trainer_->training_iteration() < target_iteration) { + sub_trainer_->TrainOnLine(this, false); + } + STRING batch_log = "Sub:"; + sub_trainer_->PrepareLogMsg(&batch_log); + batch_log += "\n"; + tprintf("UpdateSubtrainer:%s", batch_log.string()); + *log_msg += batch_log; + sub_error = sub_trainer_->CharError(); + sub_margin = (training_error - sub_error) / sub_error; + } + if (sub_error < best_error_rate_ && + sub_margin >= kSubTrainerMarginFraction) { + // The sub_trainer_ has won the race to a new best. Switch to it. + GenericVector updated_trainer; + SaveTrainingDump(LIGHT, sub_trainer_, &updated_trainer); + ReadTrainingDump(updated_trainer, this); + log_msg->add_str_int(" Sub trainer wins at iteration ", + training_iteration()); + *log_msg += "\n"; + return STR_REPLACED; + } + return STR_UPDATED; + } + return STR_NONE; +} + +// Reduces network learning rates, either for everything, or for layers +// independently, according to NF_LAYER_SPECIFIC_LR. +void LSTMTrainer::ReduceLearningRates(LSTMTrainer* samples_trainer, + STRING* log_msg) { + if (network_->TestFlag(NF_LAYER_SPECIFIC_LR)) { + int num_reduced = ReduceLayerLearningRates( + kLearningRateDecay, kNumAdjustmentIterations, samples_trainer); + log_msg->add_str_int("\nReduced learning rate on layers: ", num_reduced); + } else { + ScaleLearningRate(kLearningRateDecay); + log_msg->add_str_double("\nReduced learning rate to :", learning_rate_); + } + *log_msg += "\n"; +} + +// Considers reducing the learning rate independently for each layer down by +// factor(<1), or leaving it the same, by double-training the given number of +// samples and minimizing the amount of changing of sign of weight updates. 
+// Even if it looks like all weights should remain the same, an adjustment
+// will be made to guarantee a different result when reverting to an old best.
+// Returns the number of layer learning rates that were reduced.
+int LSTMTrainer::ReduceLayerLearningRates(double factor, int num_samples,
+ LSTMTrainer* samples_trainer) {
+ enum WhichWay {
+ LR_DOWN, // Learning rate will go down by factor.
+ LR_SAME, // Learning rate will stay the same.
+ LR_COUNT // Size of arrays.
+ };
+ // Epsilon is so small that it may as well be zero, but still positive.
+ const double kEpsilon = 1.0e-30;
+ GenericVector layers = EnumerateLayers();
+ int num_layers = layers.size();
+ GenericVector num_weights;
+ num_weights.init_to_size(num_layers, 0);
+ GenericVector bad_sums[LR_COUNT];
+ GenericVector ok_sums[LR_COUNT];
+ for (int i = 0; i < LR_COUNT; ++i) {
+ bad_sums[i].init_to_size(num_layers, 0.0);
+ ok_sums[i].init_to_size(num_layers, 0.0);
+ }
+ double momentum_factor = 1.0 / (1.0 - momentum_);
+ GenericVector orig_trainer;
+ SaveTrainingDump(LIGHT, this, &orig_trainer);
+ for (int i = 0; i < num_layers; ++i) {
+ Network* layer = GetLayer(layers[i]);
+ num_weights[i] = layer->IsTraining() ? layer->num_weights() : 0;
+ }
+ int iteration = sample_iteration();
+ for (int s = 0; s < num_samples; ++s) {
+ // Which way will we modify the learning rate?
+ for (int ww = 0; ww < LR_COUNT; ++ww) {
+ // Transfer momentum to learning rate and adjust by the ww factor.
+ float ww_factor = momentum_factor;
+ if (ww == LR_DOWN) ww_factor *= factor;
+ // Make a copy of *this, so we can mess about without damaging anything.
+ LSTMTrainer copy_trainer;
+ copy_trainer.ReadTrainingDump(orig_trainer, &copy_trainer);
+ // Clear the updates, doing nothing else.
+ copy_trainer.network_->Update(0.0, 0.0, 0);
+ // Adjust the learning rate in each layer.
+ for (int i = 0; i < num_layers; ++i) {
+ if (num_weights[i] == 0) continue;
+ copy_trainer.ScaleLayerLearningRate(layers[i], ww_factor);
+ }
+ copy_trainer.SetIteration(iteration);
+ // Train on the sample, but keep the update in updates_ instead of
+ // applying to the weights.
+ const ImageData* trainingdata =
+ copy_trainer.TrainOnLine(samples_trainer, true);
+ if (trainingdata == NULL) continue;
+ // We'll now use this trainer again for each layer.
+ GenericVector updated_trainer;
+ SaveTrainingDump(LIGHT, &copy_trainer, &updated_trainer);
+ for (int i = 0; i < num_layers; ++i) {
+ if (num_weights[i] == 0) continue;
+ LSTMTrainer layer_trainer;
+ layer_trainer.ReadTrainingDump(updated_trainer, &layer_trainer);
+ Network* layer = layer_trainer.GetLayer(layers[i]);
+ // Update the weights in just the layer, and also zero the updates
+ // matrix (to epsilon).
+ layer->Update(0.0, kEpsilon, 0);
+ // Train again on the same sample, again holding back the updates.
+ layer_trainer.TrainOnLine(trainingdata, true);
+ // Count the sign changes in the updates in layer vs in copy_trainer.
+ float before_bad = bad_sums[ww][i]; + float before_ok = ok_sums[ww][i]; + layer->CountAlternators(*copy_trainer.GetLayer(layers[i]), + &ok_sums[ww][i], &bad_sums[ww][i]); + float bad_frac = + bad_sums[ww][i] + ok_sums[ww][i] - before_bad - before_ok; + if (bad_frac > 0.0f) + bad_frac = (bad_sums[ww][i] - before_bad) / bad_frac; + } + } + ++iteration; + } + int num_lowered = 0; + for (int i = 0; i < num_layers; ++i) { + if (num_weights[i] == 0) continue; + Network* layer = GetLayer(layers[i]); + float lr = GetLayerLearningRate(layers[i]); + double total_down = bad_sums[LR_DOWN][i] + ok_sums[LR_DOWN][i]; + double total_same = bad_sums[LR_SAME][i] + ok_sums[LR_SAME][i]; + double frac_down = bad_sums[LR_DOWN][i] / total_down; + double frac_same = bad_sums[LR_SAME][i] / total_same; + tprintf("Layer %d=%s: lr %g->%g%%, lr %g->%g%%", i, layer->name().string(), + lr * factor, 100.0 * frac_down, lr, 100.0 * frac_same); + if (frac_down < frac_same * kImprovementFraction) { + tprintf(" REDUCED\n"); + ScaleLayerLearningRate(layers[i], factor); + ++num_lowered; + } else { + tprintf(" SAME\n"); + } + } + if (num_lowered == 0) { + // Just lower everything to make sure. + for (int i = 0; i < num_layers; ++i) { + if (num_weights[i] > 0) { + ScaleLayerLearningRate(layers[i], factor); + ++num_lowered; + } + } + } + return num_lowered; +} + +// Converts the string to integer class labels, with appropriate null_char_s +// in between if not in SimpleTextOutput mode. Returns false on failure. 
+/* static */ +bool LSTMTrainer::EncodeString(const STRING& str, const UNICHARSET& unicharset, + const UnicharCompress* recoder, bool simple_text, + int null_char, GenericVector* labels) { + if (str.string() == NULL || str.length() <= 0) { + tprintf("Empty truth string!\n"); + return false; + } + int err_index; + GenericVector internal_labels; + labels->truncate(0); + if (!simple_text) labels->push_back(null_char); + if (unicharset.encode_string(str.string(), true, &internal_labels, NULL, + &err_index)) { + bool success = true; + for (int i = 0; i < internal_labels.size(); ++i) { + if (recoder != NULL) { + // Re-encode labels via recoder. + RecodedCharID code; + int len = recoder->EncodeUnichar(internal_labels[i], &code); + if (len > 0) { + for (int j = 0; j < len; ++j) { + labels->push_back(code(j)); + if (!simple_text) labels->push_back(null_char); + } + } else { + success = false; + err_index = 0; + break; + } + } else { + labels->push_back(internal_labels[i]); + if (!simple_text) labels->push_back(null_char); + } + } + if (success) return true; + } + tprintf("Encoding of string failed! Failure bytes:"); + while (err_index < str.length()) { + tprintf(" %x", str[err_index++]); + } + tprintf("\n"); + return false; +} + +// Performs forward-backward on the given trainingdata. +// Returns a Trainability enum to indicate the suitability of the sample. +Trainability LSTMTrainer::TrainOnLine(const ImageData* trainingdata, + bool batch) { + NetworkIO fwd_outputs, targets; + Trainability trainable = + PrepareForBackward(trainingdata, &fwd_outputs, &targets); + ++sample_iteration_; + if (trainable == UNENCODABLE || trainable == NOT_BOXED) { + return trainable; // Sample was unusable. + } + bool debug = debug_interval_ > 0 && + training_iteration() % debug_interval_ == 0; + // Run backprop on the output. 
+ NetworkIO bp_deltas; + if (network_->IsTraining() && + (trainable != PERFECT || + training_iteration() > + last_perfect_training_iteration_ + perfect_delay_)) { + network_->Backward(debug, targets, &scratch_space_, &bp_deltas); + network_->Update(learning_rate_, batch ? -1.0f : momentum_, + training_iteration_ + 1); + } +#ifndef GRAPHICS_DISABLED + if (debug_interval_ == 1 && debug_win_ != NULL) { + delete debug_win_->AwaitEvent(SVET_CLICK); + } +#endif // GRAPHICS_DISABLED + // Roll the memory of past means. + RollErrorBuffers(); + return trainable; +} + +// Prepares the ground truth, runs forward, and prepares the targets. +// Returns a Trainability enum to indicate the suitability of the sample. +Trainability LSTMTrainer::PrepareForBackward(const ImageData* trainingdata, + NetworkIO* fwd_outputs, + NetworkIO* targets) { + if (trainingdata == NULL) { + tprintf("Null trainingdata.\n"); + return UNENCODABLE; + } + // Ensure repeatability of random elements even across checkpoints. + bool debug = debug_interval_ > 0 && + training_iteration() % debug_interval_ == 0; + GenericVector truth_labels; + if (!EncodeString(trainingdata->transcription(), &truth_labels)) { + tprintf("Can't encode transcription: %s\n", + trainingdata->transcription().string()); + return UNENCODABLE; + } + int w = 0; + while (w < truth_labels.size() && + (truth_labels[w] == UNICHAR_SPACE || truth_labels[w] == null_char_)) + ++w; + if (w == truth_labels.size()) { + tprintf("Blank transcription: %s\n", + trainingdata->transcription().string()); + return UNENCODABLE; + } + float image_scale; + NetworkIO inputs; + bool invert = trainingdata->boxes().empty(); + if (!RecognizeLine(*trainingdata, invert, debug, invert, 0.0f, &image_scale, + &inputs, fwd_outputs)) { + tprintf("Image not trainable\n"); + return UNENCODABLE; + } + targets->Resize(*fwd_outputs, network_->NumOutputs()); + LossType loss_type = OutputLossType(); + if (loss_type == LT_SOFTMAX) { + if (!ComputeTextTargets(*fwd_outputs, 
truth_labels, targets)) { + tprintf("Compute simple targets failed!\n"); + return UNENCODABLE; + } + } else if (loss_type == LT_CTC) { + if (!ComputeCTCTargets(truth_labels, fwd_outputs, targets)) { + tprintf("Compute CTC targets failed!\n"); + return UNENCODABLE; + } + } else { + tprintf("Logistic outputs not implemented yet!\n"); + return UNENCODABLE; + } + GenericVector ocr_labels; + GenericVector xcoords; + LabelsFromOutputs(*fwd_outputs, 0.0f, &ocr_labels, &xcoords); + // CTC does not produce correct target labels to begin with. + if (loss_type != LT_CTC) { + LabelsFromOutputs(*targets, 0.0f, &truth_labels, &xcoords); + } + if (!DebugLSTMTraining(inputs, *trainingdata, *fwd_outputs, truth_labels, + *targets)) { + tprintf("Input width was %d\n", inputs.Width()); + return UNENCODABLE; + } + STRING ocr_text = DecodeLabels(ocr_labels); + STRING truth_text = DecodeLabels(truth_labels); + targets->SubtractAllFromFloat(*fwd_outputs); + if (debug_interval_ != 0) { + tprintf("Iteration %d: BEST OCR TEXT : %s\n", training_iteration(), + ocr_text.string()); + } + double char_error = ComputeCharError(truth_labels, ocr_labels); + double word_error = ComputeWordError(&truth_text, &ocr_text); + double delta_error = ComputeErrorRates(*targets, char_error, word_error); + if (debug_interval_ != 0) { + tprintf("File %s page %d %s:\n", trainingdata->imagefilename().string(), + trainingdata->page_number(), delta_error == 0.0 ? "(Perfect)" : ""); + } + if (delta_error == 0.0) return PERFECT; + if (targets->AnySuspiciousTruth(kHighConfidence)) return HI_PRECISION_ERR; + return TRAINABLE; +} + +// Writes the trainer to memory, so that the current training state can be +// restored. +bool LSTMTrainer::SaveTrainingDump(SerializeAmount serialize_amount, + const LSTMTrainer* trainer, + GenericVector* data) const { + TFile fp; + fp.OpenWrite(data); + trainer->serialize_amount_ = serialize_amount; + return trainer->Serialize(&fp); +} + +// Reads previously saved trainer from memory. 
+bool LSTMTrainer::ReadTrainingDump(const GenericVector& data, + LSTMTrainer* trainer) { + return trainer->ReadSizedTrainingDump(&data[0], data.size()); +} + +bool LSTMTrainer::ReadSizedTrainingDump(const char* data, int size) { + TFile fp; + fp.Open(data, size); + return DeSerialize(false, &fp); +} + +// Writes the recognizer to memory, so that it can be used for testing later. +void LSTMTrainer::SaveRecognitionDump(GenericVector* data) const { + TFile fp; + fp.OpenWrite(data); + network_->SetEnableTraining(TS_TEMP_DISABLE); + ASSERT_HOST(LSTMRecognizer::Serialize(&fp)); + network_->SetEnableTraining(TS_RE_ENABLE); +} + +// Reads and returns a previously saved recognizer from memory. +LSTMRecognizer* LSTMTrainer::ReadRecognitionDump( + const GenericVector& data) { + TFile fp; + fp.Open(&data[0], data.size()); + LSTMRecognizer* recognizer = new LSTMRecognizer; + ASSERT_HOST(recognizer->DeSerialize(false, &fp)); + return recognizer; +} + +// Returns a suitable filename for a training dump, based on the model_base_, +// the iteration and the error rates. +STRING LSTMTrainer::DumpFilename() const { + STRING filename; + filename.add_str_double(model_base_.string(), best_error_rate_); + filename.add_str_int("_", best_iteration_); + filename += ".lstm"; + return filename; +} + +// Fills the whole error buffer of the given type with the given value. +void LSTMTrainer::FillErrorBuffer(double new_error, ErrorTypes type) { + for (int i = 0; i < kRollingBufferSize_; ++i) + error_buffers_[type][i] = new_error; + error_rates_[type] = 100.0 * new_error; +} + +// Factored sub-constructor sets up reasonable default values. +void LSTMTrainer::EmptyConstructor() { + align_win_ = NULL; + target_win_ = NULL; + ctc_win_ = NULL; + recon_win_ = NULL; + checkpoint_iteration_ = 0; + serialize_amount_ = FULL; + training_stage_ = 0; + num_training_stages_ = 2; + InitIterations(); +} + +// Sets the unicharset properties using the given script_dir as a source of +// script unicharsets. 
If the flag TF_COMPRESS_UNICHARSET is true, also sets +// up the recoder_ to simplify the unicharset. +void LSTMTrainer::SetUnicharsetProperties(const STRING& script_dir) { + tprintf("Setting unichar properties\n"); + for (int s = 0; s < GetUnicharset().get_script_table_size(); ++s) { + if (strcmp("NULL", GetUnicharset().get_script_from_script_id(s)) == 0) + continue; + // Load the unicharset for the script if available. + STRING filename = script_dir + "/" + + GetUnicharset().get_script_from_script_id(s) + + ".unicharset"; + UNICHARSET script_set; + GenericVector data; + if ((*file_reader_)(filename, &data) && + script_set.load_from_inmemory_file(&data[0], data.size())) { + tprintf("Setting properties for script %s\n", + GetUnicharset().get_script_from_script_id(s)); + ccutil_.unicharset.SetPropertiesFromOther(script_set); + } + } + if (IsRecoding()) { + STRING filename = script_dir + "/radical-stroke.txt"; + GenericVector data; + if ((*file_reader_)(filename, &data)) { + data += '\0'; + STRING stroke_table = &data[0]; + if (recoder_.ComputeEncoding(GetUnicharset(), null_char_, + &stroke_table)) { + RecodedCharID code; + recoder_.EncodeUnichar(null_char_, &code); + null_char_ = code(0); + // Space should encode as itself. + recoder_.EncodeUnichar(UNICHAR_SPACE, &code); + ASSERT_HOST(code(0) == UNICHAR_SPACE); + return; + } + } else { + tprintf("Failed to load radical-stroke info from: %s\n", + filename.string()); + } + training_flags_ &= ~TF_COMPRESS_UNICHARSET; + } +} + +// Outputs the string and periodically displays the given network inputs +// as an image in the given window, and the corresponding labels at the +// corresponding x_starts. +// Returns false if the truth string is empty. 
+bool LSTMTrainer::DebugLSTMTraining(const NetworkIO& inputs, + const ImageData& trainingdata, + const NetworkIO& fwd_outputs, + const GenericVector& truth_labels, + const NetworkIO& outputs) { + const STRING& truth_text = DecodeLabels(truth_labels); + if (truth_text.string() == NULL || truth_text.length() <= 0) { + tprintf("Empty truth string at decode time!\n"); + return false; + } + if (debug_interval_ != 0) { + // Get class labels, xcoords and string. + GenericVector labels; + GenericVector xcoords; + LabelsFromOutputs(outputs, 0.0f, &labels, &xcoords); + STRING text = DecodeLabels(labels); + tprintf("Iteration %d: ALIGNED TRUTH : %s\n", + training_iteration(), text.string()); + if (debug_interval_ > 0 && training_iteration() % debug_interval_ == 0) { + tprintf("TRAINING activation path for truth string %s\n", + truth_text.string()); + DebugActivationPath(outputs, labels, xcoords); + DisplayForward(inputs, labels, xcoords, "LSTMTraining", &align_win_); + if (OutputLossType() == LT_CTC) { + DisplayTargets(fwd_outputs, "CTC Outputs", &ctc_win_); + DisplayTargets(outputs, "CTC Targets", &target_win_); + } + } + } + return true; +} + +// Displays the network targets as line a line graph. +void LSTMTrainer::DisplayTargets(const NetworkIO& targets, + const char* window_name, ScrollView** window) { +#ifndef GRAPHICS_DISABLED // do nothing if there's no graphics. 
+ int width = targets.Width(); + int num_features = targets.NumFeatures(); + Network::ClearWindow(true, window_name, width * kTargetXScale, kTargetYScale, + window); + for (int c = 0; c < num_features; ++c) { + int color = c % (ScrollView::GREEN_YELLOW - 1) + 2; + (*window)->Pen(static_cast(color)); + int start_t = -1; + for (int t = 0; t < width; ++t) { + double target = targets.f(t)[c]; + target *= kTargetYScale; + if (target >= 1) { + if (start_t < 0) { + (*window)->SetCursor(t - 1, 0); + start_t = t; + } + (*window)->DrawTo(t, target); + } else if (start_t >= 0) { + (*window)->DrawTo(t, 0); + (*window)->DrawTo(start_t - 1, 0); + start_t = -1; + } + } + if (start_t >= 0) { + (*window)->DrawTo(width, 0); + (*window)->DrawTo(start_t - 1, 0); + } + } + (*window)->Update(); +#endif // GRAPHICS_DISABLED +} + +// Builds a no-compromises target where the first positions should be the +// truth labels and the rest is padded with the null_char_. +bool LSTMTrainer::ComputeTextTargets(const NetworkIO& outputs, + const GenericVector& truth_labels, + NetworkIO* targets) { + if (truth_labels.size() > targets->Width()) { + tprintf("Error: transcription %s too long to fit into target of width %d\n", + DecodeLabels(truth_labels).string(), targets->Width()); + return false; + } + for (int i = 0; i < truth_labels.size() && i < targets->Width(); ++i) { + targets->SetActivations(i, truth_labels[i], 1.0); + } + for (int i = truth_labels.size(); i < targets->Width(); ++i) { + targets->SetActivations(i, null_char_, 1.0); + } + return true; +} + +// Builds a target using standard CTC. truth_labels should be pre-padded with +// nulls wherever desired. They don't have to be between all labels. +// outputs is input-output, as it gets clipped to minimum probability. +bool LSTMTrainer::ComputeCTCTargets(const GenericVector& truth_labels, + NetworkIO* outputs, NetworkIO* targets) { + // Bottom-clip outputs to a minimum probability. 
+ CTC::NormalizeProbs(outputs); + return CTC::ComputeCTCTargets(truth_labels, null_char_, + outputs->float_array(), targets); +} + +// Computes network errors, and stores the results in the rolling buffers, +// along with the supplied text_error. +// Returns the delta error of the current sample (not running average.) +double LSTMTrainer::ComputeErrorRates(const NetworkIO& deltas, + double char_error, double word_error) { + UpdateErrorBuffer(ComputeRMSError(deltas), ET_RMS); + // Delta error is the fraction of timesteps with >0.5 error in the top choice + // score. If zero, then the top choice characters are guaranteed correct, + // even when there is residue in the RMS error. + double delta_error = ComputeWinnerError(deltas); + UpdateErrorBuffer(delta_error, ET_DELTA); + UpdateErrorBuffer(word_error, ET_WORD_RECERR); + UpdateErrorBuffer(char_error, ET_CHAR_ERROR); + // Skip ratio measures the difference between sample_iteration_ and + // training_iteration_, which reflects the number of unusable samples, + // usually due to unencodable truth text, or the text not fitting in the + // space for the output. + double skip_count = sample_iteration_ - prev_sample_iteration_; + UpdateErrorBuffer(skip_count, ET_SKIP_RATIO); + return delta_error; +} + +// Computes the network activation RMS error rate. +double LSTMTrainer::ComputeRMSError(const NetworkIO& deltas) { + double total_error = 0.0; + int width = deltas.Width(); + int num_classes = deltas.NumFeatures(); + for (int t = 0; t < width; ++t) { + const float* class_errs = deltas.f(t); + for (int c = 0; c < num_classes; ++c) { + double error = class_errs[c]; + total_error += error * error; + } + } + return sqrt(total_error / (width * num_classes)); +} + +// Computes network activation winner error rate. (Number of values that are +// in error by >= 0.5 divided by number of time-steps.) More closely related +// to final character error than RMS, but still directly calculable from +// just the deltas. 
Because of the binary nature of the targets, zero winner +// error is a sufficient but not necessary condition for zero char error. +double LSTMTrainer::ComputeWinnerError(const NetworkIO& deltas) { + int num_errors = 0; + int width = deltas.Width(); + int num_classes = deltas.NumFeatures(); + for (int t = 0; t < width; ++t) { + const float* class_errs = deltas.f(t); + for (int c = 0; c < num_classes; ++c) { + float abs_delta = fabs(class_errs[c]); + // TODO(rays) Filtering cases where the delta is very large to cut out + // GT errors doesn't work. Find a better way or get better truth. + if (0.5 <= abs_delta) + ++num_errors; + } + } + return static_cast(num_errors) / width; +} + +// Computes a very simple bag of chars char error rate. +double LSTMTrainer::ComputeCharError(const GenericVector& truth_str, + const GenericVector& ocr_str) { + GenericVector label_counts; + label_counts.init_to_size(NumOutputs(), 0); + int truth_size = 0; + for (int i = 0; i < truth_str.size(); ++i) { + if (truth_str[i] != null_char_) { + ++label_counts[truth_str[i]]; + ++truth_size; + } + } + for (int i = 0; i < ocr_str.size(); ++i) { + if (ocr_str[i] != null_char_) { + --label_counts[ocr_str[i]]; + } + } + int char_errors = 0; + for (int i = 0; i < label_counts.size(); ++i) { + char_errors += abs(label_counts[i]); + } + if (truth_size == 0) { + return (char_errors == 0) ? 0.0 : 1.0; + } + return static_cast(char_errors) / truth_size; +} + +// Computes word recall error rate using a very simple bag of words algorithm. +// NOTE that this is destructive on both input strings. 
+double LSTMTrainer::ComputeWordError(STRING* truth_str, STRING* ocr_str) { + typedef std::unordered_map > StrMap; + GenericVector truth_words, ocr_words; + truth_str->split(' ', &truth_words); + if (truth_words.empty()) return 0.0; + ocr_str->split(' ', &ocr_words); + StrMap word_counts; + for (int i = 0; i < truth_words.size(); ++i) { + std::string truth_word(truth_words[i].string()); + StrMap::iterator it = word_counts.find(truth_word); + if (it == word_counts.end()) + word_counts.insert(std::make_pair(truth_word, 1)); + else + ++it->second; + } + for (int i = 0; i < ocr_words.size(); ++i) { + std::string ocr_word(ocr_words[i].string()); + StrMap::iterator it = word_counts.find(ocr_word); + if (it == word_counts.end()) + word_counts.insert(std::make_pair(ocr_word, -1)); + else + --it->second; + } + int word_recall_errs = 0; + for (StrMap::const_iterator it = word_counts.begin(); it != word_counts.end(); + ++it) { + if (it->second > 0) word_recall_errs += it->second; + } + return static_cast(word_recall_errs) / truth_words.size(); +} + +// Updates the error buffer and corresponding mean of the given type with +// the new_error. +void LSTMTrainer::UpdateErrorBuffer(double new_error, ErrorTypes type) { + int index = training_iteration_ % kRollingBufferSize_; + error_buffers_[type][index] = new_error; + // Compute the mean error. + int mean_count = MIN(training_iteration_ + 1, error_buffers_[type].size()); + double buffer_sum = 0.0; + for (int i = 0; i < mean_count; ++i) buffer_sum += error_buffers_[type][i]; + double mean = buffer_sum / mean_count; + // Trim precision to 1/1000 of 1%. + error_rates_[type] = IntCastRounded(100000.0 * mean) / 1000.0; +} + +// Rolls error buffers and reports the current means. 
+void LSTMTrainer::RollErrorBuffers() { + prev_sample_iteration_ = sample_iteration_; + if (NewSingleError(ET_DELTA) > 0.0) + ++learning_iteration_; + else + last_perfect_training_iteration_ = training_iteration_; + ++training_iteration_; + if (debug_interval_ != 0) { + tprintf("Mean rms=%g%%, delta=%g%%, train=%g%%(%g%%), skip ratio=%g%%\n", + error_rates_[ET_RMS], error_rates_[ET_DELTA], + error_rates_[ET_CHAR_ERROR], error_rates_[ET_WORD_RECERR], + error_rates_[ET_SKIP_RATIO]); + } +} + +// Given that error_rate is either a new min or max, updates the best/worst +// error rates, and record of progress. +// Tester is an externally supplied callback function that tests on some +// data set with a given model and records the error rates in a graph. +STRING LSTMTrainer::UpdateErrorGraph(int iteration, double error_rate, + const GenericVector& model_data, + TestCallback tester) { + if (error_rate > best_error_rate_ + && iteration < best_iteration_ + kErrorGraphInterval) { + // Too soon to record a new point. + if (tester != NULL) + return tester->Run(worst_iteration_, NULL, worst_model_data_, + CurrentTrainingStage()); + else + return ""; + } + STRING result; + // NOTE: there are 2 asymmetries here: + // 1. We are computing the global minimum, but the local maximum in between. + // 2. If the tester returns an empty string, indicating that it is busy, + // call it repeatedly on new local maxima to test the previous min, but + // not the other way around, as there is little point testing the maxima + // between very frequent minima. + if (error_rate < best_error_rate_) { + // This is a new (global) minimum. 
+ if (tester != NULL) { + result = tester->Run(worst_iteration_, worst_error_rates_, + worst_model_data_, CurrentTrainingStage()); + worst_model_data_.truncate(0); + best_model_data_ = model_data; + } + best_error_rate_ = error_rate; + memcpy(best_error_rates_, error_rates_, sizeof(error_rates_)); + best_iteration_ = iteration; + best_error_history_.push_back(error_rate); + best_error_iterations_.push_back(iteration); + // Compute 2% decay time. + double two_percent_more = error_rate + 2.0; + int i; + for (i = best_error_history_.size() - 1; + i >= 0 && best_error_history_[i] < two_percent_more; --i) { + } + int old_iteration = i >= 0 ? best_error_iterations_[i] : 0; + improvement_steps_ = iteration - old_iteration; + tprintf("2 Percent improvement time=%d, best error was %g @ %d\n", + improvement_steps_, i >= 0 ? best_error_history_[i] : 100.0, + old_iteration); + } else if (error_rate > best_error_rate_) { + // This is a new (local) maximum. + if (tester != NULL) { + if (best_model_data_.empty()) { + // Allow for multiple data points with "worst" error rate. + result = tester->Run(worst_iteration_, worst_error_rates_, + worst_model_data_, CurrentTrainingStage()); + } else { + result = tester->Run(best_iteration_, best_error_rates_, + best_model_data_, CurrentTrainingStage()); + } + if (result.length() > 0) + best_model_data_.truncate(0); + worst_model_data_ = model_data; + } + } + worst_error_rate_ = error_rate; + memcpy(worst_error_rates_, error_rates_, sizeof(error_rates_)); + worst_iteration_ = iteration; + return result; +} + +} // namespace tesseract. diff --git a/lstm/lstmtrainer.h b/lstm/lstmtrainer.h new file mode 100644 index 00000000..2054284d --- /dev/null +++ b/lstm/lstmtrainer.h @@ -0,0 +1,481 @@ +/////////////////////////////////////////////////////////////////////// +// File: lstmtrainer.h +// Description: Top-level line trainer class for LSTM-based networks. 
+// Author: Ray Smith +// Created: Fri May 03 09:07:06 PST 2013 +// +// (C) Copyright 2013, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_LSTM_LSTMTRAINER_H_ +#define TESSERACT_LSTM_LSTMTRAINER_H_ + +#include "imagedata.h" +#include "lstmrecognizer.h" +#include "rect.h" +#include "tesscallback.h" + +namespace tesseract { + +class LSTM; +class LSTMTrainer; +class Parallel; +class Reversed; +class Softmax; +class Series; + +// Enum for the types of errors that are counted. +enum ErrorTypes { + ET_RMS, // RMS activation error. + ET_DELTA, // Number of big errors in deltas. + ET_WORD_RECERR, // Output text string word recall error. + ET_CHAR_ERROR, // Output text string total char error. + ET_SKIP_RATIO, // Fraction of samples skipped. + ET_COUNT // For array sizing. +}; + +// Enum for the trainability_ flags. +enum Trainability { + TRAINABLE, // Non-zero delta error. + PERFECT, // Zero delta error. + UNENCODABLE, // Not trainable due to coding/alignment trouble. + HI_PRECISION_ERR, // Hi confidence disagreement. + NOT_BOXED, // Early in training and has no character boxes. +}; + +// Enum to define the amount of data to get serialized. +enum SerializeAmount { + LIGHT, // Minimal data for remote training. + NO_BEST_TRAINER, // Save an empty vector in place of best_trainer_. + FULL, // All data including best_trainer_. 
+}; + +// Enum to indicate how the sub_trainer_ training went. +enum SubTrainerResult { + STR_NONE, // Did nothing as not good enough. + STR_UPDATED, // Subtrainer was updated, but didn't replace *this. + STR_REPLACED // Subtrainer replaced *this. +}; + +class LSTMTrainer; +// Function to restore the trainer state from a given checkpoint. +// Returns false on failure. +typedef TessResultCallback2&, LSTMTrainer*>* + CheckPointReader; +// Function to save a checkpoint of the current trainer state. +// Returns false on failure. SerializeAmount determines the amount of the +// trainer to serialize, typically used for saving the best state. +typedef TessResultCallback3*>* CheckPointWriter; +// Function to compute and record error rates on some external test set(s). +// Args are: iteration, mean errors, model, training stage. +// Returns a STRING containing logging information about the tests. +typedef TessResultCallback4&, int>* TestCallback; + +// Trainer class for LSTM networks. Most of the effort is in creating the +// ideal target outputs from the transcription. A box file is used if it is +// available, otherwise estimates of the char widths from the unicharset are +// used to guide a DP search for the best fit to the transcription. +class LSTMTrainer : public LSTMRecognizer { + public: + LSTMTrainer(); + // Callbacks may be null, in which case defaults are used. + LSTMTrainer(FileReader file_reader, FileWriter file_writer, + CheckPointReader checkpoint_reader, + CheckPointWriter checkpoint_writer, + const char* model_base, const char* checkpoint_name, + int debug_interval, inT64 max_memory); + virtual ~LSTMTrainer(); + + // Tries to deserialize a trainer from the given file and silently returns + // false in case of failure. + bool TryLoadingCheckpoint(const char* filename); + + // Initializes the character set encode/decode mechanism. + // train_flags control training behavior according to the TrainingFlags + // enum, including character set encoding. 
+ // script_dir is required for TF_COMPRESS_UNICHARSET, and, if provided, + // fully initializes the unicharset from the universal unicharsets. + // Note: Call before InitNetwork! + void InitCharSet(const UNICHARSET& unicharset, const STRING& script_dir, + int train_flags); + // Initializes the character set encode/decode mechanism directly from a + // previously setup UNICHARSET and UnicharCompress. + // ctc_mode controls how the truth text is mapped to the network targets. + // Note: Call before InitNetwork! + void InitCharSet(const UNICHARSET& unicharset, + const UnicharCompress& recoder); + + // Initializes the trainer with a network_spec in the network description + // net_flags control network behavior according to the NetworkFlags enum. + // There isn't really much difference between them - only where the effects + // are implemented. + // For other args see NetworkBuilder::InitNetwork. + // Note: Be sure to call InitCharSet before InitNetwork! + bool InitNetwork(const STRING& network_spec, int append_index, int net_flags, + float weight_range, float learning_rate, float momentum); + // Initializes a trainer from a serialized TFNetworkModel proto. + // Returns the global step of TensorFlow graph or 0 if failed. + // Building a compatible TF graph: See tfnetwork.proto. + int InitTensorFlowNetwork(const std::string& tf_proto); + // Resets all the iteration counters for fine tuning or training a head, + // where we want the error reporting to reset. + void InitIterations(); + + // Accessors. 
+ double ActivationError() const { + return error_rates_[ET_DELTA]; + } + double CharError() const { return error_rates_[ET_CHAR_ERROR]; } + const double* error_rates() const { + return error_rates_; + } + double best_error_rate() const { + return best_error_rate_; + } + int best_iteration() const { + return best_iteration_; + } + int learning_iteration() const { return learning_iteration_; } + int improvement_steps() const { return improvement_steps_; } + void set_perfect_delay(int delay) { perfect_delay_ = delay; } + const GenericVector& best_trainer() const { return best_trainer_; } + // Returns the error that was just calculated by PrepareForBackward. + double NewSingleError(ErrorTypes type) const { + return error_buffers_[type][training_iteration() % kRollingBufferSize_]; + } + // Returns the error that was just calculated by TrainOnLine. Since + // TrainOnLine rolls the error buffers, this is one further back than + // NewSingleError. + double LastSingleError(ErrorTypes type) const { + return error_buffers_[type] + [(training_iteration() + kRollingBufferSize_ - 1) % + kRollingBufferSize_]; + } + const DocumentCache& training_data() const { + return training_data_; + } + DocumentCache* mutable_training_data() { return &training_data_; } + + // If the training sample is usable, grid searches for the optimal + // dict_ratio/cert_offset, and returns the results in a string of space- + // separated triplets of ratio,offset=worderr. + Trainability GridSearchDictParams( + const ImageData* trainingdata, int iteration, double min_dict_ratio, + double dict_ratio_step, double max_dict_ratio, double min_cert_offset, + double cert_offset_step, double max_cert_offset, STRING* results); + + void SetSerializeMode(SerializeAmount serialize_amount) const { + serialize_amount_ = serialize_amount; + } + + // Provides output on the distribution of weight values. 
+ void DebugNetwork(); + + // Loads a set of lstmf files that were created using the lstm.train config to + // tesseract into memory ready for training. Returns false if nothing was + // loaded. + bool LoadAllTrainingData(const GenericVector& filenames); + + // Keeps track of best and locally worst error rate, using internally computed + // values. See MaintainCheckpointsSpecific for more detail. + bool MaintainCheckpoints(TestCallback tester, STRING* log_msg); + // Keeps track of best and locally worst error_rate (whatever it is) and + // launches tests using rec_model, when a new min or max is reached. + // Writes checkpoints using train_model at appropriate times and builds and + // returns a log message to indicate progress. Returns false if nothing + // interesting happened. + bool MaintainCheckpointsSpecific(int iteration, + const GenericVector* train_model, + const GenericVector* rec_model, + TestCallback tester, STRING* log_msg); + // Builds a string containing a progress message with current error rates. + void PrepareLogMsg(STRING* log_msg) const; + // Appends iteration learning_iteration()/training_iteration()/ + // sample_iteration() to the log_msg. + void LogIterations(const char* intro_str, STRING* log_msg) const; + + // TODO(rays) Add curriculum learning. + // Returns true and increments the training_stage_ if the error rate has just + // passed through the given threshold for the first time. + bool TransitionTrainingStage(float error_threshold); + // Returns the current training stage. + int CurrentTrainingStage() const { return training_stage_; } + + // Writes to the given file. Returns false in case of error. + virtual bool Serialize(TFile* fp) const; + // Reads from the given file. Returns false in case of error. + // If swap is true, assumes a big/little-endian swap is needed. 
+ virtual bool DeSerialize(bool swap, TFile* fp); + + // De-serializes the saved best_trainer_ into sub_trainer_, and adjusts the + // learning rates (by scaling reduction, or layer specific, according to + // NF_LAYER_SPECIFIC_LR). + void StartSubtrainer(STRING* log_msg); + // While the sub_trainer_ is behind the current training iteration and its + // training error is at least kSubTrainerMarginFraction better than the + // current training error, trains the sub_trainer_, and returns STR_UPDATED if + // it did anything. If it catches up, and has a better error rate than the + // current best, as well as a margin over the current error rate, then the + // trainer in *this is replaced with sub_trainer_, and STR_REPLACED is + // returned. STR_NONE is returned if the subtrainer wasn't good enough to + // receive any training iterations. + SubTrainerResult UpdateSubtrainer(STRING* log_msg); + // Reduces network learning rates, either for everything, or for layers + // independently, according to NF_LAYER_SPECIFIC_LR. + void ReduceLearningRates(LSTMTrainer* samples_trainer, STRING* log_msg); + // Considers reducing the learning rate independently for each layer down by + // factor(<1), or leaving it the same, by double-training the given number of + // samples and minimizing the amount of changing of sign of weight updates. + // Even if it looks like all weights should remain the same, an adjustment + // will be made to guarantee a different result when reverting to an old best. + // Returns the number of layer learning rates that were reduced. + int ReduceLayerLearningRates(double factor, int num_samples, + LSTMTrainer* samples_trainer); + + // Converts the string to integer class labels, with appropriate null_char_s + // in between if not in SimpleTextOutput mode. Returns false on failure. + bool EncodeString(const STRING& str, GenericVector* labels) const { + return EncodeString(str, GetUnicharset(), IsRecoding() ? 
&recoder_ : NULL, + SimpleTextOutput(), null_char_, labels); + } + // Static version operates on supplied unicharset, encoder, simple_text. + static bool EncodeString(const STRING& str, const UNICHARSET& unicharset, + const UnicharCompress* recoder, bool simple_text, + int null_char, GenericVector* labels); + + // Converts the network to int if not already. + void ConvertToInt() { + if ((training_flags_ & TF_INT_MODE) == 0) { + network_->ConvertToInt(); + training_flags_ |= TF_INT_MODE; + } + } + + // Performs forward-backward on the given trainingdata. + // Returns the sample that was used or NULL if the next sample was deemed + // unusable. samples_trainer could be this or an alternative trainer that + // holds the training samples. + const ImageData* TrainOnLine(LSTMTrainer* samples_trainer, bool batch) { + int sample_index = sample_iteration(); + const ImageData* image = + samples_trainer->training_data_.GetPageBySerial(sample_index); + if (image != NULL) { + Trainability trainable = TrainOnLine(image, batch); + if (trainable == UNENCODABLE || trainable == NOT_BOXED) { + return NULL; // Sample was unusable. + } + } else { + ++sample_iteration_; + } + return image; + } + Trainability TrainOnLine(const ImageData* trainingdata, bool batch); + + // Prepares the ground truth, runs forward, and prepares the targets. + // Returns a Trainability enum to indicate the suitability of the sample. + Trainability PrepareForBackward(const ImageData* trainingdata, + NetworkIO* fwd_outputs, NetworkIO* targets); + + // Writes the trainer to memory, so that the current training state can be + // restored. + bool SaveTrainingDump(SerializeAmount serialize_amount, + const LSTMTrainer* trainer, + GenericVector* data) const; + + // Reads previously saved trainer from memory. + bool ReadTrainingDump(const GenericVector& data, LSTMTrainer* trainer); + bool ReadSizedTrainingDump(const char* data, int size); + + // Sets up the data for MaintainCheckpoints from a light ReadTrainingDump. 
+ void SetupCheckpointInfo(); + + // Writes the recognizer to memory, so that it can be used for testing later. + void SaveRecognitionDump(GenericVector* data) const; + + // Reads and returns a previously saved recognizer from memory. + static LSTMRecognizer* ReadRecognitionDump(const GenericVector& data); + + // Writes current best model to a file, unless it has already been written. + bool SaveBestModel(FileWriter writer) const; + + // Returns a suitable filename for a training dump, based on the model_base_, + // the iteration and the error rates. + STRING DumpFilename() const; + + // Fills the whole error buffer of the given type with the given value. + void FillErrorBuffer(double new_error, ErrorTypes type); + + protected: + // Factored sub-constructor sets up reasonable default values. + void EmptyConstructor(); + + // Sets the unicharset properties using the given script_dir as a source of + // script unicharsets. If the flag TF_COMPRESS_UNICHARSET is true, also sets + // up the recoder_ to simplify the unicharset. + void SetUnicharsetProperties(const STRING& script_dir); + + // Outputs the string and periodically displays the given network inputs + // as an image in the given window, and the corresponding labels at the + // corresponding x_starts. + // Returns false if the truth string is empty. + bool DebugLSTMTraining(const NetworkIO& inputs, + const ImageData& trainingdata, + const NetworkIO& fwd_outputs, + const GenericVector& truth_labels, + const NetworkIO& outputs); + // Displays the network targets as line a line graph. + void DisplayTargets(const NetworkIO& targets, const char* window_name, + ScrollView** window); + + // Builds a no-compromises target where the first positions should be the + // truth labels and the rest is padded with the null_char_. + bool ComputeTextTargets(const NetworkIO& outputs, + const GenericVector& truth_labels, + NetworkIO* targets); + + // Builds a target using standard CTC. 
truth_labels should be pre-padded with + // nulls wherever desired. They don't have to be between all labels. + // outputs is input-output, as it gets clipped to minimum probability. + bool ComputeCTCTargets(const GenericVector& truth_labels, + NetworkIO* outputs, NetworkIO* targets); + + // Computes network errors, and stores the results in the rolling buffers, + // along with the supplied text_error. + // Returns the delta error of the current sample (not running average.) + double ComputeErrorRates(const NetworkIO& deltas, double char_error, + double word_error); + + // Computes the network activation RMS error rate. + double ComputeRMSError(const NetworkIO& deltas); + + // Computes network activation winner error rate. (Number of values that are + // in error by >= 0.5 divided by number of time-steps.) More closely related + // to final character error than RMS, but still directly calculable from + // just the deltas. Because of the binary nature of the targets, zero winner + // error is a sufficient but not necessary condition for zero char error. + double ComputeWinnerError(const NetworkIO& deltas); + + // Computes a very simple bag of chars char error rate. + double ComputeCharError(const GenericVector& truth_str, + const GenericVector& ocr_str); + // Computes a very simple bag of words word recall error rate. + // NOTE that this is destructive on both input strings. + double ComputeWordError(STRING* truth_str, STRING* ocr_str); + + // Updates the error buffer and corresponding mean of the given type with + // the new_error. + void UpdateErrorBuffer(double new_error, ErrorTypes type); + + // Rolls error buffers and reports the current means. + void RollErrorBuffers(); + + // Given that error_rate is either a new min or max, updates the best/worst + // error rates, and record of progress. + STRING UpdateErrorGraph(int iteration, double error_rate, + const GenericVector& model_data, + TestCallback tester); + + protected: + // Alignment display window. 
+ ScrollView* align_win_; + // CTC target display window. + ScrollView* target_win_; + // CTC output display window. + ScrollView* ctc_win_; + // Reconstructed image window. + ScrollView* recon_win_; + // How often to display a debug image. + int debug_interval_; + // Iteration at which the last checkpoint was dumped. + int checkpoint_iteration_; + // Basename of files to save best models to. + STRING model_base_; + // Checkpoint filename. + STRING checkpoint_name_; + // Training data. + DocumentCache training_data_; + // A hack to serialize less data for batch training and record file version. + mutable SerializeAmount serialize_amount_; + // Name to use when saving best_trainer_. + STRING best_model_name_; + // Number of available training stages. + int num_training_stages_; + // Checkpointing callbacks. + FileReader file_reader_; + FileWriter file_writer_; + // TODO(rays) These are pointers, and must be deleted. Switch to unique_ptr + // when we can commit to c++11. + CheckPointReader checkpoint_reader_; + CheckPointWriter checkpoint_writer_; + + // ===Serialized data to ensure that a restart produces the same results.=== + // These members are only serialized when serialize_amount_ != LIGHT. + // Best error rate so far. + double best_error_rate_; + // Snapshot of all error rates at best_iteration_. + double best_error_rates_[ET_COUNT]; + // Iteration of best_error_rate_. + int best_iteration_; + // Worst error rate since best_error_rate_. + double worst_error_rate_; + // Snapshot of all error rates at worst_iteration_. + double worst_error_rates_[ET_COUNT]; + // Iteration of worst_error_rate_. + int worst_iteration_; + // Iteration at which the process will be thought stalled. + int stall_iteration_; + // Saved recognition models for computing test error for graph points. + GenericVector best_model_data_; + GenericVector worst_model_data_; + // Saved trainer for reverting back to last known best. 
+ GenericVector<char> best_trainer_; + // A subsidiary trainer running with a different learning rate until either + // *this or sub_trainer_ hits a new best. + LSTMTrainer* sub_trainer_; + // Error rate at which last best model was dumped. + float error_rate_of_last_saved_best_; + // Current stage of training. + int training_stage_; + // History of best error rate against iteration. Used for computing the + // number of steps to each 2% improvement. + GenericVector<double> best_error_history_; + GenericVector<int> best_error_iterations_; + // Number of iterations since the best_error_rate_ was 2% more than it is now. + int improvement_steps_; + // Number of iterations that yielded a non-zero delta error and thus provided + // significant learning. learning_iteration_ <= training_iteration_. + // learning_iteration_ is used to measure rate of learning progress. + int learning_iteration_; + // Saved value of sample_iteration_ before looking for the next sample. + int prev_sample_iteration_; + // How often to include a PERFECT training sample in backprop. + // A PERFECT training sample is used if the current + // training_iteration_ > last_perfect_training_iteration_ + perfect_delay_, + // so with perfect_delay_ == 0, all samples are used, and with + // perfect_delay_ == 4, at most 1 in 5 samples will be perfect. + int perfect_delay_; + // Value of training_iteration_ at which the last PERFECT training sample + // was used in back prop. + int last_perfect_training_iteration_; + // Rolling buffers storing recent training errors are indexed by + // training_iteration % kRollingBufferSize_. + static const int kRollingBufferSize_ = 1000; + GenericVector<double> error_buffers_[ET_COUNT]; + // Rounded mean percent trailing training errors in the buffers. + double error_rates_[ET_COUNT]; // RMS training error. +}; + +} // namespace tesseract. 
+ +#endif  // TESSERACT_LSTM_LSTMTRAINER_H_ diff --git a/lstm/maxpool.cpp b/lstm/maxpool.cpp new file mode 100644 index 00000000..2164aaf5 --- /dev/null +++ b/lstm/maxpool.cpp @@ -0,0 +1,87 @@ +/////////////////////////////////////////////////////////////////////// +// File: maxpool.cpp +// Description: Standard Max-Pooling layer. +// Author: Ray Smith +// Created: Tue Mar 18 16:28:18 PST 2014 +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "maxpool.h" +#include "tprintf.h" + +namespace tesseract { + +Maxpool::Maxpool(const STRING& name, int ni, int x_scale, int y_scale) + : Reconfig(name, ni, x_scale, y_scale) { + type_ = NT_MAXPOOL; + no_ = ni; +} + +Maxpool::~Maxpool() { +} + +// Reads from the given file. Returns false in case of error. +// If swap is true, assumes a big/little-endian swap is needed. +bool Maxpool::DeSerialize(bool swap, TFile* fp) { + bool result = Reconfig::DeSerialize(swap, fp); + no_ = ni_; + return result; +} + +// Runs forward propagation of activations on the input line. +// See NetworkCpp for a detailed discussion of the arguments. 
+void Maxpool::Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output) { + output->ResizeScaled(input, x_scale_, y_scale_, no_); + maxes_.ResizeNoInit(output->Width(), ni_); + back_map_ = input.stride_map(); + + StrideMap::Index dest_index(output->stride_map()); + do { + int out_t = dest_index.t(); + StrideMap::Index src_index(input.stride_map(), dest_index.index(FD_BATCH), + dest_index.index(FD_HEIGHT) * y_scale_, + dest_index.index(FD_WIDTH) * x_scale_); + // Find the max input out of x_scale_ groups of y_scale_ inputs. + // Do it independently for each input dimension. + int* max_line = maxes_[out_t]; + int in_t = src_index.t(); + output->CopyTimeStepFrom(out_t, input, in_t); + for (int i = 0; i < ni_; ++i) { + max_line[i] = in_t; + } + for (int x = 0; x < x_scale_; ++x) { + for (int y = 0; y < y_scale_; ++y) { + StrideMap::Index src_xy(src_index); + if (src_xy.AddOffset(x, FD_WIDTH) && src_xy.AddOffset(y, FD_HEIGHT)) { + output->MaxpoolTimeStep(out_t, input, src_xy.t(), max_line); + } + } + } + } while (dest_index.Increment()); +} + +// Runs backward propagation of errors on the deltas line. +// See NetworkCpp for a detailed discussion of the arguments. +bool Maxpool::Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas) { + back_deltas->ResizeToMap(fwd_deltas.int_mode(), back_map_, ni_); + back_deltas->MaxpoolBackward(fwd_deltas, maxes_); + return true; +} + + +} // namespace tesseract. + diff --git a/lstm/maxpool.h b/lstm/maxpool.h new file mode 100644 index 00000000..1f742a9d --- /dev/null +++ b/lstm/maxpool.h @@ -0,0 +1,71 @@ +/////////////////////////////////////////////////////////////////////// +// File: maxpool.h +// Description: Standard Max-Pooling layer. +// Author: Ray Smith +// Created: Tue Mar 18 16:28:18 PST 2014 +// +// (C) Copyright 2014, Google Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_LSTM_MAXPOOL_H_ +#define TESSERACT_LSTM_MAXPOOL_H_ + +#include "reconfig.h" + +namespace tesseract { + +// Maxpooling reduction. Independently for each input, selects the location +// in the rectangle that contains the max value. +// Backprop propagates only to the position that was the max. +class Maxpool : public Reconfig { + public: + Maxpool(const STRING& name, int ni, int x_scale, int y_scale); + virtual ~Maxpool(); + + // Accessors. + virtual STRING spec() const { + STRING spec; + spec.add_str_int("Mp", y_scale_); + spec.add_str_int(",", x_scale_); + return spec; + } + + // Reads from the given file. Returns false in case of error. + // If swap is true, assumes a big/little-endian swap is needed. + virtual bool DeSerialize(bool swap, TFile* fp); + + // Runs forward propagation of activations on the input line. + // See Network for a detailed discussion of the arguments. + virtual void Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output); + + // Runs backward propagation of errors on the deltas line. + // See Network for a detailed discussion of the arguments. + virtual bool Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas); + + private: + // Memory of which input was the max. 
+ GENERIC_2D_ARRAY maxes_; +}; + + +} // namespace tesseract. + + + + + +#endif // TESSERACT_LSTM_MAXPOOL_H_ + diff --git a/lstm/network.cpp b/lstm/network.cpp new file mode 100644 index 00000000..791848ad --- /dev/null +++ b/lstm/network.cpp @@ -0,0 +1,342 @@ +/////////////////////////////////////////////////////////////////////// +// File: network.cpp +// Description: Base class for neural network implementations. +// Author: Ray Smith +// Created: Wed May 01 17:25:06 PST 2013 +// +// (C) Copyright 2013, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +// Include automatically generated configuration file if running autoconf. +#ifdef HAVE_CONFIG_H +#include "config_auto.h" +#endif + +#include "network.h" + +#include + +// This base class needs to know about all its sub-classes because of the +// factory deserializing method: CreateFromFile. +#include "allheaders.h" +#include "convolve.h" +#include "fullyconnected.h" +#include "input.h" +#include "lstm.h" +#include "maxpool.h" +#include "parallel.h" +#include "reconfig.h" +#include "reversed.h" +#include "scrollview.h" +#include "series.h" +#include "statistc.h" +#ifdef INCLUDE_TENSORFLOW +#include "tfnetwork.h" +#endif +#include "tprintf.h" + +namespace tesseract { + +// Min and max window sizes. +const int kMinWinSize = 500; +const int kMaxWinSize = 2000; +// Window frame sizes need adding on to make the content fit. 
+const int kXWinFrameSize = 30; +const int kYWinFrameSize = 80; + +// String names corresponding to the NetworkType enum. Keep in sync. +// Names used in Serialization to allow re-ordering/addition/deletion of +// layer types in NetworkType without invalidating existing network files. +char const* const Network::kTypeNames[NT_COUNT] = { + "Invalid", "Input", + "Convolve", "Maxpool", + "Parallel", "Replicated", + "ParBidiLSTM", "DepParUDLSTM", + "Par2dLSTM", "Series", + "Reconfig", "RTLReversed", + "TTBReversed", "XYTranspose", + "LSTM", "SummLSTM", + "Logistic", "LinLogistic", + "LinTanh", "Tanh", + "Relu", "Linear", + "Softmax", "SoftmaxNoCTC", + "LSTMSoftmax", "LSTMBinarySoftmax", + "TensorFlow", +}; + +Network::Network() + : type_(NT_NONE), + training_(TS_ENABLED), + needs_to_backprop_(true), + network_flags_(0), + ni_(0), + no_(0), + num_weights_(0), + forward_win_(NULL), + backward_win_(NULL), + randomizer_(NULL) {} +Network::Network(NetworkType type, const STRING& name, int ni, int no) + : type_(type), + training_(TS_ENABLED), + needs_to_backprop_(true), + network_flags_(0), + ni_(ni), + no_(no), + num_weights_(0), + name_(name), + forward_win_(NULL), + backward_win_(NULL), + randomizer_(NULL) {} + +Network::~Network() { +} + +// Suspends/Enables/Permanently disables training by setting the training_ +// flag. Serialize and DeSerialize only operate on the run-time data if state +// is TS_DISABLED or TS_TEMP_DISABLE. Specifying TS_TEMP_DISABLE will +// temporarily disable layers in state TS_ENABLED, allowing a trainer to +// serialize as if it were a recognizer. +// TS_RE_ENABLE will re-enable layers that were previously in any disabled +// state. If in TS_TEMP_DISABLE then the flag is just changed, but if in +// TS_DISABLED, the deltas in the weight matrices are reinitialized so that a +// recognizer can be converted back to a trainer. 
+void Network::SetEnableTraining(TrainingState state) { + if (state == TS_RE_ENABLE) { + training_ = TS_ENABLED; + } else { + training_ = state; + } +} + +// Sets flags that control the action of the network. See NetworkFlags enum +// for bit values. +void Network::SetNetworkFlags(uinT32 flags) { + network_flags_ = flags; +} + +// Sets up the network for training. Initializes weights using weights of +// scale `range` picked according to the random number generator `randomizer`. +int Network::InitWeights(float range, TRand* randomizer) { + randomizer_ = randomizer; + return 0; +} + +// Provides a pointer to a TRand for any networks that care to use it. +// Note that randomizer is a borrowed pointer that should outlive the network +// and should not be deleted by any of the networks. +void Network::SetRandomizer(TRand* randomizer) { + randomizer_ = randomizer; +} + +// Sets needs_to_backprop_ to needs_backprop and returns true if +// needs_backprop || any weights in this network so the next layer forward +// can be told to produce backprop for this layer if needed. +bool Network::SetupNeedsBackprop(bool needs_backprop) { + needs_to_backprop_ = needs_backprop; + return needs_backprop || num_weights_ > 0; +} + +// Writes to the given file. Returns false in case of error. 
+bool Network::Serialize(TFile* fp) const { + inT8 data = NT_NONE; + if (fp->FWrite(&data, sizeof(data), 1) != 1) return false; + STRING type_name = kTypeNames[type_]; + if (!type_name.Serialize(fp)) return false; + data = training_; + if (fp->FWrite(&data, sizeof(data), 1) != 1) return false; + data = needs_to_backprop_; + if (fp->FWrite(&data, sizeof(data), 1) != 1) return false; + if (fp->FWrite(&network_flags_, sizeof(network_flags_), 1) != 1) return false; + if (fp->FWrite(&ni_, sizeof(ni_), 1) != 1) return false; + if (fp->FWrite(&no_, sizeof(no_), 1) != 1) return false; + if (fp->FWrite(&num_weights_, sizeof(num_weights_), 1) != 1) return false; + if (!name_.Serialize(fp)) return false; + return true; +} + +// Reads from the given file. Returns false in case of error. +// If swap is true, assumes a big/little-endian swap is needed. +// Should be overridden by subclasses, but NOT called by their DeSerialize. +bool Network::DeSerialize(bool swap, TFile* fp) { + inT8 data = 0; + if (fp->FRead(&data, sizeof(data), 1) != 1) return false; + if (data == NT_NONE) { + STRING type_name; + if (!type_name.DeSerialize(swap, fp)) return false; + for (data = 0; data < NT_COUNT && type_name != kTypeNames[data]; ++data) { + } + if (data == NT_COUNT) { + tprintf("Invalid network layer type:%s\n", type_name.string()); + return false; + } + } + type_ = static_cast(data); + if (fp->FRead(&data, sizeof(data), 1) != 1) return false; + training_ = data == TS_ENABLED ? 
TS_ENABLED : TS_DISABLED; + if (fp->FRead(&data, sizeof(data), 1) != 1) return false; + needs_to_backprop_ = data != 0; + if (fp->FRead(&network_flags_, sizeof(network_flags_), 1) != 1) return false; + if (fp->FRead(&ni_, sizeof(ni_), 1) != 1) return false; + if (fp->FRead(&no_, sizeof(no_), 1) != 1) return false; + if (fp->FRead(&num_weights_, sizeof(num_weights_), 1) != 1) return false; + if (!name_.DeSerialize(swap, fp)) return false; + if (swap) { + ReverseN(&network_flags_, sizeof(network_flags_)); + ReverseN(&ni_, sizeof(ni_)); + ReverseN(&no_, sizeof(no_)); + ReverseN(&num_weights_, sizeof(num_weights_)); + } + return true; +} + +// Reads from the given file. Returns NULL in case of error. +// If swap is true, assumes a big/little-endian swap is needed. +// Determines the type of the serialized class and calls its DeSerialize +// on a new object of the appropriate type, which is returned. +Network* Network::CreateFromFile(bool swap, TFile* fp) { + Network stub; + if (!stub.DeSerialize(swap, fp)) return NULL; + Network* network = NULL; + switch (stub.type_) { + case NT_CONVOLVE: + network = new Convolve(stub.name_, stub.ni_, 0, 0); + break; + case NT_INPUT: + network = new Input(stub.name_, stub.ni_, stub.no_); + break; + case NT_LSTM: + case NT_LSTM_SOFTMAX: + case NT_LSTM_SOFTMAX_ENCODED: + case NT_LSTM_SUMMARY: + network = + new LSTM(stub.name_, stub.ni_, stub.no_, stub.no_, false, stub.type_); + break; + case NT_MAXPOOL: + network = new Maxpool(stub.name_, stub.ni_, 0, 0); + break; + // All variants of Parallel. + case NT_PARALLEL: + case NT_REPLICATED: + case NT_PAR_RL_LSTM: + case NT_PAR_UD_LSTM: + case NT_PAR_2D_LSTM: + network = new Parallel(stub.name_, stub.type_); + break; + case NT_RECONFIG: + network = new Reconfig(stub.name_, stub.ni_, 0, 0); + break; + // All variants of reversed. 
+ case NT_XREVERSED: + case NT_YREVERSED: + case NT_XYTRANSPOSE: + network = new Reversed(stub.name_, stub.type_); + break; + case NT_SERIES: + network = new Series(stub.name_); + break; + case NT_TENSORFLOW: +#ifdef INCLUDE_TENSORFLOW + network = new TFNetwork(stub.name_); +#else + tprintf("TensorFlow not compiled in! -DINCLUDE_TENSORFLOW\n"); + return NULL; +#endif + break; + // All variants of FullyConnected. + case NT_SOFTMAX: + case NT_SOFTMAX_NO_CTC: + case NT_RELU: + case NT_TANH: + case NT_LINEAR: + case NT_LOGISTIC: + case NT_POSCLIP: + case NT_SYMCLIP: + network = new FullyConnected(stub.name_, stub.ni_, stub.no_, stub.type_); + break; + default: + return NULL; + } + network->training_ = stub.training_; + network->needs_to_backprop_ = stub.needs_to_backprop_; + network->network_flags_ = stub.network_flags_; + network->num_weights_ = stub.num_weights_; + if (!network->DeSerialize(swap, fp)) { + delete network; + return NULL; + } + return network; +} + +// Returns a random number in [-range, range]. +double Network::Random(double range) { + ASSERT_HOST(randomizer_ != NULL); + return randomizer_->SignedRand(range); +} + +// === Debug image display methods. === +// Displays the image of the matrix to the forward window. +void Network::DisplayForward(const NetworkIO& matrix) { +#ifndef GRAPHICS_DISABLED // do nothing if there's no graphics + Pix* image = matrix.ToPix(); + ClearWindow(false, name_.string(), pixGetWidth(image), + pixGetHeight(image), &forward_win_); + DisplayImage(image, forward_win_); + forward_win_->Update(); +#endif // GRAPHICS_DISABLED +} + +// Displays the image of the matrix to the backward window. 
+void Network::DisplayBackward(const NetworkIO& matrix) { +#ifndef GRAPHICS_DISABLED // do nothing if there's no graphics + Pix* image = matrix.ToPix(); + STRING window_name = name_ + "-back"; + ClearWindow(false, window_name.string(), pixGetWidth(image), + pixGetHeight(image), &backward_win_); + DisplayImage(image, backward_win_); + backward_win_->Update(); +#endif // GRAPHICS_DISABLED +} + +#ifndef GRAPHICS_DISABLED +// Creates the window if needed, otherwise clears it. +void Network::ClearWindow(bool tess_coords, const char* window_name, + int width, int height, ScrollView** window) { + if (*window == NULL) { + int min_size = MIN(width, height); + if (min_size < kMinWinSize) { + if (min_size < 1) min_size = 1; + width = width * kMinWinSize / min_size; + height = height * kMinWinSize / min_size; + } + width += kXWinFrameSize; + height += kYWinFrameSize; + if (width > kMaxWinSize) width = kMaxWinSize; + if (height > kMaxWinSize) height = kMaxWinSize; + *window = new ScrollView(window_name, 80, 100, width, height, width, height, + tess_coords); + tprintf("Created window %s of size %d, %d\n", window_name, width, height); + } else { + (*window)->Clear(); + } +} + +// Displays the pix in the given window. and returns the height of the pix. +// The pix is pixDestroyed. +int Network::DisplayImage(Pix* pix, ScrollView* window) { + int height = pixGetHeight(pix); + window->Image(pix, 0, 0); + pixDestroy(&pix); + return height; +} +#endif // GRAPHICS_DISABLED + +} // namespace tesseract. diff --git a/lstm/network.h b/lstm/network.h new file mode 100644 index 00000000..db38b182 --- /dev/null +++ b/lstm/network.h @@ -0,0 +1,307 @@ +/////////////////////////////////////////////////////////////////////// +// File: network.h +// Description: Base class for neural network implementations. +// Author: Ray Smith +// Created: Wed May 01 16:38:06 PST 2013 +// +// (C) Copyright 2013, Google Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_LSTM_NETWORK_H_ +#define TESSERACT_LSTM_NETWORK_H_ + +#include +#include + +#include "genericvector.h" +#include "helpers.h" +#include "matrix.h" +#include "networkio.h" +#include "serialis.h" +#include "static_shape.h" +#include "tprintf.h" + +struct Pix; +class ScrollView; +class TBOX; + +namespace tesseract { + +class ImageData; +class NetworkScratch; + +// Enum to store the run-time type of a Network. Keep in sync with kTypeNames. +enum NetworkType { + NT_NONE, // The naked base class. + NT_INPUT, // Inputs from an image. + // Plumbing networks combine other networks or rearrange the inputs. + NT_CONVOLVE, // Duplicates inputs in a sliding window neighborhood. + NT_MAXPOOL, // Chooses the max result from a rectangle. + NT_PARALLEL, // Runs networks in parallel. + NT_REPLICATED, // Runs identical networks in parallel. + NT_PAR_RL_LSTM, // Runs LTR and RTL LSTMs in parallel. + NT_PAR_UD_LSTM, // Runs Up and Down LSTMs in parallel. + NT_PAR_2D_LSTM, // Runs 4 LSTMs in parallel. + NT_SERIES, // Executes a sequence of layers. + NT_RECONFIG, // Scales the time/y size but makes the output deeper. + NT_XREVERSED, // Reverses the x direction of the inputs/outputs. + NT_YREVERSED, // Reverses the y-direction of the inputs/outputs. + NT_XYTRANSPOSE, // Transposes x and y (for just a single op). 
+ // Functional networks actually calculate stuff. + NT_LSTM, // Long-Short-Term-Memory block. + NT_LSTM_SUMMARY, // LSTM that only keeps its last output. + NT_LOGISTIC, // Fully connected logistic nonlinearity. + NT_POSCLIP, // Fully connected rect lin version of logistic. + NT_SYMCLIP, // Fully connected rect lin version of tanh. + NT_TANH, // Fully connected with tanh nonlinearity. + NT_RELU, // Fully connected with rectifier nonlinearity. + NT_LINEAR, // Fully connected with no nonlinearity. + NT_SOFTMAX, // Softmax uses exponential normalization, with CTC. + NT_SOFTMAX_NO_CTC, // Softmax uses exponential normalization, no CTC. + // The SOFTMAX LSTMs both have an extra softmax layer on top, but inside, with + // the outputs fed back to the input of the LSTM at the next timestep. + // The ENCODED version binary encodes the softmax outputs, providing log2 of + // the number of outputs as additional inputs, and the other version just + // provides all the softmax outputs as additional inputs. + NT_LSTM_SOFTMAX, // 1-d LSTM with built-in fully connected softmax. + NT_LSTM_SOFTMAX_ENCODED, // 1-d LSTM with built-in binary encoded softmax. + // A TensorFlow graph encapsulated as a Tesseract network. + NT_TENSORFLOW, + + NT_COUNT // Array size. +}; + +// Enum of Network behavior flags. Can in theory be set for each individual +// network element. +enum NetworkFlags { + // Network forward/backprop behavior. + NF_LAYER_SPECIFIC_LR = 64, // Separate learning rate for each layer. + NF_ADA_GRAD = 128, // Weight-specific learning rate. +}; + +// State of training and desired state used in SetEnableTraining. +enum TrainingState { + // Valid states of training_. + TS_DISABLED, // Disabled permanently. + TS_ENABLED, // Enabled for backprop and to write a training dump. + TS_TEMP_DISABLE, // Temporarily disabled to write a recognition dump. + // Valid only for SetEnableTraining. + TS_RE_ENABLE, // Re-Enable whatever the current state. +}; + +// Base class for network types. 
Not quite an abstract base class, but almost. +// Most of the time no isolated Network exists, except prior to +// deserialization. +class Network { + public: + Network(); + Network(NetworkType type, const STRING& name, int ni, int no); + virtual ~Network(); + + // Accessors. + NetworkType type() const { + return type_; + } + bool IsTraining() const { return training_ == TS_ENABLED; } + bool needs_to_backprop() const { + return needs_to_backprop_; + } + int num_weights() const { return num_weights_; } + int NumInputs() const { + return ni_; + } + int NumOutputs() const { + return no_; + } + // Returns the required shape input to the network. + virtual StaticShape InputShape() const { + StaticShape result; + return result; + } + // Returns the shape output from the network given an input shape (which may + // be partially unknown ie zero). + virtual StaticShape OutputShape(const StaticShape& input_shape) const { + StaticShape result(input_shape); + result.set_depth(no_); + return result; + } + const STRING& name() const { + return name_; + } + virtual STRING spec() const { + return "?"; + } + bool TestFlag(NetworkFlags flag) const { + return (network_flags_ & flag) != 0; + } + + // Initialization and administrative functions that are mostly provided + // by Plumbing. + // Returns true if the given type is derived from Plumbing, and thus contains + // multiple sub-networks that can have their own learning rate. + virtual bool IsPlumbingType() const { return false; } + + // Suspends/Enables/Permanently disables training by setting the training_ + // flag. Serialize and DeSerialize only operate on the run-time data if state + // is TS_DISABLED or TS_TEMP_DISABLE. Specifying TS_TEMP_DISABLE will + // temporarily disable layers in state TS_ENABLED, allowing a trainer to + // serialize as if it were a recognizer. + // TS_RE_ENABLE will re-enable layers that were previously in any disabled + // state. 
If in TS_TEMP_DISABLE then the flag is just changed, but if in + // TS_DISABLED, the deltas in the weight matrices are reinitialized so that a + // recognizer can be converted back to a trainer. + virtual void SetEnableTraining(TrainingState state); + + // Sets flags that control the action of the network. See NetworkFlags enum + // for bit values. + virtual void SetNetworkFlags(uinT32 flags); + + // Sets up the network for training. Initializes weights using weights of + // scale `range` picked according to the random number generator `randomizer`. + // Note that randomizer is a borrowed pointer that should outlive the network + // and should not be deleted by any of the networks. + // Returns the number of weights initialized. + virtual int InitWeights(float range, TRand* randomizer); + + // Converts a float network to an int network. + virtual void ConvertToInt() {} + + // Provides a pointer to a TRand for any networks that care to use it. + // Note that randomizer is a borrowed pointer that should outlive the network + // and should not be deleted by any of the networks. + virtual void SetRandomizer(TRand* randomizer); + + // Sets needs_to_backprop_ to needs_backprop and returns true if + // needs_backprop || any weights in this network so the next layer forward + // can be told to produce backprop for this layer if needed. + virtual bool SetupNeedsBackprop(bool needs_backprop); + + // Returns the most recent reduction factor that the network applied to the + // time sequence. Assumes that any 2-d is already eliminated. Used for + // scaling bounding boxes of truth data and calculating result bounding boxes. + // WARNING: if GlobalMinimax is used to vary the scale, this will return + // the last used scale factor. Call it before any forward, and it will return + // the minimum scale factor of the paths through the GlobalMinimax. 
+ virtual int XScaleFactor() const { + return 1; + } + + // Provides the (minimum) x scale factor to the network (of interest only to + // input units) so they can determine how to scale bounding boxes. + virtual void CacheXScaleFactor(int factor) {} + + // Provides debug output on the weights. + virtual void DebugWeights() { + tprintf("Must override Network::DebugWeights for type %d\n", type_); + } + + // Writes to the given file. Returns false in case of error. + // Should be overridden by subclasses, but called by their Serialize. + virtual bool Serialize(TFile* fp) const; + // Reads from the given file. Returns false in case of error. + // If swap is true, assumes a big/little-endian swap is needed. + // Should be overridden by subclasses, but NOT called by their DeSerialize. + virtual bool DeSerialize(bool swap, TFile* fp); + + // Updates the weights using the given learning rate and momentum. + // num_samples is the quotient to be used in the adagrad computation iff + // use_ada_grad_ is true. + virtual void Update(float learning_rate, float momentum, int num_samples) {} + // Sums the products of weight updates in *this and other, splitting into + // positive (same direction) in *same and negative (different direction) in + // *changed. + virtual void CountAlternators(const Network& other, double* same, + double* changed) const {} + + // Reads from the given file. Returns NULL in case of error. + // If swap is true, assumes a big/little-endian swap is needed. + // Determines the type of the serialized class and calls its DeSerialize + // on a new object of the appropriate type, which is returned. + static Network* CreateFromFile(bool swap, TFile* fp); + + // Runs forward propagation of activations on the input line. + // Note that input and output are both 2-d arrays. + // The 1st index is the time element. In a 1-d network, it might be the pixel + // position on the textline. In a 2-d network, the linearization is defined + // by the stride_map. 
(See networkio.h). + // The 2nd index of input is the network inputs/outputs, and the dimension + // of the input must match NumInputs() of this network. + // The output array will be resized as needed so that its 1st dimension is + // always equal to the number of output values, and its second dimension is + // always NumOutputs(). Note that all this detail is encapsulated away inside + // NetworkIO, as are the internals of the scratch memory space used by the + // network. See networkscratch.h for that. + // If input_transpose is not NULL, then it contains the transpose of input, + // and the caller guarantees that it will still be valid on the next call to + // backward. The callee is therefore at liberty to save the pointer and + // reference it on a call to backward. This is a bit ugly, but it makes it + // possible for a replicating parallel to calculate the input transpose once + // instead of all the replicated networks having to do it. + virtual void Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output) { + tprintf("Must override Network::Forward for type %d\n", type_); + } + + // Runs backward propagation of errors on fwd_deltas. + // Note that fwd_deltas and back_deltas are both 2-d arrays as with Forward. + // Returns false if back_deltas was not set, due to there being no point in + // propagating further backwards. Thus most complete networks will always + // return false from Backward! + virtual bool Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas) { + tprintf("Must override Network::Backward for type %d\n", type_); + return false; + } + + // === Debug image display methods. === + // Displays the image of the matrix to the forward window. + void DisplayForward(const NetworkIO& matrix); + // Displays the image of the matrix to the backward window. 
+ void DisplayBackward(const NetworkIO& matrix); + + // Creates the window if needed, otherwise clears it. + static void ClearWindow(bool tess_coords, const char* window_name, + int width, int height, ScrollView** window); + + // Displays the pix in the given window. and returns the height of the pix. + // The pix is pixDestroyed. + static int DisplayImage(Pix* pix, ScrollView* window); + + protected: + // Returns a random number in [-range, range]. + double Random(double range); + + protected: + NetworkType type_; // Type of the derived network class. + TrainingState training_; // Are we currently training? + bool needs_to_backprop_; // This network needs to output back_deltas. + inT32 network_flags_; // Behavior control flags in NetworkFlags. + inT32 ni_; // Number of input values. + inT32 no_; // Number of output values. + inT32 num_weights_; // Number of weights in this and sub-network. + STRING name_; // A unique name for this layer. + + // NOT-serialized debug data. + ScrollView* forward_win_; // Recognition debug display window. + ScrollView* backward_win_; // Training debug display window. + TRand* randomizer_; // Random number generator. + + // Static serialized name/type_ mapping. Keep in sync with NetworkType. + static char const* const kTypeNames[NT_COUNT]; +}; + + +} // namespace tesseract. + +#endif // TESSERACT_LSTM_NETWORK_H_ diff --git a/lstm/networkbuilder.cpp b/lstm/networkbuilder.cpp new file mode 100644 index 00000000..053e092b --- /dev/null +++ b/lstm/networkbuilder.cpp @@ -0,0 +1,488 @@ +/////////////////////////////////////////////////////////////////////// +// File: networkbuilder.h +// Description: Class to parse the network description language and +// build a corresponding network. +// Author: Ray Smith +// Created: Wed Jul 16 18:35:38 PST 2014 +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "networkbuilder.h" +#include "convolve.h" +#include "fullyconnected.h" +#include "input.h" +#include "lstm.h" +#include "maxpool.h" +#include "network.h" +#include "parallel.h" +#include "reconfig.h" +#include "reversed.h" +#include "series.h" +#include "unicharset.h" + +namespace tesseract { + +// Builds a network with a network_spec in the network description +// language, to recognize a character set of num_outputs size. +// If append_index is non-negative, then *network must be non-null and the +// given network_spec will be appended to *network AFTER append_index, with +// the top of the input *network discarded. +// Note that network_spec is call by value to allow a non-const char* pointer +// into the string for BuildFromString. +// net_flags control network behavior according to the NetworkFlags enum. +// The resulting network is returned via **network. +// Returns false if something failed. +bool NetworkBuilder::InitNetwork(int num_outputs, STRING network_spec, + int append_index, int net_flags, + float weight_range, TRand* randomizer, + Network** network) { + NetworkBuilder builder(num_outputs); + Series* bottom_series = NULL; + StaticShape input_shape; + if (append_index >= 0) { + // Split the current network after the given append_index. 
+ ASSERT_HOST(*network != NULL && (*network)->type() == NT_SERIES); + Series* series = reinterpret_cast<Series*>(*network); + Series* top_series = NULL; + series->SplitAt(append_index, &bottom_series, &top_series); + if (bottom_series == NULL || top_series == NULL) { + tprintf("Yikes! Splitting current network failed!!\n"); + return false; + } + input_shape = bottom_series->OutputShape(input_shape); + delete top_series; + } + char* str_ptr = &network_spec[0]; + *network = builder.BuildFromString(input_shape, &str_ptr); + if (*network == NULL) return false; + (*network)->SetNetworkFlags(net_flags); + (*network)->InitWeights(weight_range, randomizer); + (*network)->SetupNeedsBackprop(false); + if (bottom_series != NULL) { + bottom_series->AppendSeries(*network); + *network = bottom_series; + } + (*network)->CacheXScaleFactor((*network)->XScaleFactor()); + return true; +} + +// Helper skips whitespace. +static void SkipWhitespace(char** str) { + while (**str == ' ' || **str == '\t' || **str == '\n') ++*str; +} + +// Parses the given string and returns a network according to the network +// description language in networkbuilder.h +Network* NetworkBuilder::BuildFromString(const StaticShape& input_shape, + char** str) { + SkipWhitespace(str); + char code_ch = **str; + if (code_ch == '[') { + return ParseSeries(input_shape, nullptr, str); + } + if (input_shape.depth() == 0) { + // There must be an input at this point. 
+ return ParseInput(str); + } + switch (code_ch) { + case '(': + return ParseParallel(input_shape, str); + case 'R': + return ParseR(input_shape, str); + case 'S': + return ParseS(input_shape, str); + case 'C': + return ParseC(input_shape, str); + case 'M': + return ParseM(input_shape, str); + case 'L': + return ParseLSTM(input_shape, str); + case 'F': + return ParseFullyConnected(input_shape, str); + case 'O': + return ParseOutput(input_shape, str); + default: + tprintf("Invalid network spec:%s\n", *str); + return nullptr; + } + return nullptr; +} + +// Parses an input specification and returns the result, which may include a +// series. +Network* NetworkBuilder::ParseInput(char** str) { + // There must be an input at this point. + int length = 0; + int batch, height, width, depth; + int num_converted = + sscanf(*str, "%d,%d,%d,%d%n", &batch, &height, &width, &depth, &length); + StaticShape shape; + shape.SetShape(batch, height, width, depth); + // num_converted may or may not include the length. + if (num_converted != 4 && num_converted != 5) { + tprintf("Must specify an input layer as the first layer, not %s!!\n", *str); + return nullptr; + } + *str += length; + Input* input = new Input("Input", shape); + // We want to allow [rest of net... or [rest of net... so we + // have to check explicitly for '[' here. + SkipWhitespace(str); + if (**str == '[') return ParseSeries(shape, input, str); + return input; +} + +// Parses a sequential series of networks, defined by [...]. 
+Network* NetworkBuilder::ParseSeries(const StaticShape& input_shape, + Input* input_layer, char** str) { + StaticShape shape = input_shape; + Series* series = new Series("Series"); + ++*str; + if (input_layer != nullptr) { + series->AddToStack(input_layer); + shape = input_layer->OutputShape(shape); + } + Network* network = NULL; + while (**str != '\0' && **str != ']' && + (network = BuildFromString(shape, str)) != NULL) { + shape = network->OutputShape(shape); + series->AddToStack(network); + } + if (**str != ']') { + tprintf("Missing ] at end of [Series]!\n"); + delete series; + return NULL; + } + ++*str; + return series; +} + +// Parses a parallel set of networks, defined by (...). +Network* NetworkBuilder::ParseParallel(const StaticShape& input_shape, + char** str) { + Parallel* parallel = new Parallel("Parallel", NT_PARALLEL); + ++*str; + Network* network = NULL; + while (**str != '\0' && **str != ')' && + (network = BuildFromString(input_shape, str)) != NULL) { + parallel->AddToStack(network); + } + if (**str != ')') { + tprintf("Missing ) at end of (Parallel)!\n"); + delete parallel; + return nullptr; + } + ++*str; + return parallel; +} + +// Parses a network that begins with 'R'. +Network* NetworkBuilder::ParseR(const StaticShape& input_shape, char** str) { + char dir = (*str)[1]; + if (dir == 'x' || dir == 'y') { + STRING name = "Reverse"; + name += dir; + *str += 2; + Network* network = BuildFromString(input_shape, str); + if (network == nullptr) return nullptr; + Reversed* rev = + new Reversed(name, dir == 'y' ? 
NT_YREVERSED : NT_XREVERSED); + rev->SetNetwork(network); + return rev; + } + int replicas = strtol(*str + 1, str, 10); + if (replicas <= 0) { + tprintf("Invalid R spec!:%s\n", *str); + return nullptr; + } + Parallel* parallel = new Parallel("Replicated", NT_REPLICATED); + char* str_copy = *str; + for (int i = 0; i < replicas; ++i) { + str_copy = *str; + Network* network = BuildFromString(input_shape, &str_copy); + if (network == NULL) { + tprintf("Invalid replicated network!\n"); + delete parallel; + return nullptr; + } + parallel->AddToStack(network); + } + *str = str_copy; + return parallel; +} + +// Parses a network that begins with 'S'. +Network* NetworkBuilder::ParseS(const StaticShape& input_shape, char** str) { + int y = strtol(*str + 1, str, 10); + if (**str == ',') { + int x = strtol(*str + 1, str, 10); + if (y <= 0 || x <= 0) { + tprintf("Invalid S spec!:%s\n", *str); + return nullptr; + } + return new Reconfig("Reconfig", input_shape.depth(), x, y); + } else if (**str == '(') { + // TODO(rays) Add Generic reshape. + tprintf("Generic reshape not yet implemented!!\n"); + return nullptr; + } + tprintf("Invalid S spec!:%s\n", *str); + return nullptr; +} + +// Helper returns the fully-connected type for the character code. +static NetworkType NonLinearity(char func) { + switch (func) { + case 's': + return NT_LOGISTIC; + case 't': + return NT_TANH; + case 'r': + return NT_RELU; + case 'l': + return NT_LINEAR; + case 'm': + return NT_SOFTMAX; + case 'p': + return NT_POSCLIP; + case 'n': + return NT_SYMCLIP; + default: + return NT_NONE; + } +} + +// Parses a network that begins with 'C'. 
+Network* NetworkBuilder::ParseC(const StaticShape& input_shape, char** str) { + NetworkType type = NonLinearity((*str)[1]); + if (type == NT_NONE) { + tprintf("Invalid nonlinearity on C-spec!: %s\n", *str); + return nullptr; + } + int y = 0, x = 0, d = 0; + if ((y = strtol(*str + 2, str, 10)) <= 0 || **str != ',' || + (x = strtol(*str + 1, str, 10)) <= 0 || **str != ',' || + (d = strtol(*str + 1, str, 10)) <= 0) { + tprintf("Invalid C spec!:%s\n", *str); + return nullptr; + } + if (x == 1 && y == 1) { + // No actual convolution. Just a FullyConnected on the current depth, to + // be slid over all batch,y,x. + return new FullyConnected("Conv1x1", input_shape.depth(), d, type); + } + Series* series = new Series("ConvSeries"); + Convolve* convolve = + new Convolve("Convolve", input_shape.depth(), x / 2, y / 2); + series->AddToStack(convolve); + StaticShape fc_input = convolve->OutputShape(input_shape); + series->AddToStack(new FullyConnected("ConvNL", fc_input.depth(), d, type)); + return series; +} + +// Parses a network that begins with 'M'. +Network* NetworkBuilder::ParseM(const StaticShape& input_shape, char** str) { + int y = 0, x = 0; + if ((*str)[1] != 'p' || (y = strtol(*str + 2, str, 10)) <= 0 || + **str != ',' || (x = strtol(*str + 1, str, 10)) <= 0) { + tprintf("Invalid Mp spec!:%s\n", *str); + return nullptr; + } + return new Maxpool("Maxpool", input_shape.depth(), x, y); +} + +// Parses an LSTM network, either individual, bi- or quad-directional. 
+Network* NetworkBuilder::ParseLSTM(const StaticShape& input_shape, char** str) { + bool two_d = false; + NetworkType type = NT_LSTM; + char* spec_start = *str; + int chars_consumed = 1; + int num_outputs = 0; + char key = (*str)[chars_consumed], dir = 'f', dim = 'x'; + if (key == 'S') { + type = NT_LSTM_SOFTMAX; + num_outputs = num_softmax_outputs_; + ++chars_consumed; + } else if (key == 'E') { + type = NT_LSTM_SOFTMAX_ENCODED; + num_outputs = num_softmax_outputs_; + ++chars_consumed; + } else if (key == '2' && (((*str)[2] == 'x' && (*str)[3] == 'y') || + ((*str)[2] == 'y' && (*str)[3] == 'x'))) { + chars_consumed = 4; + dim = (*str)[3]; + two_d = true; + } else if (key == 'f' || key == 'r' || key == 'b') { + dir = key; + dim = (*str)[2]; + if (dim != 'x' && dim != 'y') { + tprintf("Invalid dimension (x|y) in L Spec!:%s\n", *str); + return nullptr; + } + chars_consumed = 3; + if ((*str)[chars_consumed] == 's') { + ++chars_consumed; + type = NT_LSTM_SUMMARY; + } + } else { + tprintf("Invalid direction (f|r|b) in L Spec!:%s\n", *str); + return nullptr; + } + int num_states = strtol(*str + chars_consumed, str, 10); + if (num_states <= 0) { + tprintf("Invalid number of states in L Spec!:%s\n", *str); + return nullptr; + } + Network* lstm = nullptr; + if (two_d) { + lstm = BuildLSTMXYQuad(input_shape.depth(), num_states); + } else { + if (num_outputs == 0) num_outputs = num_states; + STRING name(spec_start, *str - spec_start); + lstm = new LSTM(name, input_shape.depth(), num_states, num_outputs, false, + type); + if (dir != 'f') { + Reversed* rev = new Reversed("RevLSTM", NT_XREVERSED); + rev->SetNetwork(lstm); + lstm = rev; + } + if (dir == 'b') { + name += "LTR"; + Parallel* parallel = new Parallel("BidiLSTM", NT_PAR_RL_LSTM); + parallel->AddToStack(new LSTM(name, input_shape.depth(), num_states, + num_outputs, false, type)); + parallel->AddToStack(lstm); + lstm = parallel; + } + } + if (dim == 'y') { + Reversed* rev = new Reversed("XYTransLSTM", NT_XYTRANSPOSE); + 
rev->SetNetwork(lstm); + lstm = rev; + } + return lstm; +} + +// Builds a set of 4 lstms with x and y reversal, running in true parallel. +Network* NetworkBuilder::BuildLSTMXYQuad(int num_inputs, int num_states) { + Parallel* parallel = new Parallel("2DLSTMQuad", NT_PAR_2D_LSTM); + parallel->AddToStack(new LSTM("L2DLTRDown", num_inputs, num_states, + num_states, true, NT_LSTM)); + Reversed* rev = new Reversed("L2DLTRXRev", NT_XREVERSED); + rev->SetNetwork(new LSTM("L2DRTLDown", num_inputs, num_states, num_states, + true, NT_LSTM)); + parallel->AddToStack(rev); + rev = new Reversed("L2DRTLYRev", NT_YREVERSED); + rev->SetNetwork( + new LSTM("L2DRTLUp", num_inputs, num_states, num_states, true, NT_LSTM)); + Reversed* rev2 = new Reversed("L2DXRevU", NT_XREVERSED); + rev2->SetNetwork(rev); + parallel->AddToStack(rev2); + rev = new Reversed("L2DXRevY", NT_YREVERSED); + rev->SetNetwork(new LSTM("L2DLTRDown", num_inputs, num_states, num_states, + true, NT_LSTM)); + parallel->AddToStack(rev); + return parallel; +} + +// Helper builds a truly (0-d) fully connected layer of the given type. +static Network* BuildFullyConnected(const StaticShape& input_shape, + NetworkType type, const STRING& name, + int depth) { + if (input_shape.height() == 0 || input_shape.width() == 0) { + tprintf("Fully connected requires positive height and width, had %d,%d\n", + input_shape.height(), input_shape.width()); + return nullptr; + } + int input_size = input_shape.height() * input_shape.width(); + int input_depth = input_size * input_shape.depth(); + Network* fc = new FullyConnected(name, input_depth, depth, type); + if (input_size > 1) { + Series* series = new Series("FCSeries"); + series->AddToStack(new Reconfig("FCReconfig", input_shape.depth(), + input_shape.width(), input_shape.height())); + series->AddToStack(fc); + fc = series; + } + return fc; +} + +// Parses a Fully connected network. 
+Network* NetworkBuilder::ParseFullyConnected(const StaticShape& input_shape, + char** str) { + char* spec_start = *str; + NetworkType type = NonLinearity((*str)[1]); + if (type == NT_NONE) { + tprintf("Invalid nonlinearity on F-spec!: %s\n", *str); + return nullptr; + } + int depth = strtol(*str + 1, str, 10); + if (depth <= 0) { + tprintf("Invalid F spec!:%s\n", *str); + return nullptr; + } + STRING name(spec_start, *str - spec_start); + return BuildFullyConnected(input_shape, type, name, depth); +} + +// Parses an Output spec. +Network* NetworkBuilder::ParseOutput(const StaticShape& input_shape, + char** str) { + char dims_ch = (*str)[1]; + if (dims_ch != '0' && dims_ch != '1' && dims_ch != '2') { + tprintf("Invalid dims (2|1|0) in output spec!:%s\n", *str); + return nullptr; + } + char type_ch = (*str)[2]; + if (type_ch != 'l' && type_ch != 's' && type_ch != 'c') { + tprintf("Invalid output type (l|s|c) in output spec!:%s\n", *str); + return nullptr; + } + int depth = strtol(*str + 3, str, 10); + if (depth != num_softmax_outputs_) { + tprintf("Warning: given outputs %d not equal to unicharset of %d.\n", depth, + num_softmax_outputs_); + depth = num_softmax_outputs_; + } + NetworkType type = NT_SOFTMAX; + if (type_ch == 'l') + type = NT_LOGISTIC; + else if (type_ch == 's') + type = NT_SOFTMAX_NO_CTC; + if (dims_ch == '0') { + // Same as standard fully connected. + return BuildFullyConnected(input_shape, type, "Output", depth); + } else if (dims_ch == '2') { + // We don't care if x and/or y are variable. + return new FullyConnected("Output2d", input_shape.depth(), depth, type); + } + // For 1-d y has to be fixed, and if not 1, moved to depth. 
+ if (input_shape.height() == 0) { + tprintf("Fully connected requires fixed height!\n"); + return nullptr; + } + int input_size = input_shape.height(); + int input_depth = input_size * input_shape.depth(); + Network* fc = new FullyConnected("Output", input_depth, depth, type); + if (input_size > 1) { + Series* series = new Series("FCSeries"); + series->AddToStack(new Reconfig("FCReconfig", input_shape.depth(), 1, + input_shape.height())); + series->AddToStack(fc); + fc = series; + } + return fc; +} + +} // namespace tesseract. + diff --git a/lstm/networkbuilder.h b/lstm/networkbuilder.h new file mode 100644 index 00000000..a405fc52 --- /dev/null +++ b/lstm/networkbuilder.h @@ -0,0 +1,160 @@ +/////////////////////////////////////////////////////////////////////// +// File: networkbuilder.h +// Description: Class to parse the network description language and +// build a corresponding network. +// Author: Ray Smith +// Created: Wed Jul 16 18:35:38 PST 2014 +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_LSTM_NETWORKBUILDER_H_ +#define TESSERACT_LSTM_NETWORKBUILDER_H_ + +#include "static_shape.h" +#include "stridemap.h" + +class STRING; +class UNICHARSET; + +namespace tesseract { + +class Input; +class Network; +class Parallel; +class TRand; + +class NetworkBuilder { + public: + explicit NetworkBuilder(int num_softmax_outputs) + : num_softmax_outputs_(num_softmax_outputs) {} + + // Builds a network with a network_spec in the network description + // language, to recognize a character set of num_outputs size. + // If append_index is non-negative, then *network must be non-null and the + // given network_spec will be appended to *network AFTER append_index, with + // the top of the input *network discarded. + // Note that network_spec is call by value to allow a non-const char* pointer + // into the string for BuildFromString. + // net_flags control network behavior according to the NetworkFlags enum. + // The resulting network is returned via **network. + // Returns false if something failed. + static bool InitNetwork(int num_outputs, STRING network_spec, + int append_index, int net_flags, float weight_range, + TRand* randomizer, Network** network); + + // Parses the given string and returns a network according to the following + // language: + // ============ Syntax of description below: ============ + // represents a number. + // represents any single network element, including (recursively) a + // [...] series or (...) parallel construct. + // (s|t|r|l|m) (regex notation) represents a single required letter. + // NOTE THAT THROUGHOUT, x and y are REVERSED from conventional mathematics, + // to use the same convention as Tensor Flow. 
The reason TF adopts this + // convention is to eliminate the need to transpose images on input, since + // adjacent memory locations in images increase x and then y, while adjacent + // memory locations in tensors in TF, and NetworkIO in tesseract increase the + // rightmost index first, then the next-left and so-on, like C arrays. + // ============ INPUTS ============ + // ,,, A batch of b images with height h, width w, and depth d. + // b, h and/or w may be zero, to indicate variable size. Some network layer + // (summarizing LSTM) must be used to make a variable h known. + // d may be 1 for greyscale, 3 for color. + // NOTE that throughout the constructed network, the inputs/outputs are all of + // the same [batch,height,width,depth] dimensions, even if a different size. + // ============ PLUMBING ============ + // [...] Execute ... networks in series (layers). + // (...) Execute ... networks in parallel, with their output depths added. + // R Execute d replicas of net in parallel, with their output depths + // added. + // Rx Execute with x-dimension reversal. + // Ry Execute with y-dimension reversal. + // S, Rescale 2-D input by shrink factor x,y, rearranging the data by + // increasing the depth of the input by factor xy. + // Mp, Maxpool the input, reducing the size by an (x,y) rectangle. + // ============ FUNCTIONAL UNITS ============ + // C(s|t|r|l|m),, Convolves using a (x,y) window, with no shrinkage, + // random infill, producing d outputs, then applies a non-linearity: + // s: Sigmoid, t: Tanh, r: Relu, l: Linear, m: Softmax. + // F(s|t|r|l|m) Truly fully-connected with s|t|r|l|m non-linearity and d + // outputs. Connects to every x,y,depth position of the input, reducing + // height, width to 1, producing a single vector as the output. + // Input height and width must be constant. + // For a sliding-window linear or non-linear map that connects just to the + // input depth, and leaves the input image size as-is, use a 1x1 convolution + // eg. 
Cr1,1,64 instead of Fr64. + // L(f|r|b)(x|y)[s] LSTM cell with n states/outputs. + // The LSTM must have one of: + // f runs the LSTM forward only. + // r runs the LSTM reversed only. + // b runs the LSTM bidirectionally. + // It will operate on either the x- or y-dimension, treating the other + // dimension independently (as if part of the batch). + // s (optional) summarizes the output in the requested dimension, + // outputting only the final step, collapsing the dimension to a + // single element. + // LS Forward-only LSTM cell in the x-direction, with built-in Softmax. + // LE Forward-only LSTM cell in the x-direction, with built-in softmax, + // with binary Encoding. + // L2xy Full 2-d LSTM operating in quad-directions (bidi in x and y) and + // all the output depths added. + // ============ OUTPUTS ============ + // The network description must finish with an output specification: + // O(2|1|0)(l|s|c) output layer with n classes + // 2 (heatmap) Output is a 2-d vector map of the input (possibly at + // different scale). + // 1 (sequence) Output is a 1-d sequence of vector values. + // 0 (category) Output is a 0-d single vector value. + // l uses a logistic non-linearity on the output, allowing multiple + // hot elements in any output vector value. + // s uses a softmax non-linearity, with one-hot output in each value. + // c uses a softmax with CTC. Can only be used with s (sequence). + // NOTE1: Only O1s and O1c are currently supported. + // NOTE2: n is totally ignored, and for compatibility purposes only. The + // output number of classes is obtained automatically from the + // unicharset. + Network* BuildFromString(const StaticShape& input_shape, char** str); + + private: + // Parses an input specification and returns the result, which may include a + // series. + Network* ParseInput(char** str); + // Parses a sequential series of networks, defined by [...]. 
+ Network* ParseSeries(const StaticShape& input_shape, Input* input_layer, + char** str); + // Parses a parallel set of networks, defined by (...). + Network* ParseParallel(const StaticShape& input_shape, char** str); + // Parses a network that begins with 'R'. + Network* ParseR(const StaticShape& input_shape, char** str); + // Parses a network that begins with 'S'. + Network* ParseS(const StaticShape& input_shape, char** str); + // Parses a network that begins with 'C'. + Network* ParseC(const StaticShape& input_shape, char** str); + // Parses a network that begins with 'M'. + Network* ParseM(const StaticShape& input_shape, char** str); + // Parses an LSTM network, either individual, bi- or quad-directional. + Network* ParseLSTM(const StaticShape& input_shape, char** str); + // Builds a set of 4 lstms with x and y reversal, running in true parallel. + static Network* BuildLSTMXYQuad(int num_inputs, int num_states); + // Parses a Fully connected network. + Network* ParseFullyConnected(const StaticShape& input_shape, char** str); + // Parses an Output spec. + Network* ParseOutput(const StaticShape& input_shape, char** str); + + private: + int num_softmax_outputs_; +}; + +} // namespace tesseract. + +#endif  // TESSERACT_LSTM_NETWORKBUILDER_H_ diff --git a/lstm/networkio.cpp b/lstm/networkio.cpp new file mode 100644 index 00000000..92edff2a --- /dev/null +++ b/lstm/networkio.cpp @@ -0,0 +1,979 @@ +/////////////////////////////////////////////////////////////////////// +// File: networkio.cpp +// Description: Network input/output data, allowing float/int implementations. +// Author: Ray Smith +// Created: Thu Jun 19 13:01:31 PST 2014 +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "networkio.h" + +#include "allheaders.h" +#include "functions.h" +#include "statistc.h" +#include "tprintf.h" + +namespace tesseract { + +// Minimum value to output for certainty. +const float kMinCertainty = -20.0f; +// Probability corresponding to kMinCertainty. +const float kMinProb = exp(kMinCertainty); + +// Resizes to a specific size as a 2-d temp buffer. No batches, no y-dim. +void NetworkIO::Resize2d(bool int_mode, int width, int num_features) { + stride_map_ = StrideMap(); + int_mode_ = int_mode; + if (int_mode_) { + i_.ResizeNoInit(width, num_features); + } else { + f_.ResizeNoInit(width, num_features); + } +} + +// Resizes to a specific stride_map. +void NetworkIO::ResizeToMap(bool int_mode, const StrideMap& stride_map, + int num_features) { + // If this assert fails, it most likely got here through an uninitialized + // scratch element, ie call NetworkScratch::IO::Resizexxx() not + // NetworkIO::Resizexxx()!! + ASSERT_HOST(this != NULL); + stride_map_ = stride_map; + int_mode_ = int_mode; + if (int_mode_) { + i_.ResizeNoInit(stride_map.Width(), num_features); + } else { + f_.ResizeNoInit(stride_map.Width(), num_features); + } + ZeroInvalidElements(); +} + +// Shrinks image size by x_scale,y_scale, and use given number of features. 
+void NetworkIO::ResizeScaled(const NetworkIO& src, + int x_scale, int y_scale, int num_features) { + StrideMap stride_map = src.stride_map_; + stride_map.ScaleXY(x_scale, y_scale); + ResizeToMap(src.int_mode_, stride_map, num_features); +} + +// Resizes to just 1 x-coord, whatever the input. +void NetworkIO::ResizeXTo1(const NetworkIO& src, int num_features) { + StrideMap stride_map = src.stride_map_; + stride_map.ReduceWidthTo1(); + ResizeToMap(src.int_mode_, stride_map, num_features); +} + +// Initialize all the array to zero. +void NetworkIO::Zero() { + int width = Width(); + // Zero out the everything. Column-by-column in case it is aligned. + for (int t = 0; t < width; ++t) { + ZeroTimeStep(t); + } +} + +// Initializes to zero all elements of the array that do not correspond to +// valid image positions. (If a batch of different-sized images are packed +// together, then there will be padding pixels.) +void NetworkIO::ZeroInvalidElements() { + int num_features = NumFeatures(); + int full_width = stride_map_.Size(FD_WIDTH); + int full_height = stride_map_.Size(FD_HEIGHT); + StrideMap::Index b_index(stride_map_); + do { + int end_x = b_index.MaxIndexOfDim(FD_WIDTH) + 1; + if (end_x < full_width) { + // The width is small, so fill for every valid y. + StrideMap::Index y_index(b_index); + int fill_size = num_features * (full_width - end_x); + do { + StrideMap::Index z_index(y_index); + z_index.AddOffset(end_x, FD_WIDTH); + if (int_mode_) { + ZeroVector(fill_size, i_[z_index.t()]); + } else { + ZeroVector(fill_size, f_[z_index.t()]); + } + } while (y_index.AddOffset(1, FD_HEIGHT)); + } + int end_y = b_index.MaxIndexOfDim(FD_HEIGHT) + 1; + if (end_y < full_height) { + // The height is small, so fill in the space in one go. 
+ StrideMap::Index y_index(b_index); + y_index.AddOffset(end_y, FD_HEIGHT); + int fill_size = num_features * full_width * (full_height - end_y); + if (int_mode_) { + ZeroVector(fill_size, i_[y_index.t()]); + } else { + ZeroVector(fill_size, f_[y_index.t()]); + } + } + } while (b_index.AddOffset(1, FD_BATCH)); +} + +// Helper computes a black point and white point to contrast-enhance an image. +// The computation is based on the assumption that the image is of a single line +// of text, so a horizontal line through the middle of the image passes through +// at least some of it, so local minima and maxima are a good proxy for black +// and white pixel samples. +static void ComputeBlackWhite(Pix* pix, float* black, float* white) { + int width = pixGetWidth(pix); + int height = pixGetHeight(pix); + STATS mins(0, 256), maxes(0, 256); + if (width >= 3) { + int y = height / 2; + l_uint32* line = pixGetData(pix) + pixGetWpl(pix) * y; + int prev = GET_DATA_BYTE(line, 0); + int curr = GET_DATA_BYTE(line, 1); + for (int x = 1; x + 1 < width; ++x) { + int next = GET_DATA_BYTE(line, x + 1); + if ((curr < prev && curr <= next) || (curr <= prev && curr < next)) { + // Local minimum. + mins.add(curr, 1); + } + if ((curr > prev && curr >= next) || (curr >= prev && curr > next)) { + // Local maximum. + maxes.add(curr, 1); + } + prev = curr; + curr = next; + } + } + if (mins.get_total() == 0) mins.add(0, 1); + if (maxes.get_total() == 0) maxes.add(255, 1); + *black = mins.ile(0.25); + *white = maxes.ile(0.75); +} + +// Sets up the array from the given image, using the currently set int_mode_. +// If the image width doesn't match the shape, the image is truncated or padded +// with noise to match. +void NetworkIO::FromPix(const StaticShape& shape, const Pix* pix, + TRand* randomizer) { + std::vector<const Pix*> pixes(1, pix); + FromPixes(shape, pixes, randomizer); +} + +// Sets up the array from the given set of images, using the currently set +// int_mode_. 
If the image width doesn't match the shape, the images are +// truncated or padded with noise to match. +void NetworkIO::FromPixes(const StaticShape& shape, + const std::vector<const Pix*>& pixes, + TRand* randomizer) { + int target_height = shape.height(); + int target_width = shape.width(); + std::vector<std::pair<int, int>> h_w_pairs; + for (auto pix : pixes) { + Pix* var_pix = const_cast<Pix*>(pix); + int width = pixGetWidth(var_pix); + if (target_width != 0) width = target_width; + int height = pixGetHeight(var_pix); + if (target_height != 0) height = target_height; + h_w_pairs.emplace_back(height, width); + } + stride_map_.SetStride(h_w_pairs); + ResizeToMap(int_mode(), stride_map_, shape.depth()); + // Iterate over the images again to copy the data. + for (int b = 0; b < pixes.size(); ++b) { + Pix* pix = const_cast<Pix*>(pixes[b]); + float black = 0.0f, white = 255.0f; + if (shape.depth() != 3) ComputeBlackWhite(pix, &black, &white); + float contrast = (white - black) / 2.0f; + if (contrast <= 0.0f) contrast = 1.0f; + if (shape.height() == 1) { + Copy1DGreyImage(b, pix, black, contrast, randomizer); + } else { + Copy2DImage(b, pix, black, contrast, randomizer); + } + } +} + +// Copies the given pix to *this at the given batch index, stretching and +// clipping the pixel values so that [black, black + 2*contrast] maps to the +// dynamic range of *this, ie [-1,1] for a float and (-127,127) for int. +// This is a 2-d operation in the sense that the output depth is the number +// of input channels, the height is the height of the image, and the width +// is the width of the image, or truncated/padded with noise if the width +// is a fixed size. 
+void NetworkIO::Copy2DImage(int batch, Pix* pix, float black, float contrast, + TRand* randomizer) { + int width = pixGetWidth(pix); + int height = pixGetHeight(pix); + int wpl = pixGetWpl(pix); + StrideMap::Index index(stride_map_); + index.AddOffset(batch, FD_BATCH); + int t = index.t(); + int target_height = stride_map_.Size(FD_HEIGHT); + int target_width = stride_map_.Size(FD_WIDTH); + int num_features = NumFeatures(); + bool color = num_features == 3; + if (width > target_width) width = target_width; + uinT32* line = pixGetData(pix); + for (int y = 0; y < target_height; ++y, line += wpl) { + int x = 0; + if (y < height) { + for (x = 0; x < width; ++x, ++t) { + if (color) { + int f = 0; + for (int c = COLOR_RED; c <= COLOR_BLUE; ++c) { + int pixel = GET_DATA_BYTE(line + x, c); + SetPixel(t, f++, pixel, black, contrast); + } + } else { + int pixel = GET_DATA_BYTE(line, x); + SetPixel(t, 0, pixel, black, contrast); + } + } + } + for (; x < target_width; ++x) Randomize(t++, 0, num_features, randomizer); + } +} + +// Copies the given pix to *this at the given batch index, as Copy2DImage +// above, except that the output depth is the height of the input image, the +// output height is 1, and the output width as for Copy2DImage. +// The image is thus treated as a 1-d set of vertical pixel strips. 
+void NetworkIO::Copy1DGreyImage(int batch, Pix* pix, float black, + float contrast, TRand* randomizer) { + int width = pixGetWidth(pix); + int height = pixGetHeight(pix); + ASSERT_HOST(height == NumFeatures()); + int wpl = pixGetWpl(pix); + StrideMap::Index index(stride_map_); + index.AddOffset(batch, FD_BATCH); + int t = index.t(); + int target_width = stride_map_.Size(FD_WIDTH); + if (width > target_width) width = target_width; + int x; + for (x = 0; x < width; ++x, ++t) { + for (int y = 0; y < height; ++y) { + uinT32* line = pixGetData(pix) + wpl * y; + int pixel = GET_DATA_BYTE(line, x); + SetPixel(t, y, pixel, black, contrast); + } + } + for (; x < target_width; ++x) Randomize(t++, 0, height, randomizer); +} + +// Helper stores the pixel value in i_ or f_ according to int_mode_. +// t: is the index from the StrideMap corresponding to the current +// [batch,y,x] position +// f: is the index into the depth/channel +// pixel: the value of the pixel from the image (in one channel) +// black: the pixel value to map to the lowest of the range of *this +// contrast: the range of pixel values to stretch to half the range of *this. +void NetworkIO::SetPixel(int t, int f, int pixel, float black, float contrast) { + float float_pixel = (pixel - black) / contrast - 1.0f; + if (int_mode_) { + i_[t][f] = ClipToRange(IntCastRounded((MAX_INT8 + 1) * float_pixel), + -MAX_INT8, MAX_INT8); + } else { + f_[t][f] = float_pixel; + } +} + +// Converts the array to a Pix. Must be pixDestroyed after use. +Pix* NetworkIO::ToPix() const { + // Count the width of the image, and find the max multiplication factor. + int im_width = stride_map_.Size(FD_WIDTH); + int im_height = stride_map_.Size(FD_HEIGHT); + int num_features = NumFeatures(); + int feature_factor = 1; + if (num_features == 3) { + // Special hack for color. 
+ num_features = 1; + feature_factor = 3; + } + Pix* pix = pixCreate(im_width, im_height * num_features, 32); + StrideMap::Index index(stride_map_); + do { + int im_x = index.index(FD_WIDTH); + int top_im_y = index.index(FD_HEIGHT); + int im_y = top_im_y; + int t = index.t(); + if (int_mode_) { + const inT8* features = i_[t]; + for (int y = 0; y < num_features; ++y, im_y += im_height) { + int pixel = features[y * feature_factor]; + // 1 or 2 features use greyscale. + int red = ClipToRange(pixel + 128, 0, 255); + int green = red, blue = red; + if (feature_factor == 3) { + // With 3 features assume RGB color. + green = ClipToRange(features[y * feature_factor + 1] + 128, 0, 255); + blue = ClipToRange(features[y * feature_factor + 2] + 128, 0, 255); + } else if (num_features > 3) { + // More than 3 features use false yellow/blue color, assuming a signed + // input in the range [-1,1]. + red = abs(pixel) * 2; + if (pixel >= 0) { + green = red; + blue = 0; + } else { + blue = red; + green = red = 0; + } + } + pixSetPixel(pix, im_x, im_y, (red << L_RED_SHIFT) | + (green << L_GREEN_SHIFT) | + (blue << L_BLUE_SHIFT)); + } + } else { + const float* features = f_[t]; + for (int y = 0; y < num_features; ++y, im_y += im_height) { + float pixel = features[y * feature_factor]; + // 1 or 2 features use greyscale. + int red = ClipToRange(IntCastRounded((pixel + 1.0f) * 127.5f), 0, 255); + int green = red, blue = red; + if (feature_factor == 3) { + // With 3 features assume RGB color. + pixel = features[y * feature_factor + 1]; + green = ClipToRange(IntCastRounded((pixel + 1.0f) * 127.5f), 0, 255); + pixel = features[y * feature_factor + 2]; + blue = ClipToRange(IntCastRounded((pixel + 1.0f) * 127.5f), 0, 255); + } else if (num_features > 3) { + // More than 3 features use false yellow/blue color, assuming a signed + // input in the range [-1,1]. 
+ red = ClipToRange(IntCastRounded(fabs(pixel) * 255), 0, 255); + if (pixel >= 0) { + green = red; + blue = 0; + } else { + blue = red; + green = red = 0; + } + } + pixSetPixel(pix, im_x, im_y, (red << L_RED_SHIFT) | + (green << L_GREEN_SHIFT) | + (blue << L_BLUE_SHIFT)); + } + } + } while (index.Increment()); + return pix; +} + +// Prints the first and last num timesteps of the array for each feature. +void NetworkIO::Print(int num) const { + int num_features = NumFeatures(); + for (int y = 0; y < num_features; ++y) { + for (int t = 0; t < Width(); ++t) { + if (num == 0 || t < num || t + num >= Width()) { + if (int_mode_) { + tprintf(" %g", static_cast(i_[t][y]) / MAX_INT8); + } else { + tprintf(" %g", f_[t][y]); + } + } + } + tprintf("\n"); + } +} + +// Copies a single time step from src. +void NetworkIO::CopyTimeStepFrom(int dest_t, const NetworkIO& src, int src_t) { + ASSERT_HOST(int_mode_ == src.int_mode_); + if (int_mode_) { + memcpy(i_[dest_t], src.i_[src_t], i_.dim2() * sizeof(i_[0][0])); + } else { + memcpy(f_[dest_t], src.f_[src_t], f_.dim2() * sizeof(f_[0][0])); + } +} + +// Copies a part of single time step from src. +void NetworkIO::CopyTimeStepGeneral(int dest_t, int dest_offset, + int num_features, const NetworkIO& src, + int src_t, int src_offset) { + ASSERT_HOST(int_mode_ == src.int_mode_); + if (int_mode_) { + memcpy(i_[dest_t] + dest_offset, src.i_[src_t] + src_offset, + num_features * sizeof(i_[0][0])); + } else { + memcpy(f_[dest_t] + dest_offset, src.f_[src_t] + src_offset, + num_features * sizeof(f_[0][0])); + } +} + +// Zeroes a single time step. +void NetworkIO::ZeroTimeStepGeneral(int t, int offset, int num_features) { + if (int_mode_) { + ZeroVector(num_features, i_[t] + offset); + } else { + ZeroVector(num_features, f_[t] + offset); + } +} + +// Sets the given range to random values. 
+void NetworkIO::Randomize(int t, int offset, int num_features, + TRand* randomizer) { + if (int_mode_) { + inT8* line = i_[t] + offset; + for (int i = 0; i < num_features; ++i) + line[i] = IntCastRounded(randomizer->SignedRand(MAX_INT8)); + } else { + // float mode. + float* line = f_[t] + offset; + for (int i = 0; i < num_features; ++i) + line[i] = randomizer->SignedRand(1.0); + } +} + +// Helper returns the label and score of the best choice over a range. +int NetworkIO::BestChoiceOverRange(int t_start, int t_end, int not_this, + int null_ch, float* rating, + float* certainty) const { + if (t_end <= t_start) return -1; + int max_char = -1; + float min_score = 0.0f; + for (int c = 0; c < NumFeatures(); ++c) { + if (c == not_this || c == null_ch) continue; + ScoresOverRange(t_start, t_end, c, null_ch, rating, certainty); + if (max_char < 0 || *rating < min_score) { + min_score = *rating; + max_char = c; + } + } + ScoresOverRange(t_start, t_end, max_char, null_ch, rating, certainty); + return max_char; +} + +// Helper returns the rating and certainty of the choice over a range in output. 
+void NetworkIO::ScoresOverRange(int t_start, int t_end, int choice, int null_ch, + float* rating, float* certainty) const { + ASSERT_HOST(!int_mode_); + *rating = 0.0f; + *certainty = 0.0f; + if (t_end <= t_start || t_end <= 0) return; + float ratings[3] = {0.0f, 0.0f, 0.0f}; + float certs[3] = {0.0f, 0.0f, 0.0f}; + for (int t = t_start; t < t_end; ++t) { + const float* line = f_[t]; + float score = ProbToCertainty(line[choice]); + float zero = ProbToCertainty(line[null_ch]); + if (t == t_start) { + ratings[2] = MAX_FLOAT32; + ratings[1] = -score; + certs[1] = score; + } else { + for (int i = 2; i >= 1; --i) { + if (ratings[i] > ratings[i - 1]) { + ratings[i] = ratings[i - 1]; + certs[i] = certs[i - 1]; + } + } + ratings[2] -= zero; + if (zero < certs[2]) certs[2] = zero; + ratings[1] -= score; + if (score < certs[1]) certs[1] = score; + } + ratings[0] -= zero; + if (zero < certs[0]) certs[0] = zero; + } + int best_i = ratings[2] < ratings[1] ? 2 : 1; + *rating = ratings[best_i] + t_end - t_start; + *certainty = certs[best_i]; +} + +// Returns the index (label) of the best value at the given timestep, +// excluding not_this and not_that, and if not null, sets the score to the +// log of the corresponding value. +int NetworkIO::BestLabel(int t, int not_this, int not_that, + float* score) const { + ASSERT_HOST(!int_mode_); + int best_index = -1; + float best_score = -MAX_FLOAT32; + const float* line = f_[t]; + for (int i = 0; i < f_.dim2(); ++i) { + if (line[i] > best_score && i != not_this && i != not_that) { + best_score = line[i]; + best_index = i; + } + } + if (score != NULL) *score = ProbToCertainty(best_score); + return best_index; +} + +// Returns the best start position out of [start, end) (into which all labels +// must fit) to obtain the highest cumulative score for the given labels. 
+int NetworkIO::PositionOfBestMatch(const GenericVector& labels, int start, + int end) const { + int length = labels.size(); + int last_start = end - length; + int best_start = -1; + double best_score = 0.0; + for (int s = start; s <= last_start; ++s) { + double score = ScoreOfLabels(labels, s); + if (score > best_score || best_start < 0) { + best_score = score; + best_start = s; + } + } + return best_start; +} + +// Returns the cumulative score of the given labels starting at start, and +// using one label per time-step. +double NetworkIO::ScoreOfLabels(const GenericVector& labels, + int start) const { + int length = labels.size(); + double score = 0.0; + for (int i = 0; i < length; ++i) { + score += f_(start + i, labels[i]); + } + return score; +} + +// Helper function sets all the outputs for a single timestep, such that +// label has value ok_score, and the other labels share 1 - ok_score. +void NetworkIO::SetActivations(int t, int label, float ok_score) { + ASSERT_HOST(!int_mode_); + int num_classes = NumFeatures(); + float bad_score = (1.0f - ok_score) / (num_classes - 1); + float* targets = f_[t]; + for (int i = 0; i < num_classes; ++i) + targets[i] = bad_score; + targets[label] = ok_score; +} + +// Modifies the values, only if needed, so that the given label is +// the winner at the given time step t. +void NetworkIO::EnsureBestLabel(int t, int label) { + ASSERT_HOST(!int_mode_); + if (BestLabel(t, NULL) != label) { + // Output value needs enhancing. Third all the other elements and add the + // remainder to best_label. + int num_classes = NumFeatures(); + float* targets = f_[t]; + for (int c = 0; c < num_classes; ++c) { + if (c == label) { + targets[c] += (1.0 - targets[c]) * (2 / 3.0); + } else { + targets[c] /= 3.0; + } + } + } +} + +// Helper function converts prob to certainty taking the minimum into account. +/* static */ +float NetworkIO::ProbToCertainty(float prob) { + return prob > kMinProb ? 
log(prob) : kMinCertainty; +} + +// Returns true if there is any bad value that is suspiciously like a GT +// error. Assuming that *this is the difference(gradient) between target +// and forward output, returns true if there is a large negative value +// (correcting a very confident output) for which there is no corresponding +// positive value in an adjacent timestep for the same feature index. This +// allows the box-truthed samples to make fine adjustments to position while +// stopping other disagreements of confident output with ground truth. +bool NetworkIO::AnySuspiciousTruth(float confidence_thr) const { + int num_features = NumFeatures(); + for (int t = 0; t < Width(); ++t) { + const float* features = f_[t]; + for (int y = 0; y < num_features; ++y) { + float grad = features[y]; + if (grad < -confidence_thr) { + // Correcting strong output. Check for movement. + if ((t == 0 || f_[t - 1][y] < confidence_thr / 2) && + (t + 1 == Width() || f_[t + 1][y] < confidence_thr / 2)) { + return true; // No strong positive on either side. + } + } + } + } + return false; +} + +// Reads a single timestep to floats in the range [-1, 1]. +void NetworkIO::ReadTimeStep(int t, double* output) const { + if (int_mode_) { + const inT8* line = i_[t]; + for (int i = 0; i < i_.dim2(); ++i) { + output[i] = static_cast(line[i]) / MAX_INT8; + } + } else { + const float* line = f_[t]; + for (int i = 0; i < f_.dim2(); ++i) { + output[i] = static_cast(line[i]); + } + } +} + +// Adds a single timestep to floats. +void NetworkIO::AddTimeStep(int t, double* inout) const { + int num_features = NumFeatures(); + if (int_mode_) { + const inT8* line = i_[t]; + for (int i = 0; i < num_features; ++i) { + inout[i] += static_cast(line[i]) / MAX_INT8; + } + } else { + const float* line = f_[t]; + for (int i = 0; i < num_features; ++i) { + inout[i] += line[i]; + } + } +} + +// Adds part of a single timestep to floats. 
+void NetworkIO::AddTimeStepPart(int t, int offset, int num_features, + float* inout) const { + if (int_mode_) { + const inT8* line = i_[t] + offset; + for (int i = 0; i < num_features; ++i) { + inout[i] += static_cast(line[i]) / MAX_INT8; + } + } else { + const float* line = f_[t] + offset; + for (int i = 0; i < num_features; ++i) { + inout[i] += line[i]; + } + } +} + +// Writes a single timestep from floats in the range [-1, 1]. +void NetworkIO::WriteTimeStep(int t, const double* input) { + WriteTimeStepPart(t, 0, NumFeatures(), input); +} + +// Writes a single timestep from floats in the range [-1, 1] writing only +// num_features elements of input to (*this)[t], starting at offset. +void NetworkIO::WriteTimeStepPart(int t, int offset, int num_features, + const double* input) { + if (int_mode_) { + inT8* line = i_[t] + offset; + for (int i = 0; i < num_features; ++i) { + line[i] = ClipToRange(IntCastRounded(input[i] * MAX_INT8), + -MAX_INT8, MAX_INT8); + } + } else { + float* line = f_[t] + offset; + for (int i = 0; i < num_features; ++i) { + line[i] = static_cast(input[i]); + } + } +} + +// Maxpools a single time step from src. +void NetworkIO::MaxpoolTimeStep(int dest_t, const NetworkIO& src, int src_t, + int* max_line) { + ASSERT_HOST(int_mode_ == src.int_mode_); + if (int_mode_) { + int dim = i_.dim2(); + inT8* dest_line = i_[dest_t]; + const inT8* src_line = src.i_[src_t]; + for (int i = 0; i < dim; ++i) { + if (dest_line[i] < src_line[i]) { + dest_line[i] = src_line[i]; + max_line[i] = src_t; + } + } + } else { + int dim = f_.dim2(); + float* dest_line = f_[dest_t]; + const float* src_line = src.f_[src_t]; + for (int i = 0; i < dim; ++i) { + if (dest_line[i] < src_line[i]) { + dest_line[i] = src_line[i]; + max_line[i] = src_t; + } + } + } +} + +// Runs maxpool backward, using maxes to index timesteps in *this. 
+void NetworkIO::MaxpoolBackward(const NetworkIO& fwd, + const GENERIC_2D_ARRAY& maxes) { + ASSERT_HOST(!int_mode_); + Zero(); + StrideMap::Index index(fwd.stride_map_); + do { + int t = index.t(); + const int* max_line = maxes[t]; + const float* fwd_line = fwd.f_[t]; + int num_features = fwd.f_.dim2(); + for (int i = 0; i < num_features; ++i) { + f_[max_line[i]][i] = fwd_line[i]; + } + } while (index.Increment()); +} + +// Returns the min over time of the maxes over features of the outputs. +float NetworkIO::MinOfMaxes() const { + float min_max = 0.0f; + int width = Width(); + int num_features = NumFeatures(); + for (int t = 0; t < width; ++t) { + float max_value = -MAX_FLOAT32; + if (int_mode_) { + const inT8* column = i_[t]; + for (int i = 0; i < num_features; ++i) { + if (column[i] > max_value) max_value = column[i]; + } + } else { + const float* column = f_[t]; + for (int i = 0; i < num_features; ++i) { + if (column[i] > max_value) max_value = column[i]; + } + } + if (t == 0 || max_value < min_max) min_max = max_value; + } + return min_max; +} + +// Computes combined results for a combiner that chooses between an existing +// input and itself, with an additional output to indicate the choice. +void NetworkIO::CombineOutputs(const NetworkIO& base_output, + const NetworkIO& combiner_output) { + int no = base_output.NumFeatures(); + ASSERT_HOST(combiner_output.NumFeatures() == no + 1); + Resize(base_output, no); + int width = Width(); + if (int_mode_) { + // Number of outputs from base and final result. 
+ for (int t = 0; t < width; ++t) { + inT8* out_line = i_[t]; + const inT8* base_line = base_output.i_[t]; + const inT8* comb_line = combiner_output.i_[t]; + float base_weight = static_cast(comb_line[no]) / MAX_INT8; + float boost_weight = 1.0f - base_weight; + for (int i = 0; i < no; ++i) { + out_line[i] = IntCastRounded(base_line[i] * base_weight + + comb_line[i] * boost_weight); + } + } + } else { + for (int t = 0; t < width; ++t) { + float* out_line = f_[t]; + const float* base_line = base_output.f_[t]; + const float* comb_line = combiner_output.f_[t]; + float base_weight = comb_line[no]; + float boost_weight = 1.0f - base_weight; + for (int i = 0; i < no; ++i) { + out_line[i] = base_line[i] * base_weight + comb_line[i] * boost_weight; + } + } + } +} + +// Computes deltas for a combiner that chooses between 2 sets of inputs. +void NetworkIO::ComputeCombinerDeltas(const NetworkIO& fwd_deltas, + const NetworkIO& base_output) { + ASSERT_HOST(!int_mode_); + // Compute the deltas for the combiner. + int width = Width(); + int no = NumFeatures() - 1; + ASSERT_HOST(fwd_deltas.NumFeatures() == no); + ASSERT_HOST(base_output.NumFeatures() == no); + // Number of outputs from base and final result. + for (int t = 0; t < width; ++t) { + const float* delta_line = fwd_deltas.f_[t]; + const float* base_line = base_output.f_[t]; + float* comb_line = f_[t]; + float base_weight = comb_line[no]; + float boost_weight = 1.0f - base_weight; + float max_base_delta = 0.0; + for (int i = 0; i < no; ++i) { + // What did the combiner actually produce? + float output = base_line[i] * base_weight + comb_line[i] * boost_weight; + // Reconstruct the target from the delta. + float comb_target = delta_line[i] + output; + comb_line[i] = comb_target - comb_line[i]; + float base_delta = fabs(comb_target - base_line[i]); + if (base_delta > max_base_delta) max_base_delta = base_delta; + } + if (max_base_delta >= 0.5) { + // The base network got it wrong. 
The combiner should output the right + // answer and 0 for the base network. + comb_line[no] = 0.0 - base_weight; + } else { + // The base network was right. The combiner should flag that. + for (int i = 0; i < no; ++i) { + // All other targets are 0. + if (comb_line[i] > 0.0) comb_line[i] -= 1.0; + } + comb_line[no] = 1.0 - base_weight; + } + } +} + +// Copies the array checking that the types match. +void NetworkIO::CopyAll(const NetworkIO& src) { + ASSERT_HOST(src.int_mode_ == int_mode_); + f_ = src.f_; +} + +// Checks that both are floats and adds the src array to *this. +void NetworkIO::AddAllToFloat(const NetworkIO& src) { + ASSERT_HOST(!int_mode_); + ASSERT_HOST(!src.int_mode_); + f_ += src.f_; +} + +// Subtracts the array from a float array. src must also be float. +void NetworkIO::SubtractAllFromFloat(const NetworkIO& src) { + ASSERT_HOST(!int_mode_); + ASSERT_HOST(!src.int_mode_); + f_ -= src.f_; +} + +// Copies src to *this, with maxabs normalization to match scale. +void NetworkIO::CopyWithNormalization(const NetworkIO& src, + const NetworkIO& scale) { + ASSERT_HOST(!int_mode_); + ASSERT_HOST(!src.int_mode_); + ASSERT_HOST(!scale.int_mode_); + float src_max = src.f_.MaxAbs(); + ASSERT_HOST(std::isfinite(src_max)); + float scale_max = scale.f_.MaxAbs(); + ASSERT_HOST(std::isfinite(scale_max)); + if (src_max > 0.0f) { + float factor = scale_max / src_max; + for (int t = 0; t < src.Width(); ++t) { + const float* src_ptr = src.f_[t]; + float* dest_ptr = f_[t]; + for (int i = 0; i < src.f_.dim2(); ++i) dest_ptr[i] = src_ptr[i] * factor; + } + } else { + f_.Clear(); + } +} + +// Copies src to *this with independent reversal of the y dimension. 
+void NetworkIO::CopyWithYReversal(const NetworkIO& src) { + int num_features = src.NumFeatures(); + Resize(src, num_features); + StrideMap::Index b_index(src.stride_map_); + do { + int width = b_index.MaxIndexOfDim(FD_WIDTH) + 1; + StrideMap::Index fwd_index(b_index); + StrideMap::Index rev_index(b_index); + rev_index.AddOffset(rev_index.MaxIndexOfDim(FD_HEIGHT), FD_HEIGHT); + do { + int fwd_t = fwd_index.t(); + int rev_t = rev_index.t(); + for (int x = 0; x < width; ++x) CopyTimeStepFrom(rev_t++, src, fwd_t++); + } while (fwd_index.AddOffset(1, FD_HEIGHT) && + rev_index.AddOffset(-1, FD_HEIGHT)); + } while (b_index.AddOffset(1, FD_BATCH)); +} + +// Copies src to *this with independent reversal of the x dimension. +void NetworkIO::CopyWithXReversal(const NetworkIO& src) { + int num_features = src.NumFeatures(); + Resize(src, num_features); + StrideMap::Index b_index(src.stride_map_); + do { + StrideMap::Index y_index(b_index); + do { + StrideMap::Index fwd_index(y_index); + StrideMap::Index rev_index(y_index); + rev_index.AddOffset(rev_index.MaxIndexOfDim(FD_WIDTH), FD_WIDTH); + do { + CopyTimeStepFrom(rev_index.t(), src, fwd_index.t()); + } while (fwd_index.AddOffset(1, FD_WIDTH) && + rev_index.AddOffset(-1, FD_WIDTH)); + } while (y_index.AddOffset(1, FD_HEIGHT)); + } while (b_index.AddOffset(1, FD_BATCH)); +} + +// Copies src to *this with independent transpose of the x and y dimensions. 
+void NetworkIO::CopyWithXYTranspose(const NetworkIO& src) { + int num_features = src.NumFeatures(); + stride_map_ = src.stride_map_; + stride_map_.TransposeXY(); + ResizeToMap(src.int_mode(), stride_map_, num_features); + StrideMap::Index src_b_index(src.stride_map_); + StrideMap::Index dest_b_index(stride_map_); + do { + StrideMap::Index src_y_index(src_b_index); + StrideMap::Index dest_x_index(dest_b_index); + do { + StrideMap::Index src_x_index(src_y_index); + StrideMap::Index dest_y_index(dest_x_index); + do { + CopyTimeStepFrom(dest_y_index.t(), src, src_x_index.t()); + } while (src_x_index.AddOffset(1, FD_WIDTH) && + dest_y_index.AddOffset(1, FD_HEIGHT)); + } while (src_y_index.AddOffset(1, FD_HEIGHT) && + dest_x_index.AddOffset(1, FD_WIDTH)); + } while (src_b_index.AddOffset(1, FD_BATCH) && + dest_b_index.AddOffset(1, FD_BATCH)); +} + +// Copies src to *this, at the given feature_offset, returning the total +// feature offset after the copy. Multiple calls will stack outputs from +// multiple sources in feature space. +int NetworkIO::CopyPacking(const NetworkIO& src, int feature_offset) { + ASSERT_HOST(int_mode_ == src.int_mode_); + int width = src.Width(); + ASSERT_HOST(width <= Width()); + int num_features = src.NumFeatures(); + ASSERT_HOST(num_features + feature_offset <= NumFeatures()); + if (int_mode_) { + for (int t = 0; t < width; ++t) { + memcpy(i_[t] + feature_offset, src.i_[t], + num_features * sizeof(i_[t][0])); + } + for (int t = width; t < i_.dim1(); ++t) { + memset(i_[t], 0, num_features * sizeof(i_[t][0])); + } + } else { + for (int t = 0; t < width; ++t) { + memcpy(f_[t] + feature_offset, src.f_[t], + num_features * sizeof(f_[t][0])); + } + for (int t = width; t < f_.dim1(); ++t) { + memset(f_[t], 0, num_features * sizeof(f_[t][0])); + } + } + return num_features + feature_offset; +} + +// Opposite of CopyPacking, fills *this with a part of src, starting at +// feature_offset, and picking num_features. 
+void NetworkIO::CopyUnpacking(const NetworkIO& src, int feature_offset, + int num_features) { + Resize(src, num_features); + int width = src.Width(); + ASSERT_HOST(num_features + feature_offset <= src.NumFeatures()); + if (int_mode_) { + for (int t = 0; t < width; ++t) { + memcpy(i_[t], src.i_[t] + feature_offset, + num_features * sizeof(i_[t][0])); + } + } else { + for (int t = 0; t < width; ++t) { + memcpy(f_[t], src.f_[t] + feature_offset, + num_features * sizeof(f_[t][0])); + } + } +} + +// Transposes the float part of *this into dest. +void NetworkIO::Transpose(TransposedArray* dest) const { + int width = Width(); + dest->ResizeNoInit(NumFeatures(), width); + for (int t = 0; t < width; ++t) dest->WriteStrided(t, f_[t]); +} + +// Clips the content of a single time-step to +/-range. +void NetworkIO::ClipVector(int t, float range) { + ASSERT_HOST(!int_mode_); + float* v = f_[t]; + int dim = f_.dim2(); + for (int i = 0; i < dim; ++i) + v[i] = ClipToRange(v[i], -range, range); +} + +} // namespace tesseract. diff --git a/lstm/networkio.h b/lstm/networkio.h new file mode 100644 index 00000000..50822699 --- /dev/null +++ b/lstm/networkio.h @@ -0,0 +1,341 @@ +/////////////////////////////////////////////////////////////////////// +// File: networkio.h +// Description: Network input/output data, allowing float/int implementations. +// Author: Ray Smith +// Created: Tue Jun 17 08:43:11 PST 2014 +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_LSTM_NETWORKIO_H_ +#define TESSERACT_LSTM_NETWORKIO_H_ + +#include +#include +#include + +#include "genericvector.h" +#include "helpers.h" +#include "static_shape.h" +#include "stridemap.h" +#include "weightmatrix.h" + +struct Pix; + +namespace tesseract { + +// Class to contain all the input/output of a network, allowing for fixed or +// variable-strided 2d to 1d mapping, and float or inT8 values. Provides +// enough calculating functions to hide the detail of the implementation. +class NetworkIO { + public: + NetworkIO() : int_mode_(false) {} + // Resizes the array (and stride), avoiding realloc if possible, to the given + // size from various size specs: + // Same stride size, but given number of features. + void Resize(const NetworkIO& src, int num_features) { + ResizeToMap(src.int_mode(), src.stride_map(), num_features); + } + // Resizes to a specific size as a 2-d temp buffer. No batches, no y-dim. + void Resize2d(bool int_mode, int width, int num_features); + // Resizes forcing a float representation with the stridemap of src and the + // given number of features. + void ResizeFloat(const NetworkIO& src, int num_features) { + ResizeToMap(false, src.stride_map(), num_features); + } + // Resizes to a specific stride_map. + void ResizeToMap(bool int_mode, const StrideMap& stride_map, + int num_features); + // Shrinks image size by x_scale,y_scale, and use given number of features. + void ResizeScaled(const NetworkIO& src, int x_scale, int y_scale, + int num_features); + // Resizes to just 1 x-coord, whatever the input. + void ResizeXTo1(const NetworkIO& src, int num_features); + // Initialize all the array to zero. + void Zero(); + // Initializes to zero all elements of the array that do not correspond to + // valid image positions. 
(If a batch of different-sized images are packed + // together, then there will be padding pixels.) + void ZeroInvalidElements(); + // Sets up the array from the given image, using the currently set int_mode_. + // If the image width doesn't match the shape, the image is truncated or + // padded with noise to match. + void FromPix(const StaticShape& shape, const Pix* pix, TRand* randomizer); + // Sets up the array from the given set of images, using the currently set + // int_mode_. If the image width doesn't match the shape, the images are + // truncated or padded with noise to match. + void FromPixes(const StaticShape& shape, const std::vector& pixes, + TRand* randomizer); + // Copies the given pix to *this at the given batch index, stretching and + // clipping the pixel values so that [black, black + 2*contrast] maps to the + // dynamic range of *this, ie [-1,1] for a float and (-127,127) for int. + // This is a 2-d operation in the sense that the output depth is the number + // of input channels, the height is the height of the image, and the width + // is the width of the image, or truncated/padded with noise if the width + // is a fixed size. + void Copy2DImage(int batch, Pix* pix, float black, float contrast, + TRand* randomizer); + // Copies the given pix to *this at the given batch index, as Copy2DImage + // above, except that the output depth is the height of the input image, the + // output height is 1, and the output width as for Copy2DImage. + // The image is thus treated as a 1-d set of vertical pixel strips. + void Copy1DGreyImage(int batch, Pix* pix, float black, float contrast, + TRand* randomizer); + // Helper stores the pixel value in i_ or f_ according to int_mode_. 
+ // t: is the index from the StrideMap corresponding to the current + // [batch,y,x] position + // f: is the index into the depth/channel + // pixel: the value of the pixel from the image (in one channel) + // black: the pixel value to map to the lowest of the range of *this + // contrast: the range of pixel values to stretch to half the range of *this. + void SetPixel(int t, int f, int pixel, float black, float contrast); + // Converts the array to a Pix. Must be pixDestroyed after use. + Pix* ToPix() const; + // Prints the first and last num timesteps of the array for each feature. + void Print(int num) const; + + // Returns the timestep width. + int Width() const { + return int_mode_ ? i_.dim1() : f_.dim1(); + } + // Returns the number of features. + int NumFeatures() const { + return int_mode_ ? i_.dim2() : f_.dim2(); + } + // Accessor to a timestep of the float matrix. + float* f(int t) { + ASSERT_HOST(!int_mode_); + return f_[t]; + } + const float* f(int t) const { + ASSERT_HOST(!int_mode_); + return f_[t]; + } + const inT8* i(int t) const { + ASSERT_HOST(int_mode_); + return i_[t]; + } + bool int_mode() const { + return int_mode_; + } + void set_int_mode(bool is_quantized) { + int_mode_ = is_quantized; + } + const StrideMap& stride_map() const { + return stride_map_; + } + void set_stride_map(const StrideMap& map) { + stride_map_ = map; + } + const GENERIC_2D_ARRAY& float_array() const { return f_; } + GENERIC_2D_ARRAY* mutable_float_array() { return &f_; } + + // Copies a single time step from src. + void CopyTimeStepFrom(int dest_t, const NetworkIO& src, int src_t); + // Copies a part of single time step from src. + void CopyTimeStepGeneral(int dest_t, int dest_offset, int num_features, + const NetworkIO& src, int src_t, int src_offset); + // Zeroes a single time step. + void ZeroTimeStep(int t) { ZeroTimeStepGeneral(t, 0, NumFeatures()); } + void ZeroTimeStepGeneral(int t, int offset, int num_features); + // Sets the given range to random values. 
+ void Randomize(int t, int offset, int num_features, TRand* randomizer); + + // Helper returns the label and score of the best choice over a range. + int BestChoiceOverRange(int t_start, int t_end, int not_this, int null_ch, + float* rating, float* certainty) const; + // Helper returns the rating and certainty of the choice over a range in t. + void ScoresOverRange(int t_start, int t_end, int choice, int null_ch, + float* rating, float* certainty) const; + // Returns the index (label) of the best value at the given timestep, + // and if not null, sets the score to the log of the corresponding value. + int BestLabel(int t, float* score) const { + return BestLabel(t, -1, -1, score); + } + // Returns the index (label) of the best value at the given timestep, + // excluding not_this and not_that, and if not null, sets the score to the + // log of the corresponding value. + int BestLabel(int t, int not_this, int not_that, float* score) const; + // Returns the best start position out of range (into which both start and end + // must fit) to obtain the highest cumulative score for the given labels. + int PositionOfBestMatch(const GenericVector& labels, int start, + int end) const; + // Returns the cumulative score of the given labels starting at start, and + // using one label per time-step. + double ScoreOfLabels(const GenericVector& labels, int start) const; + // Helper function sets all the outputs for a single timestep, such that + // label has value ok_score, and the other labels share 1 - ok_score. + // Assumes float mode. + void SetActivations(int t, int label, float ok_score); + // Modifies the values, only if needed, so that the given label is + // the winner at the given time step t. + // Assumes float mode. + void EnsureBestLabel(int t, int label); + // Helper function converts prob to certainty taking the minimum into account. + static float ProbToCertainty(float prob); + // Returns true if there is any bad value that is suspiciously like a GT + // error. 
Assuming that *this is the difference(gradient) between target + // and forward output, returns true if there is a large negative value + // (correcting a very confident output) for which there is no corresponding + // positive value in an adjacent timestep for the same feature index. This + // allows the box-truthed samples to make fine adjustments to position while + // stopping other disagreements of confident output with ground truth. + bool AnySuspiciousTruth(float confidence_thr) const; + + // Reads a single timestep to floats in the range [-1, 1]. + void ReadTimeStep(int t, double* output) const; + // Adds a single timestep to floats. + void AddTimeStep(int t, double* inout) const; + // Adds part of a single timestep to floats. + void AddTimeStepPart(int t, int offset, int num_features, float* inout) const; + // Writes a single timestep from floats in the range [-1, 1]. + void WriteTimeStep(int t, const double* input); + // Writes a single timestep from floats in the range [-1, 1] writing only + // num_features elements of input to (*this)[t], starting at offset. + void WriteTimeStepPart(int t, int offset, int num_features, + const double* input); + // Maxpools a single time step from src. + void MaxpoolTimeStep(int dest_t, const NetworkIO& src, int src_t, + int* max_line); + // Runs maxpool backward, using maxes to index timesteps in *this. + void MaxpoolBackward(const NetworkIO& fwd, + const GENERIC_2D_ARRAY& maxes); + // Returns the min over time of the maxes over features of the outputs. + float MinOfMaxes() const; + // Returns the min over time. + float Max() const { return int_mode_ ? i_.Max() : f_.Max(); } + // Computes combined results for a combiner that chooses between an existing + // input and itself, with an additional output to indicate the choice. + void CombineOutputs(const NetworkIO& base_output, + const NetworkIO& combiner_output); + // Computes deltas for a combiner that chooses between 2 sets of inputs. 
+ void ComputeCombinerDeltas(const NetworkIO& fwd_deltas, + const NetworkIO& base_output); + + // Copies the array checking that the types match. + void CopyAll(const NetworkIO& src); + // Adds the array to a float array, with scaling to [-1, 1] if the src is int. + void AddAllToFloat(const NetworkIO& src); + // Subtracts the array from a float array. src must also be float. + void SubtractAllFromFloat(const NetworkIO& src); + + // Copies src to *this, with maxabs normalization to match scale. + void CopyWithNormalization(const NetworkIO& src, const NetworkIO& scale); + // Multiplies the float data by the given factor. + void ScaleFloatBy(float factor) { f_ *= factor; } + // Copies src to *this with independent reversal of the y dimension. + void CopyWithYReversal(const NetworkIO& src); + // Copies src to *this with independent reversal of the x dimension. + void CopyWithXReversal(const NetworkIO& src); + // Copies src to *this with independent transpose of the x and y dimensions. + void CopyWithXYTranspose(const NetworkIO& src); + // Copies src to *this, at the given feature_offset, returning the total + // feature offset after the copy. Multiple calls will stack outputs from + // multiple sources in feature space. + int CopyPacking(const NetworkIO& src, int feature_offset); + // Opposite of CopyPacking, fills *this with a part of src, starting at + // feature_offset, and picking num_features. Resizes *this to match. + void CopyUnpacking(const NetworkIO& src, int feature_offset, + int num_features); + // Transposes the float part of *this into dest. + void Transpose(TransposedArray* dest) const; + + // Clips the content of a single time-step to +/-range. + void ClipVector(int t, float range); + + // Applies Func to timestep t of *this (u) and multiplies the result by v + // component-wise, putting the product in *product. + // *this and v may be int or float, but must match. The outputs are double. 
+ template + void FuncMultiply(const NetworkIO& v_io, int t, double* product) { + Func f; + ASSERT_HOST(!int_mode_); + ASSERT_HOST(!v_io.int_mode_); + int dim = f_.dim2(); + if (int_mode_) { + const inT8* u = i_[t]; + const inT8* v = v_io.i_[t]; + for (int i = 0; i < dim; ++i) { + product[i] = f(u[i] / static_cast(MAX_INT8)) * v[i] / + static_cast(MAX_INT8); + } + } else { + const float* u = f_[t]; + const float* v = v_io.f_[t]; + for (int i = 0; i < dim; ++i) { + product[i] = f(u[i]) * v[i]; + } + } + } + // Applies Func to *this (u) at u_t, and multiplies the result by v[v_t] * w, + // component-wise, putting the product in *product. + // All NetworkIOs are assumed to be float. + template + void FuncMultiply3(int u_t, const NetworkIO& v_io, int v_t, const double* w, + double* product) const { + ASSERT_HOST(!int_mode_); + ASSERT_HOST(!v_io.int_mode_); + Func f; + const float* u = f_[u_t]; + const float* v = v_io.f_[v_t]; + int dim = f_.dim2(); + for (int i = 0; i < dim; ++i) { + product[i] = f(u[i]) * v[i] * w[i]; + } + } + // Applies Func to *this (u) at u_t, and multiplies the result by v[v_t] * w, + // component-wise, adding the product to *product. + // All NetworkIOs are assumed to be float. + template + void FuncMultiply3Add(const NetworkIO& v_io, int t, const double* w, + double* product) const { + ASSERT_HOST(!int_mode_); + ASSERT_HOST(!v_io.int_mode_); + Func f; + const float* u = f_[t]; + const float* v = v_io.f_[t]; + int dim = f_.dim2(); + for (int i = 0; i < dim; ++i) { + product[i] += f(u[i]) * v[i] * w[i]; + } + } + // Applies Func1 to *this (u), Func2 to v, and multiplies the result by w, + // component-wise, putting the product in product, all at timestep t, except + // w, which is a simple array. All NetworkIOs are assumed to be float. 
+ template + void Func2Multiply3(const NetworkIO& v_io, int t, const double* w, + double* product) const { + ASSERT_HOST(!int_mode_); + ASSERT_HOST(!v_io.int_mode_); + Func1 f; + Func2 g; + const float* u = f_[t]; + const float* v = v_io.f_[t]; + int dim = f_.dim2(); + for (int i = 0; i < dim; ++i) { + product[i] = f(u[i]) * g(v[i]) * w[i]; + } + } + + private: + // Choice of float vs 8 bit int for data. + GENERIC_2D_ARRAY f_; + GENERIC_2D_ARRAY i_; + // Which of f_ and i_ are we actually using. + bool int_mode_; + // Stride for 2d input data. + StrideMap stride_map_; +}; + +} // namespace tesseract. + +#endif // TESSERACT_LSTM_NETWORKIO_H_ diff --git a/lstm/networkscratch.h b/lstm/networkscratch.h new file mode 100644 index 00000000..28185506 --- /dev/null +++ b/lstm/networkscratch.h @@ -0,0 +1,257 @@ +/////////////////////////////////////////////////////////////////////// +// File: networkscratch.h +// Description: Scratch space for Network layers that hides distinction +// between float/int implementations. +// Author: Ray Smith +// Created: Thu Jun 19 10:50:29 PST 2014 +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_LSTM_NETWORKSCRATCH_H_ +#define TESSERACT_LSTM_NETWORKSCRATCH_H_ + +#include "genericvector.h" +#include "matrix.h" +#include "networkio.h" +#include "svutil.h" +#include "tprintf.h" + +namespace tesseract { + +// Generic scratch space for network layers. Provides NetworkIO that can store +// a complete set (over time) of intermediates, and GenericVector +// scratch space that auto-frees after use. The aim here is to provide a set +// of temporary buffers to network layers that can be reused between layers +// and don't have to be reallocated on each call. +class NetworkScratch { + public: + NetworkScratch() : int_mode_(false) {} + ~NetworkScratch() {} + + // Sets the network representation. If the representation is integer, then + // default (integer) NetworkIOs are separated from the always-float variety. + // This saves memory by having separate int-specific and float-specific + // stacks. If the network representation is float, then all NetworkIOs go + // to the float stack. + void set_int_mode(bool int_mode) { + int_mode_ = int_mode; + } + + // Class that acts like a NetworkIO (by having an implicit cast operator), + // yet actually holds a pointer to NetworkIOs in the source NetworkScratch, + // and knows how to unstack the borrowed pointers on destruction. + class IO { + public: + // The NetworkIO should be sized after construction. + IO(const NetworkIO& src, NetworkScratch* scratch) + : int_mode_(scratch->int_mode_ && src.int_mode()), + scratch_space_(scratch) { + network_io_ = int_mode_ ? scratch_space_->int_stack_.Borrow() + : scratch_space_->float_stack_.Borrow(); + } + // Default constructor for arrays. Use one of the Resize functions + // below to initialize and size. 
+ IO() : int_mode_(false), network_io_(NULL), scratch_space_(NULL) {} + + ~IO() { + if (scratch_space_ == NULL) { + ASSERT_HOST(network_io_ == NULL); + } else if (int_mode_) { + scratch_space_->int_stack_.Return(network_io_); + } else { + scratch_space_->float_stack_.Return(network_io_); + } + } + // Resizes the array (and stride), avoiding realloc if possible, to the + // size from various size specs: + // Same time size, given number of features. + void Resize(const NetworkIO& src, int num_features, + NetworkScratch* scratch) { + if (scratch_space_ == NULL) { + int_mode_ = scratch->int_mode_ && src.int_mode(); + scratch_space_ = scratch; + network_io_ = int_mode_ ? scratch_space_->int_stack_.Borrow() + : scratch_space_->float_stack_.Borrow(); + } + network_io_->Resize(src, num_features); + } + // Resizes to a specific size as a temp buffer. No batches, no y-dim. + void Resize2d(bool int_mode, int width, int num_features, + NetworkScratch* scratch) { + if (scratch_space_ == NULL) { + int_mode_ = scratch->int_mode_ && int_mode; + scratch_space_ = scratch; + network_io_ = int_mode_ ? scratch_space_->int_stack_.Borrow() + : scratch_space_->float_stack_.Borrow(); + } + network_io_->Resize2d(int_mode, width, num_features); + } + // Resize forcing a float representation with the width of src and the given + // number of features. + void ResizeFloat(const NetworkIO& src, int num_features, + NetworkScratch* scratch) { + if (scratch_space_ == NULL) { + int_mode_ = false; + scratch_space_ = scratch; + network_io_ = scratch_space_->float_stack_.Borrow(); + } + network_io_->ResizeFloat(src, num_features); + } + + // Returns a ref to a NetworkIO that enables *this to be treated as if + // it were just a NetworkIO*. + NetworkIO& operator*() { + return *network_io_; + } + NetworkIO* operator->() { + return network_io_; + } + operator NetworkIO*() { + return network_io_; + } + + private: + // True if this is from the always-float stack, otherwise the default stack. 
+ bool int_mode_; + // The NetworkIO that we have borrowed from the scratch_space_. + NetworkIO* network_io_; + // The source scratch_space_. Borrowed pointer, used to free the + // NetworkIO. Don't delete! + NetworkScratch* scratch_space_; + }; // class IO. + + // Class that acts like a fixed array of float, yet actually uses space + // from a GenericVector in the source NetworkScratch, and knows how + // to unstack the borrowed vector on destruction. + class FloatVec { + public: + // The array will have size elements in it, uninitialized. + FloatVec(int size, NetworkScratch* scratch) + : vec_(NULL), scratch_space_(scratch) { + Init(size, scratch); + } + // Default constructor is for arrays. Use Init to setup. + FloatVec() : vec_(NULL), data_(NULL), scratch_space_(NULL) {} + ~FloatVec() { + if (scratch_space_ != NULL) scratch_space_->vec_stack_.Return(vec_); + } + + void Init(int size, NetworkScratch* scratch) { + if (scratch_space_ != NULL && vec_ != NULL) + scratch_space_->vec_stack_.Return(vec_); + scratch_space_ = scratch; + vec_ = scratch_space_->vec_stack_.Borrow(); + vec_->resize_no_init(size); + data_ = &(*vec_)[0]; + } + + // Use the cast operator instead of operator[] so the FloatVec can be used + // as a double* argument to a function call. + operator double*() const { return data_; } + double* get() { return data_; } + + private: + // Vector borrowed from the scratch space. Use Return to free it. + GenericVector* vec_; + // Short-cut pointer to the underlying array. + double* data_; + // The source scratch_space_. Borrowed pointer, used to free the + // vector. Don't delete! + NetworkScratch* scratch_space_; + }; // class FloatVec + + // Class that acts like a 2-D array of double, yet actually uses space + // from the source NetworkScratch, and knows how to unstack the borrowed + // array on destruction. + class GradientStore { + public: + // Default constructor is for arrays. Use Init to setup. 
+ GradientStore() : array_(NULL), scratch_space_(NULL) {} + ~GradientStore() { + if (scratch_space_ != NULL) scratch_space_->array_stack_.Return(array_); + } + + void Init(int size1, int size2, NetworkScratch* scratch) { + if (scratch_space_ != NULL && array_ != NULL) + scratch_space_->array_stack_.Return(array_); + scratch_space_ = scratch; + array_ = scratch_space_->array_stack_.Borrow(); + array_->Resize(size1, size2, 0.0); + } + + // Accessors to get to the underlying TransposedArray. + TransposedArray* get() const { return array_; } + const TransposedArray& operator*() const { return *array_; } + + private: + // Array borrowed from the scratch space. Use Return to free it. + TransposedArray* array_; + // The source scratch_space_. Borrowed pointer, used to free the + // vector. Don't delete! + NetworkScratch* scratch_space_; + }; // class GradientStore + + // Class that does the work of holding a stack of objects, a stack pointer + // and a vector of in-use flags, so objects can be returned out of order. + // It is safe to attempt to Borrow/Return in multiple threads. + template class Stack { + public: + Stack() : stack_top_(0) { + } + + // Lends out the next free item, creating one if none available, sets + // the used flags and increments the stack top. + T* Borrow() { + SVAutoLock lock(&mutex_); + if (stack_top_ == stack_.size()) { + stack_.push_back(new T); + flags_.push_back(false); + } + flags_[stack_top_] = true; + return stack_[stack_top_++]; + } + // Takes back the given item, and marks it free. Item does not have to be + // the most recently lent out, but free slots don't get re-used until the + // blocking item is returned. The assumption is that there will only be + // small, temporary variations from true stack use. (Determined by the order + // of destructors within a local scope.) + void Return(T* item) { + SVAutoLock lock(&mutex_); + // Linear search will do. 
+ int index = stack_top_ - 1; + while (index >= 0 && stack_[index] != item) --index; + if (index >= 0) flags_[index] = false; + while (stack_top_ > 0 && !flags_[stack_top_ - 1]) --stack_top_; + } + + private: + PointerVector stack_; + GenericVector flags_; + int stack_top_; + SVMutex mutex_; + }; // class Stack. + + private: + // If true, the network weights are inT8, if false, float. + bool int_mode_; + // Stacks of NetworkIO and GenericVector. Once allocated, they are not + // deleted until the NetworkScratch is deleted. + Stack int_stack_; + Stack float_stack_; + Stack > vec_stack_; + Stack array_stack_; +}; + +} // namespace tesseract. + +#endif // TESSERACT_LSTM_NETWORKSCRATCH_H_ diff --git a/lstm/parallel.cpp b/lstm/parallel.cpp new file mode 100644 index 00000000..c60778bf --- /dev/null +++ b/lstm/parallel.cpp @@ -0,0 +1,179 @@ +///////////////////////////////////////////////////////////////////////// +// File: parallel.cpp +// Description: Runs networks in parallel on the same input. +// Author: Ray Smith +// Created: Thu May 02 08:06:06 PST 2013 +// +// (C) Copyright 2013, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "parallel.h" + +#ifdef _OPENMP +#include +#endif + +#include "functions.h" // For conditional undef of _OPENMP. +#include "networkscratch.h" + +namespace tesseract { + +// ni_ and no_ will be set by AddToStack. 
+Parallel::Parallel(const STRING& name, NetworkType type) : Plumbing(name) { + type_ = type; +} + +Parallel::~Parallel() { +} + +// Returns the shape output from the network given an input shape (which may +// be partially unknown ie zero). +StaticShape Parallel::OutputShape(const StaticShape& input_shape) const { + StaticShape result = stack_[0]->OutputShape(input_shape); + int stack_size = stack_.size(); + for (int i = 1; i < stack_size; ++i) { + StaticShape shape = stack_[i]->OutputShape(input_shape); + result.set_depth(result.depth() + shape.depth()); + } + return result; +} + +// Runs forward propagation of activations on the input line. +// See NetworkCpp for a detailed discussion of the arguments. +void Parallel::Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output) { + bool parallel_debug = false; + // If this parallel is a replicator of convolvers, or holds a 1-d LSTM pair, + // or a 2-d LSTM quad, do debug locally, and don't pass the flag on. + if (debug && type_ != NT_PARALLEL) { + parallel_debug = true; + debug = false; + } + int stack_size = stack_.size(); + if (type_ == NT_PAR_2D_LSTM) { + // Special case, run parallel in parallel. + GenericVector results; + results.init_to_size(stack_size, NetworkScratch::IO()); + for (int i = 0; i < stack_size; ++i) { + results[i].Resize(input, stack_[i]->NumOutputs(), scratch); + } +#ifdef _OPENMP +#pragma omp parallel for num_threads(stack_size) +#endif + for (int i = 0; i < stack_size; ++i) { + stack_[i]->Forward(debug, input, NULL, scratch, results[i]); + } + // Now pack all the results (serially) into the output. + int out_offset = 0; + output->Resize(*results[0], NumOutputs()); + for (int i = 0; i < stack_size; ++i) { + out_offset = output->CopyPacking(*results[i], out_offset); + } + } else { + // Revolving intermediate result. + NetworkScratch::IO result(input, scratch); + // Source for divided replicated. 
+ NetworkScratch::IO source_part; + TransposedArray* src_transpose = NULL; + if (IsTraining() && type_ == NT_REPLICATED) { + // Make a transposed copy of the input. + input.Transpose(&transposed_input_); + src_transpose = &transposed_input_; + } + // Run each network, putting the outputs into result. + int out_offset = 0; + for (int i = 0; i < stack_size; ++i) { + stack_[i]->Forward(debug, input, src_transpose, scratch, result); + // All networks must have the same output width + if (i == 0) { + output->Resize(*result, NumOutputs()); + } else { + ASSERT_HOST(result->Width() == output->Width()); + } + out_offset = output->CopyPacking(*result, out_offset); + } + } + if (parallel_debug) { + DisplayForward(*output); + } +} + +// Runs backward propagation of errors on the deltas line. +// See NetworkCpp for a detailed discussion of the arguments. +bool Parallel::Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas) { + // If this parallel is a replicator of convolvers, or holds a 1-d LSTM pair, + // or a 2-d LSTM quad, do debug locally, and don't pass the flag on. + if (debug && type_ != NT_PARALLEL) { + DisplayBackward(fwd_deltas); + debug = false; + } + int stack_size = stack_.size(); + if (type_ == NT_PAR_2D_LSTM) { + // Special case, run parallel in parallel. + GenericVector in_deltas, out_deltas; + in_deltas.init_to_size(stack_size, NetworkScratch::IO()); + out_deltas.init_to_size(stack_size, NetworkScratch::IO()); + // Split the forward deltas for each stack element. 
+ int feature_offset = 0; + for (int i = 0; i < stack_.size(); ++i) { + int num_features = stack_[i]->NumOutputs(); + in_deltas[i].Resize(fwd_deltas, num_features, scratch); + out_deltas[i].Resize(fwd_deltas, stack_[i]->NumInputs(), scratch); + in_deltas[i]->CopyUnpacking(fwd_deltas, feature_offset, num_features); + feature_offset += num_features; + } +#ifdef _OPENMP +#pragma omp parallel for num_threads(stack_size) +#endif + for (int i = 0; i < stack_size; ++i) { + stack_[i]->Backward(debug, *in_deltas[i], scratch, + i == 0 ? back_deltas : out_deltas[i]); + } + if (needs_to_backprop_) { + for (int i = 1; i < stack_size; ++i) { + back_deltas->AddAllToFloat(*out_deltas[i]); + } + } + } else { + // Revolving partial deltas. + NetworkScratch::IO in_deltas(fwd_deltas, scratch); + // The sum of deltas from different sources, which will eventually go into + // back_deltas. + NetworkScratch::IO out_deltas; + int feature_offset = 0; + for (int i = 0; i < stack_.size(); ++i) { + int num_features = stack_[i]->NumOutputs(); + in_deltas->CopyUnpacking(fwd_deltas, feature_offset, num_features); + feature_offset += num_features; + if (stack_[i]->Backward(debug, *in_deltas, scratch, back_deltas)) { + if (i == 0) { + out_deltas.ResizeFloat(*back_deltas, back_deltas->NumFeatures(), + scratch); + out_deltas->CopyAll(*back_deltas); + } else if (back_deltas->NumFeatures() == out_deltas->NumFeatures()) { + // Widths are allowed to be different going back, as we may have + // input nets, so only accumulate the deltas if the widths are the + // same. + out_deltas->AddAllToFloat(*back_deltas); + } + } + } + if (needs_to_backprop_) back_deltas->CopyAll(*out_deltas); + } + if (needs_to_backprop_) back_deltas->ScaleFloatBy(1.0f / stack_size); + return needs_to_backprop_; +} + +} // namespace tesseract. 
diff --git a/lstm/parallel.h b/lstm/parallel.h new file mode 100644 index 00000000..ad290a7e --- /dev/null +++ b/lstm/parallel.h @@ -0,0 +1,87 @@ +/////////////////////////////////////////////////////////////////////// +// File: parallel.h +// Description: Runs networks in parallel on the same input. +// Author: Ray Smith +// Created: Thu May 02 08:02:06 PST 2013 +// +// (C) Copyright 2013, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_LSTM_PARALLEL_H_ +#define TESSERACT_LSTM_PARALLEL_H_ + +#include "plumbing.h" + +namespace tesseract { + +// Runs multiple networks in parallel, interlacing their outputs. +class Parallel : public Plumbing { + public: + // ni_ and no_ will be set by AddToStack. + Parallel(const STRING& name, NetworkType type); + virtual ~Parallel(); + + // Returns the shape output from the network given an input shape (which may + // be partially unknown ie zero). + virtual StaticShape OutputShape(const StaticShape& input_shape) const; + + virtual STRING spec() const { + STRING spec; + if (type_ == NT_PAR_2D_LSTM) { + // We have 4 LSTMs operating in parallel here, so the size of each is + // the number of outputs/4. + spec.add_str_int("L2xy", no_ / 4); + } else if (type_ == NT_PAR_RL_LSTM) { + // We have 2 LSTMs operating in parallel here, so the size of each is + // the number of outputs/2. 
+ if (stack_[0]->type() == NT_LSTM_SUMMARY) + spec.add_str_int("Lbxs", no_ / 2); + else + spec.add_str_int("Lbx", no_ / 2); + } else { + if (type_ == NT_REPLICATED) { + spec.add_str_int("R", stack_.size()); + spec += "("; + spec += stack_[0]->spec(); + } else { + spec = "("; + for (int i = 0; i < stack_.size(); ++i) spec += stack_[i]->spec(); + } + spec += ")"; + } + return spec; + } + + // Runs forward propagation of activations on the input line. + // See Network for a detailed discussion of the arguments. + virtual void Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output); + + // Runs backward propagation of errors on the deltas line. + // See Network for a detailed discussion of the arguments. + virtual bool Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas); + + private: + // If *this is a NT_REPLICATED, then it feeds a replicated network with + // identical inputs, and it would be extremely wasteful for them to each + // calculate and store the same transpose of the inputs, so Parallel does it + // and passes a pointer to the replicated network, allowing it to use the + // transpose on the next call to Backward. + TransposedArray transposed_input_; +}; + +} // namespace tesseract. + +#endif // TESSERACT_LSTM_PARALLEL_H_ diff --git a/lstm/plumbing.cpp b/lstm/plumbing.cpp new file mode 100644 index 00000000..bfb58254 --- /dev/null +++ b/lstm/plumbing.cpp @@ -0,0 +1,238 @@ +/////////////////////////////////////////////////////////////////////// +// File: plumbing.cpp +// Description: Base class for networks that organize other networks +// eg series or parallel. +// Author: Ray Smith +// Created: Mon May 12 08:17:34 PST 2014 +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "plumbing.h" + +namespace tesseract { + +// ni_ and no_ will be set by AddToStack. +Plumbing::Plumbing(const STRING& name) + : Network(NT_PARALLEL, name, 0, 0) { +} + +Plumbing::~Plumbing() { +} + +// Suspends/Enables training by setting the training_ flag. Serialize and +// DeSerialize only operate on the run-time data if state is false. +void Plumbing::SetEnableTraining(TrainingState state) { + Network::SetEnableTraining(state); + for (int i = 0; i < stack_.size(); ++i) + stack_[i]->SetEnableTraining(state); +} + +// Sets flags that control the action of the network. See NetworkFlags enum +// for bit values. +void Plumbing::SetNetworkFlags(uinT32 flags) { + Network::SetNetworkFlags(flags); + for (int i = 0; i < stack_.size(); ++i) + stack_[i]->SetNetworkFlags(flags); +} + +// Sets up the network for training. Initializes weights using weights of +// scale `range` picked according to the random number generator `randomizer`. +// Note that randomizer is a borrowed pointer that should outlive the network +// and should not be deleted by any of the networks. +// Returns the number of weights initialized. +int Plumbing::InitWeights(float range, TRand* randomizer) { + num_weights_ = 0; + for (int i = 0; i < stack_.size(); ++i) + num_weights_ += stack_[i]->InitWeights(range, randomizer); + return num_weights_; +} + +// Converts a float network to an int network. 
+void Plumbing::ConvertToInt() { + for (int i = 0; i < stack_.size(); ++i) + stack_[i]->ConvertToInt(); +} + +// Provides a pointer to a TRand for any networks that care to use it. +// Note that randomizer is a borrowed pointer that should outlive the network +// and should not be deleted by any of the networks. +void Plumbing::SetRandomizer(TRand* randomizer) { + for (int i = 0; i < stack_.size(); ++i) + stack_[i]->SetRandomizer(randomizer); +} + +// Adds the given network to the stack. +void Plumbing::AddToStack(Network* network) { + if (stack_.empty()) { + ni_ = network->NumInputs(); + no_ = network->NumOutputs(); + } else if (type_ == NT_SERIES) { + // ni is input of first, no output of last, others match output to input. + ASSERT_HOST(no_ == network->NumInputs()); + no_ = network->NumOutputs(); + } else { + // All parallel types. Output is sum of outputs, inputs all match. + ASSERT_HOST(ni_ == network->NumInputs()); + no_ += network->NumOutputs(); + } + stack_.push_back(network); +} + +// Sets needs_to_backprop_ to needs_backprop and calls on sub-network +// according to needs_backprop || any weights in this network. +bool Plumbing::SetupNeedsBackprop(bool needs_backprop) { + if (IsTraining()) { + needs_to_backprop_ = needs_backprop; + bool retval = needs_backprop; + for (int i = 0; i < stack_.size(); ++i) { + if (stack_[i]->SetupNeedsBackprop(needs_backprop)) retval = true; + } + return retval; + } + // Frozen networks don't do backprop. + needs_to_backprop_ = false; + return false; +} + +// Returns an integer reduction factor that the network applies to the +// time sequence. Assumes that any 2-d is already eliminated. Used for +// scaling bounding boxes of truth data. +// WARNING: if GlobalMinimax is used to vary the scale, this will return +// the last used scale factor. Call it before any forward, and it will return +// the minimum scale factor of the paths through the GlobalMinimax. 
+int Plumbing::XScaleFactor() const { + return stack_[0]->XScaleFactor(); +} + +// Provides the (minimum) x scale factor to the network (of interest only to +// input units) so they can determine how to scale bounding boxes. +void Plumbing::CacheXScaleFactor(int factor) { + for (int i = 0; i < stack_.size(); ++i) { + stack_[i]->CacheXScaleFactor(factor); + } +} + +// Provides debug output on the weights. +void Plumbing::DebugWeights() { + for (int i = 0; i < stack_.size(); ++i) + stack_[i]->DebugWeights(); +} + +// Returns a set of strings representing the layer-ids of all layers below. +void Plumbing::EnumerateLayers(const STRING* prefix, + GenericVector* layers) const { + for (int i = 0; i < stack_.size(); ++i) { + STRING layer_name; + if (prefix) layer_name = *prefix; + layer_name.add_str_int(":", i); + if (stack_[i]->IsPlumbingType()) { + Plumbing* plumbing = reinterpret_cast(stack_[i]); + plumbing->EnumerateLayers(&layer_name, layers); + } else { + layers->push_back(layer_name); + } + } +} + +// Returns a pointer to the network layer corresponding to the given id. +Network* Plumbing::GetLayer(const char* id) const { + char* next_id; + int index = strtol(id, &next_id, 10); + if (index < 0 || index >= stack_.size()) return NULL; + if (stack_[index]->IsPlumbingType()) { + Plumbing* plumbing = reinterpret_cast(stack_[index]); + ASSERT_HOST(*next_id == ':'); + return plumbing->GetLayer(next_id + 1); + } + return stack_[index]; +} + +// Returns a pointer to the learning rate for the given layer id. 
+float* Plumbing::LayerLearningRatePtr(const char* id) const { + char* next_id; + int index = strtol(id, &next_id, 10); + if (index < 0 || index >= stack_.size()) return NULL; + if (stack_[index]->IsPlumbingType()) { + Plumbing* plumbing = reinterpret_cast(stack_[index]); + ASSERT_HOST(*next_id == ':'); + return plumbing->LayerLearningRatePtr(next_id + 1); + } + if (index < 0 || index >= learning_rates_.size()) return NULL; + return &learning_rates_[index]; +} + +// Writes to the given file. Returns false in case of error. +bool Plumbing::Serialize(TFile* fp) const { + if (!Network::Serialize(fp)) return false; + inT32 size = stack_.size(); + // Can't use PointerVector::Serialize here as we need a special DeSerialize. + if (fp->FWrite(&size, sizeof(size), 1) != 1) return false; + for (int i = 0; i < size; ++i) + if (!stack_[i]->Serialize(fp)) return false; + if ((network_flags_ & NF_LAYER_SPECIFIC_LR) && + !learning_rates_.Serialize(fp)) { + return false; + } + return true; +} + +// Reads from the given file. Returns false in case of error. +// If swap is true, assumes a big/little-endian swap is needed. +bool Plumbing::DeSerialize(bool swap, TFile* fp) { + stack_.truncate(0); + no_ = 0; // We will be modifying this as we AddToStack. + inT32 size; + if (fp->FRead(&size, sizeof(size), 1) != 1) return false; + for (int i = 0; i < size; ++i) { + Network* network = CreateFromFile(swap, fp); + if (network == NULL) return false; + AddToStack(network); + } + if ((network_flags_ & NF_LAYER_SPECIFIC_LR) && + !learning_rates_.DeSerialize(swap, fp)) { + return false; + } + return true; +} + +// Updates the weights using the given learning rate and momentum. +// num_samples is the quotient to be used in the adagrad computation iff +// use_ada_grad_ is true. 
+void Plumbing::Update(float learning_rate, float momentum, int num_samples) { + for (int i = 0; i < stack_.size(); ++i) { + if (network_flags_ & NF_LAYER_SPECIFIC_LR) { + if (i < learning_rates_.size()) + learning_rate = learning_rates_[i]; + else + learning_rates_.push_back(learning_rate); + } + if (stack_[i]->IsTraining()) { + stack_[i]->Update(learning_rate, momentum, num_samples); + } + } +} + +// Sums the products of weight updates in *this and other, splitting into +// positive (same direction) in *same and negative (different direction) in +// *changed. +void Plumbing::CountAlternators(const Network& other, double* same, + double* changed) const { + ASSERT_HOST(other.type() == type_); + const Plumbing* plumbing = reinterpret_cast(&other); + ASSERT_HOST(plumbing->stack_.size() == stack_.size()); + for (int i = 0; i < stack_.size(); ++i) + stack_[i]->CountAlternators(*plumbing->stack_[i], same, changed); +} + +} // namespace tesseract. + diff --git a/lstm/plumbing.h b/lstm/plumbing.h new file mode 100644 index 00000000..bda855e0 --- /dev/null +++ b/lstm/plumbing.h @@ -0,0 +1,143 @@ +/////////////////////////////////////////////////////////////////////// +// File: plumbing.h +// Description: Base class for networks that organize other networks +// eg series or parallel. +// Author: Ray Smith +// Created: Mon May 12 08:11:36 PST 2014 +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_LSTM_PLUMBING_H_ +#define TESSERACT_LSTM_PLUMBING_H_ + +#include "genericvector.h" +#include "matrix.h" +#include "network.h" + +namespace tesseract { + +// Holds a collection of other networks and forwards calls to each of them. +class Plumbing : public Network { + public: + // ni_ and no_ will be set by AddToStack. + explicit Plumbing(const STRING& name); + virtual ~Plumbing(); + + // Returns the required shape input to the network. + virtual StaticShape InputShape() const { return stack_[0]->InputShape(); } + virtual STRING spec() const { + return "Sub-classes of Plumbing must implement spec()!"; + } + + // Returns true if the given type is derived from Plumbing, and thus contains + // multiple sub-networks that can have their own learning rate. + virtual bool IsPlumbingType() const { return true; } + + // Suspends/Enables training by setting the training_ flag. Serialize and + // DeSerialize only operate on the run-time data if state is false. + virtual void SetEnableTraining(TrainingState state); + + // Sets flags that control the action of the network. See NetworkFlags enum + // for bit values. + virtual void SetNetworkFlags(uinT32 flags); + + // Sets up the network for training. Initializes weights using weights of + // scale `range` picked according to the random number generator `randomizer`. + // Note that randomizer is a borrowed pointer that should outlive the network + // and should not be deleted by any of the networks. + // Returns the number of weights initialized. + virtual int InitWeights(float range, TRand* randomizer); + + // Converts a float network to an int network. + virtual void ConvertToInt(); + + // Provides a pointer to a TRand for any networks that care to use it. + // Note that randomizer is a borrowed pointer that should outlive the network + // and should not be deleted by any of the networks. 
+ virtual void SetRandomizer(TRand* randomizer); + + // Adds the given network to the stack. + virtual void AddToStack(Network* network); + + // Sets needs_to_backprop_ to needs_backprop and returns true if + // needs_backprop || any weights in this network so the next layer forward + // can be told to produce backprop for this layer if needed. + virtual bool SetupNeedsBackprop(bool needs_backprop); + + // Returns an integer reduction factor that the network applies to the + // time sequence. Assumes that any 2-d is already eliminated. Used for + // scaling bounding boxes of truth data. + // WARNING: if GlobalMinimax is used to vary the scale, this will return + // the last used scale factor. Call it before any forward, and it will return + // the minimum scale factor of the paths through the GlobalMinimax. + virtual int XScaleFactor() const; + + // Provides the (minimum) x scale factor to the network (of interest only to + // input units) so they can determine how to scale bounding boxes. + virtual void CacheXScaleFactor(int factor); + + // Provides debug output on the weights. + virtual void DebugWeights(); + + // Returns the current stack. + const PointerVector& stack() const { + return stack_; + } + // Returns a set of strings representing the layer-ids of all layers below. + void EnumerateLayers(const STRING* prefix, + GenericVector* layers) const; + // Returns a pointer to the network layer corresponding to the given id. + Network* GetLayer(const char* id) const; + // Returns the learning rate for a specific layer of the stack. + float LayerLearningRate(const char* id) const { + const float* lr_ptr = LayerLearningRatePtr(id); + ASSERT_HOST(lr_ptr != NULL); + return *lr_ptr; + } + // Scales the learning rate for a specific layer of the stack. 
+ void ScaleLayerLearningRate(const char* id, double factor) { + float* lr_ptr = LayerLearningRatePtr(id); + ASSERT_HOST(lr_ptr != NULL); + *lr_ptr *= factor; + } + // Returns a pointer to the learning rate for the given layer id. + float* LayerLearningRatePtr(const char* id) const; + + // Writes to the given file. Returns false in case of error. + virtual bool Serialize(TFile* fp) const; + // Reads from the given file. Returns false in case of error. + // If swap is true, assumes a big/little-endian swap is needed. + virtual bool DeSerialize(bool swap, TFile* fp); + + // Updates the weights using the given learning rate and momentum. + // num_samples is the quotient to be used in the adagrad computation iff + // use_ada_grad_ is true. + virtual void Update(float learning_rate, float momentum, int num_samples); + // Sums the products of weight updates in *this and other, splitting into + // positive (same direction) in *same and negative (different direction) in + // *changed. + virtual void CountAlternators(const Network& other, double* same, + double* changed) const; + + protected: + // The networks. + PointerVector stack_; + // Layer-specific learning rate iff network_flags_ & NF_LAYER_SPECIFIC_LR. + // One element for each element of stack_. + GenericVector learning_rates_; +}; + +} // namespace tesseract. + +#endif // TESSERACT_LSTM_PLUMBING_H_ + diff --git a/lstm/recodebeam.cpp b/lstm/recodebeam.cpp new file mode 100644 index 00000000..1b4a7477 --- /dev/null +++ b/lstm/recodebeam.cpp @@ -0,0 +1,896 @@ +/////////////////////////////////////////////////////////////////////// +// File: recodebeam.cpp +// Description: Beam search to decode from the re-encoded CJK as a sequence of +// smaller numbers in place of a single large code. +// Author: Ray Smith +// Created: Fri Mar 13 09:39:01 PDT 2015 +// +// (C) Copyright 2015, Google Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#include "recodebeam.h" +#include "networkio.h" +#include "pageres.h" +#include "unicharcompress.h" + +namespace tesseract { + +// Clipping value for certainty inside Tesseract. Reflects the minimum value +// of certainty that will be returned by ExtractBestPathAsUnicharIds. +// Supposedly on a uniform scale that can be compared across languages and +// engines. +const float RecodeBeamSearch::kMinCertainty = -20.0f; + +// The beam width at each code position. +const int RecodeBeamSearch::kBeamWidths[RecodedCharID::kMaxCodeLen + 1] = { + 5, 10, 16, 16, 16, 16, 16, 16, 16, 16, +}; + +const char* kNodeContNames[] = {"Anything", "OnlyDup", "NoDup"}; + +// Prints debug details of the node. +void RecodeNode::Print(int null_char, const UNICHARSET& unicharset, + int depth) const { + if (code == null_char) { + tprintf("null_char"); + } else { + tprintf("label=%d, uid=%d=%s", code, unichar_id, + unicharset.debug_str(unichar_id).string()); + } + tprintf(" score=%g, c=%g,%s%s%s perm=%d, hash=%lx", score, certainty, + start_of_dawg ? " DawgStart" : "", start_of_word ? " Start" : "", + end_of_word ? " End" : "", permuter, code_hash); + if (depth > 0 && prev != nullptr) { + tprintf(" prev:"); + prev->Print(null_char, unicharset, depth - 1); + } else { + tprintf("\n"); + } +} + +// Borrows the pointer, which is expected to survive until *this is deleted. 
+RecodeBeamSearch::RecodeBeamSearch(const UnicharCompress& recoder, + int null_char, bool simple_text, Dict* dict) + : recoder_(recoder), + beam_size_(0), + top_code_(-1), + second_code_(-1), + dict_(dict), + space_delimited_(true), + is_simple_text_(simple_text), + null_char_(null_char) { + if (dict_ != NULL && !dict_->IsSpaceDelimitedLang()) space_delimited_ = false; +} + +// Decodes the set of network outputs, storing the lattice internally. +void RecodeBeamSearch::Decode(const NetworkIO& output, double dict_ratio, + double cert_offset, double worst_dict_cert, + const UNICHARSET* charset) { + beam_size_ = 0; + int width = output.Width(); + for (int t = 0; t < width; ++t) { + ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0]); + DecodeStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert, + charset); + } +} +void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY& output, + double dict_ratio, double cert_offset, + double worst_dict_cert, + const UNICHARSET* charset) { + beam_size_ = 0; + int width = output.dim1(); + for (int t = 0; t < width; ++t) { + ComputeTopN(output[t], output.dim2(), kBeamWidths[0]); + DecodeStep(output[t], t, dict_ratio, cert_offset, worst_dict_cert, charset); + } +} + +// Returns the best path as labels/scores/xcoords similar to simple CTC. +void RecodeBeamSearch::ExtractBestPathAsLabels( + GenericVector* labels, GenericVector* xcoords) const { + labels->truncate(0); + xcoords->truncate(0); + GenericVector best_nodes; + ExtractBestPaths(&best_nodes, NULL); + // Now just run CTC on the best nodes. + int t = 0; + int width = best_nodes.size(); + while (t < width) { + int label = best_nodes[t]->code; + if (label != null_char_) { + labels->push_back(label); + xcoords->push_back(t); + } + while (++t < width && !is_simple_text_ && best_nodes[t]->code == label) { + } + } + xcoords->push_back(width); +} + +// Returns the best path as unichar-ids/certs/ratings/xcoords skipping +// duplicates, nulls and intermediate parts. 
+void RecodeBeamSearch::ExtractBestPathAsUnicharIds( + bool debug, const UNICHARSET* unicharset, GenericVector* unichar_ids, + GenericVector* certs, GenericVector* ratings, + GenericVector* xcoords) const { + GenericVector best_nodes; + ExtractBestPaths(&best_nodes, NULL); + ExtractPathAsUnicharIds(best_nodes, unichar_ids, certs, ratings, xcoords); + if (debug) { + DebugPath(unicharset, best_nodes); + DebugUnicharPath(unicharset, best_nodes, *unichar_ids, *certs, *ratings, + *xcoords); + } +} + +// Returns the best path as a set of WERD_RES. +void RecodeBeamSearch::ExtractBestPathAsWords(const TBOX& line_box, + float scale_factor, bool debug, + const UNICHARSET* unicharset, + PointerVector* words) { + words->truncate(0); + GenericVector unichar_ids; + GenericVector certs; + GenericVector ratings; + GenericVector xcoords; + GenericVector best_nodes; + GenericVector second_nodes; + ExtractBestPaths(&best_nodes, &second_nodes); + if (debug) { + DebugPath(unicharset, best_nodes); + ExtractPathAsUnicharIds(second_nodes, &unichar_ids, &certs, &ratings, + &xcoords); + tprintf("\nSecond choice path:\n"); + DebugUnicharPath(unicharset, second_nodes, unichar_ids, certs, ratings, + xcoords); + } + ExtractPathAsUnicharIds(best_nodes, &unichar_ids, &certs, &ratings, &xcoords); + int num_ids = unichar_ids.size(); + if (debug) { + DebugUnicharPath(unicharset, best_nodes, unichar_ids, certs, ratings, + xcoords); + } + // Convert labels to unichar-ids. + int word_end = 0; + float prev_space_cert = 0.0f; + for (int word_start = 0; word_start < num_ids; word_start = word_end) { + for (word_end = word_start + 1; word_end < num_ids; ++word_end) { + // A word is terminated when a space character or start_of_word flag is + // hit. We also want to force a separate word for every non + // space-delimited character when not in a dictionary context. 
+ if (unichar_ids[word_end] == UNICHAR_SPACE) break; + int index = xcoords[word_end]; + if (best_nodes[index]->start_of_word) break; + if (best_nodes[index]->permuter == TOP_CHOICE_PERM && + (!unicharset->IsSpaceDelimited(unichar_ids[word_end]) || + !unicharset->IsSpaceDelimited(unichar_ids[word_end - 1]))) + break; + } + float space_cert = 0.0f; + if (word_end < num_ids && unichar_ids[word_end] == UNICHAR_SPACE) + space_cert = certs[word_end]; + bool leading_space = + word_start > 0 && unichar_ids[word_start - 1] == UNICHAR_SPACE; + // Create a WERD_RES for the output word. + WERD_RES* word_res = InitializeWord( + leading_space, line_box, word_start, word_end, + MIN(space_cert, prev_space_cert), unicharset, xcoords, scale_factor); + for (int i = word_start; i < word_end; ++i) { + BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST; + BLOB_CHOICE_IT bc_it(choices); + BLOB_CHOICE* choice = new BLOB_CHOICE( + unichar_ids[i], ratings[i], certs[i], -1, 1.0f, + static_cast(MAX_INT16), 0.0f, BCC_STATIC_CLASSIFIER); + int col = i - word_start; + choice->set_matrix_cell(col, col); + bc_it.add_after_then_move(choice); + word_res->ratings->put(col, col, choices); + } + int index = xcoords[word_end - 1]; + word_res->FakeWordFromRatings(best_nodes[index]->permuter); + words->push_back(word_res); + prev_space_cert = space_cert; + if (word_end < num_ids && unichar_ids[word_end] == UNICHAR_SPACE) + ++word_end; + } +} + +// Generates debug output of the content of the beams after a Decode. +void RecodeBeamSearch::DebugBeams(const UNICHARSET& unicharset) const { + for (int p = 0; p < beam_size_; ++p) { + for (int d = 0; d < 2; ++d) { + for (int c = 0; c < NC_COUNT; ++c) { + NodeContinuation cont = static_cast(c); + int index = BeamIndex(d, cont, 0); + if (beam_[p]->beams_[index].empty()) continue; + // Print all the best scoring nodes for each unichar found. + tprintf("Position %d: %s+%s beam\n", p, d ? 
"Dict" : "Non-Dict", + kNodeContNames[c]); + DebugBeamPos(unicharset, beam_[p]->beams_[index]); + } + } + } +} + +// Generates debug output of the content of a single beam position. +void RecodeBeamSearch::DebugBeamPos(const UNICHARSET& unicharset, + const RecodeHeap& heap) const { + GenericVector unichar_bests; + unichar_bests.init_to_size(unicharset.size(), NULL); + const RecodeNode* null_best = NULL; + int heap_size = heap.size(); + for (int i = 0; i < heap_size; ++i) { + const RecodeNode* node = &heap.get(i).data; + if (node->unichar_id == INVALID_UNICHAR_ID) { + if (null_best == NULL || null_best->score < node->score) null_best = node; + } else { + if (unichar_bests[node->unichar_id] == NULL || + unichar_bests[node->unichar_id]->score < node->score) { + unichar_bests[node->unichar_id] = node; + } + } + } + for (int u = 0; u < unichar_bests.size(); ++u) { + if (unichar_bests[u] != NULL) { + const RecodeNode& node = *unichar_bests[u]; + node.Print(null_char_, unicharset, 1); + } + } + if (null_best != NULL) { + null_best->Print(null_char_, unicharset, 1); + } +} + +// Returns the given best_nodes as unichar-ids/certs/ratings/xcoords skipping +// duplicates, nulls and intermediate parts. +/* static */ +void RecodeBeamSearch::ExtractPathAsUnicharIds( + const GenericVector& best_nodes, + GenericVector* unichar_ids, GenericVector* certs, + GenericVector* ratings, GenericVector* xcoords) { + unichar_ids->truncate(0); + certs->truncate(0); + ratings->truncate(0); + xcoords->truncate(0); + // Backtrack extracting only valid, non-duplicate unichar-ids. 
+ int t = 0; + int width = best_nodes.size(); + while (t < width) { + double certainty = 0.0; + double rating = 0.0; + while (t < width && best_nodes[t]->unichar_id == INVALID_UNICHAR_ID) { + double cert = best_nodes[t++]->certainty; + if (cert < certainty) certainty = cert; + rating -= cert; + } + if (t < width) { + int unichar_id = best_nodes[t]->unichar_id; + if (unichar_id == UNICHAR_SPACE && !certs->empty() && + best_nodes[t]->permuter != NO_PERM) { + // All the rating and certainty go on the previous character except + // for the space itself. + if (certainty < certs->back()) certs->back() = certainty; + ratings->back() += rating; + certainty = 0.0; + rating = 0.0; + } + unichar_ids->push_back(unichar_id); + xcoords->push_back(t); + do { + double cert = best_nodes[t++]->certainty; + // Special-case NO-PERM space to forget the certainty of the previous + // nulls. See long comment in ContinueContext. + if (cert < certainty || (unichar_id == UNICHAR_SPACE && + best_nodes[t - 1]->permuter == NO_PERM)) { + certainty = cert; + } + rating -= cert; + } while (t < width && best_nodes[t]->duplicate); + certs->push_back(certainty); + ratings->push_back(rating); + } else if (!certs->empty()) { + if (certainty < certs->back()) certs->back() = certainty; + ratings->back() += rating; + } + } + xcoords->push_back(width); +} + +// Sets up a word with the ratings matrix and fake blobs with boxes in the +// right places. +WERD_RES* RecodeBeamSearch::InitializeWord(bool leading_space, + const TBOX& line_box, int word_start, + int word_end, float space_certainty, + const UNICHARSET* unicharset, + const GenericVector& xcoords, + float scale_factor) { + // Make a fake blob for each non-zero label. 
+ C_BLOB_LIST blobs; + C_BLOB_IT b_it(&blobs); + for (int i = word_start; i < word_end; ++i) { + int min_half_width = xcoords[i + 1] - xcoords[i]; + if (i > 0 && xcoords[i] - xcoords[i - 1] < min_half_width) + min_half_width = xcoords[i] - xcoords[i - 1]; + if (min_half_width < 1) min_half_width = 1; + // Make a fake blob. + TBOX box(xcoords[i] - min_half_width, 0, xcoords[i] + min_half_width, + line_box.height()); + box.scale(scale_factor); + box.move(ICOORD(line_box.left(), line_box.bottom())); + box.set_top(line_box.top()); + b_it.add_after_then_move(C_BLOB::FakeBlob(box)); + } + // Make a fake word from the blobs. + WERD* word = new WERD(&blobs, leading_space, NULL); + // Make a WERD_RES from the word. + WERD_RES* word_res = new WERD_RES(word); + word_res->uch_set = unicharset; + word_res->combination = true; // Give it ownership of the word. + word_res->space_certainty = space_certainty; + word_res->ratings = new MATRIX(word_end - word_start, 1); + return word_res; +} + +// Fills top_n_flags_ with bools that are true iff the corresponding output +// is one of the top_n. +void RecodeBeamSearch::ComputeTopN(const float* outputs, int num_outputs, + int top_n) { + top_n_flags_.init_to_size(num_outputs, TN_ALSO_RAN); + top_code_ = -1; + second_code_ = -1; + top_heap_.clear(); + for (int i = 0; i < num_outputs; ++i) { + if (top_heap_.size() < top_n || outputs[i] > top_heap_.PeekTop().key) { + TopPair entry(outputs[i], i); + top_heap_.Push(&entry); + if (top_heap_.size() > top_n) top_heap_.Pop(&entry); + } + } + while (!top_heap_.empty()) { + TopPair entry; + top_heap_.Pop(&entry); + if (top_heap_.size() > 1) { + top_n_flags_[entry.data] = TN_TOPN; + } else { + top_n_flags_[entry.data] = TN_TOP2; + if (top_heap_.empty()) + top_code_ = entry.data; + else + second_code_ = entry.data; + } + } + top_n_flags_[null_char_] = TN_TOP2; +} + +// Adds the computation for the current time-step to the beam. Call at each +// time-step in sequence from left to right. 
outputs is the activation vector +// for the current timestep. +void RecodeBeamSearch::DecodeStep(const float* outputs, int t, + double dict_ratio, double cert_offset, + double worst_dict_cert, + const UNICHARSET* charset) { + if (t == beam_.size()) beam_.push_back(new RecodeBeam); + RecodeBeam* step = beam_[t]; + beam_size_ = t + 1; + step->Clear(); + if (t == 0) { + // The first step can only use singles and initials. + ContinueContext(nullptr, BeamIndex(false, NC_ANYTHING, 0), outputs, TN_TOP2, + dict_ratio, cert_offset, worst_dict_cert, step); + if (dict_ != nullptr) { + ContinueContext(nullptr, BeamIndex(true, NC_ANYTHING, 0), outputs, + TN_TOP2, dict_ratio, cert_offset, worst_dict_cert, step); + } + } else { + RecodeBeam* prev = beam_[t - 1]; + if (charset != NULL) { + int beam_index = BeamIndex(true, NC_ANYTHING, 0); + for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) { + GenericVector path; + ExtractPath(&prev->beams_[beam_index].get(i).data, &path); + tprintf("Step %d: Dawg beam %d:\n", t, i); + DebugPath(charset, path); + } + beam_index = BeamIndex(false, NC_ANYTHING, 0); + for (int i = prev->beams_[beam_index].size() - 1; i >= 0; --i) { + GenericVector path; + ExtractPath(&prev->beams_[beam_index].get(i).data, &path); + tprintf("Step %d: Non-Dawg beam %d:\n", t, i); + DebugPath(charset, path); + } + } + int total_beam = 0; + // Work through the scores by group (top-2, top-n, the rest) while the beam + // is empty. This enables extending the context using only the top-n results + // first, which may have an empty intersection with the valid codes, so we + // fall back to the rest if the beam is empty. + for (int tn = 0; tn < TN_COUNT && total_beam == 0; ++tn) { + TopNState top_n = static_cast(tn); + for (int index = 0; index < kNumBeams; ++index) { + // Working backwards through the heaps doesn't guarantee that we see the + // best first, but it comes before a lot of the worst, so it is slightly + // more efficient than going forwards. 
+ for (int i = prev->beams_[index].size() - 1; i >= 0; --i) { + ContinueContext(&prev->beams_[index].get(i).data, index, outputs, + top_n, dict_ratio, cert_offset, worst_dict_cert, + step); + } + } + for (int index = 0; index < kNumBeams; ++index) { + if (ContinuationFromBeamsIndex(index) == NC_ANYTHING) + total_beam += step->beams_[index].size(); + } + } + // Special case for the best initial dawg. Push it on the heap if good + // enough, but there is only one, so it doesn't blow up the beam. + for (int c = 0; c < NC_COUNT; ++c) { + if (step->best_initial_dawgs_[c].code >= 0) { + int index = BeamIndex(true, static_cast(c), 0); + RecodeHeap* dawg_heap = &step->beams_[index]; + PushHeapIfBetter(kBeamWidths[0], &step->best_initial_dawgs_[c], + dawg_heap); + } + } + } +} + +// Adds to the appropriate beams the legal (according to recoder) +// continuations of context prev, which is of the given length, using the +// given network outputs to provide scores to the choices. Uses only those +// choices for which top_n_flags[index] == top_n_flag. 
+void RecodeBeamSearch::ContinueContext(const RecodeNode* prev, int index, + const float* outputs, + TopNState top_n_flag, double dict_ratio, + double cert_offset, + double worst_dict_cert, + RecodeBeam* step) { + RecodedCharID prefix; + RecodedCharID full_code; + const RecodeNode* previous = prev; + int length = LengthFromBeamsIndex(index); + bool use_dawgs = IsDawgFromBeamsIndex(index); + NodeContinuation prev_cont = ContinuationFromBeamsIndex(index); + for (int p = length - 1; p >= 0; --p, previous = previous->prev) { + while (previous != NULL && + (previous->duplicate || previous->code == null_char_)) { + previous = previous->prev; + } + prefix.Set(p, previous->code); + full_code.Set(p, previous->code); + } + if (prev != nullptr && !is_simple_text_) { + if (top_n_flags_[prev->code] == top_n_flag) { + if (prev_cont != NC_NO_DUP) { + float cert = + NetworkIO::ProbToCertainty(outputs[prev->code]) + cert_offset; + PushDupOrNoDawgIfBetter(length, true, prev->code, prev->unichar_id, + cert, worst_dict_cert, dict_ratio, use_dawgs, + NC_ANYTHING, prev, step); + } + if (prev_cont == NC_ANYTHING && top_n_flag == TN_TOP2 && + prev->code != null_char_) { + float cert = NetworkIO::ProbToCertainty(outputs[prev->code] + + outputs[null_char_]) + + cert_offset; + PushDupOrNoDawgIfBetter(length, true, prev->code, prev->unichar_id, + cert, worst_dict_cert, dict_ratio, use_dawgs, + NC_NO_DUP, prev, step); + } + } + if (prev_cont == NC_ONLY_DUP) return; + if (prev->code != null_char_ && length > 0 && + top_n_flags_[null_char_] == top_n_flag) { + // Allow nulls within multi code sequences, as the nulls within are not + // explicitly included in the code sequence. 
+ float cert = + NetworkIO::ProbToCertainty(outputs[null_char_]) + cert_offset; + PushDupOrNoDawgIfBetter(length, false, null_char_, INVALID_UNICHAR_ID, + cert, worst_dict_cert, dict_ratio, use_dawgs, + NC_ANYTHING, prev, step); + } + } + const GenericVector* final_codes = recoder_.GetFinalCodes(prefix); + if (final_codes != NULL) { + for (int i = 0; i < final_codes->size(); ++i) { + int code = (*final_codes)[i]; + if (top_n_flags_[code] != top_n_flag) continue; + if (prev != nullptr && prev->code == code && !is_simple_text_) continue; + float cert = NetworkIO::ProbToCertainty(outputs[code]) + cert_offset; + if (cert < kMinCertainty && code != null_char_) continue; + full_code.Set(length, code); + int unichar_id = recoder_.DecodeUnichar(full_code); + // Map the null char to INVALID. + if (length == 0 && code == null_char_) unichar_id = INVALID_UNICHAR_ID; + ContinueUnichar(code, unichar_id, cert, worst_dict_cert, dict_ratio, + use_dawgs, NC_ANYTHING, prev, step); + if (top_n_flag == TN_TOP2 && code != null_char_) { + float prob = outputs[code] + outputs[null_char_]; + if (prev != nullptr && prev_cont == NC_ANYTHING && + prev->code != null_char_ && + ((prev->code == top_code_ && code == second_code_) || + (code == top_code_ && prev->code == second_code_))) { + prob += outputs[prev->code]; + } + float cert = NetworkIO::ProbToCertainty(prob) + cert_offset; + ContinueUnichar(code, unichar_id, cert, worst_dict_cert, dict_ratio, + use_dawgs, NC_ONLY_DUP, prev, step); + } + } + } + const GenericVector* next_codes = recoder_.GetNextCodes(prefix); + if (next_codes != NULL) { + for (int i = 0; i < next_codes->size(); ++i) { + int code = (*next_codes)[i]; + if (top_n_flags_[code] != top_n_flag) continue; + if (prev != nullptr && prev->code == code && !is_simple_text_) continue; + float cert = NetworkIO::ProbToCertainty(outputs[code]) + cert_offset; + PushDupOrNoDawgIfBetter(length + 1, false, code, INVALID_UNICHAR_ID, cert, + worst_dict_cert, dict_ratio, use_dawgs, + 
NC_ANYTHING, prev, step); + if (top_n_flag == TN_TOP2 && code != null_char_) { + float prob = outputs[code] + outputs[null_char_]; + if (prev != nullptr && prev_cont == NC_ANYTHING && + prev->code != null_char_ && + ((prev->code == top_code_ && code == second_code_) || + (code == top_code_ && prev->code == second_code_))) { + prob += outputs[prev->code]; + } + float cert = NetworkIO::ProbToCertainty(prob) + cert_offset; + PushDupOrNoDawgIfBetter(length + 1, false, code, INVALID_UNICHAR_ID, + cert, worst_dict_cert, dict_ratio, use_dawgs, + NC_ONLY_DUP, prev, step); + } + } + } +} + +// Continues for a new unichar, using dawg or non-dawg as per flag. +void RecodeBeamSearch::ContinueUnichar(int code, int unichar_id, float cert, + float worst_dict_cert, float dict_ratio, + bool use_dawgs, NodeContinuation cont, + const RecodeNode* prev, + RecodeBeam* step) { + if (use_dawgs) { + if (cert > worst_dict_cert) { + ContinueDawg(code, unichar_id, cert, cont, prev, step); + } + } else { + RecodeHeap* nodawg_heap = &step->beams_[BeamIndex(false, cont, 0)]; + PushHeapIfBetter(kBeamWidths[0], code, unichar_id, TOP_CHOICE_PERM, false, + false, false, false, cert * dict_ratio, prev, nullptr, + nodawg_heap); + if (dict_ != nullptr && + ((unichar_id == UNICHAR_SPACE && cert > worst_dict_cert) || + !dict_->getUnicharset().IsSpaceDelimited(unichar_id))) { + // Any top choice position that can start a new word, ie a space or + // any non-space-delimited character, should also be considered + // by the dawg search, so push initial dawg to the dawg heap. + float dawg_cert = cert; + PermuterType permuter = TOP_CHOICE_PERM; + // Since we use the space either side of a dictionary word in the + // certainty of the word, (to properly handle weak spaces) and the + // space is coming from a non-dict word, we need special conditions + // to avoid degrading the certainty of the dict word that follows. 
+ // With a space we don't multiply the certainty by dict_ratio, and we + // flag the space with NO_PERM to indicate that we should not use the + // predecessor nulls to generate the confidence for the space, as they + // have already been multiplied by dict_ratio, and we can't go back to + // insert more entries in any previous heaps. + if (unichar_id == UNICHAR_SPACE) + permuter = NO_PERM; + else + dawg_cert *= dict_ratio; + PushInitialDawgIfBetter(code, unichar_id, permuter, false, false, + dawg_cert, cont, prev, step); + } + } +} + +// Adds a RecodeNode composed of the tuple (code, unichar_id, cert, prev, +// appropriate-dawg-args, cert) to the given heap (dawg_beam_) if unichar_id +// is a valid continuation of whatever is in prev. +void RecodeBeamSearch::ContinueDawg(int code, int unichar_id, float cert, + NodeContinuation cont, + const RecodeNode* prev, RecodeBeam* step) { + RecodeHeap* dawg_heap = &step->beams_[BeamIndex(true, cont, 0)]; + RecodeHeap* nodawg_heap = &step->beams_[BeamIndex(false, cont, 0)]; + if (unichar_id == INVALID_UNICHAR_ID) { + PushHeapIfBetter(kBeamWidths[0], code, unichar_id, NO_PERM, false, false, + false, false, cert, prev, nullptr, dawg_heap); + return; + } + // Avoid dictionary probe if score a total loss. + float score = cert; + if (prev != NULL) score += prev->score; + if (dawg_heap->size() >= kBeamWidths[0] && + score <= dawg_heap->PeekTop().data.score && + nodawg_heap->size() >= kBeamWidths[0] && + score <= nodawg_heap->PeekTop().data.score) { + return; + } + const RecodeNode* uni_prev = prev; + // Prev may be a partial code, null_char, or duplicate, so scan back to the + // last valid unichar_id. + while (uni_prev != NULL && + (uni_prev->unichar_id == INVALID_UNICHAR_ID || uni_prev->duplicate)) + uni_prev = uni_prev->prev; + if (unichar_id == UNICHAR_SPACE) { + if (uni_prev != NULL && uni_prev->end_of_word) { + // Space is good. Push initial state, to the dawg beam and a regular + // space to the top choice beam. 
+ PushInitialDawgIfBetter(code, unichar_id, uni_prev->permuter, false, + false, cert, cont, prev, step); + PushHeapIfBetter(kBeamWidths[0], code, unichar_id, uni_prev->permuter, + false, false, false, false, cert, prev, nullptr, + nodawg_heap); + } + return; + } else if (uni_prev != NULL && uni_prev->start_of_dawg && + uni_prev->unichar_id != UNICHAR_SPACE && + dict_->getUnicharset().IsSpaceDelimited(uni_prev->unichar_id) && + dict_->getUnicharset().IsSpaceDelimited(unichar_id)) { + return; // Can't break words between space delimited chars. + } + DawgPositionVector initial_dawgs; + DawgPositionVector* updated_dawgs = new DawgPositionVector; + DawgArgs dawg_args(&initial_dawgs, updated_dawgs, NO_PERM); + bool word_start = false; + if (uni_prev == NULL) { + // Starting from beginning of line. + dict_->default_dawgs(&initial_dawgs, false); + word_start = true; + } else if (uni_prev->dawgs != NULL) { + // Continuing a previous dict word. + dawg_args.active_dawgs = uni_prev->dawgs; + word_start = uni_prev->start_of_dawg; + } else { + return; // Can't continue if not a dict word. + } + PermuterType permuter = static_cast( + dict_->def_letter_is_okay(&dawg_args, unichar_id, false)); + if (permuter != NO_PERM) { + PushHeapIfBetter(kBeamWidths[0], code, unichar_id, permuter, false, + word_start, dawg_args.valid_end, false, cert, prev, + dawg_args.updated_dawgs, dawg_heap); + if (dawg_args.valid_end && !space_delimited_) { + // We can start another word right away, so push initial state as well, + // to the dawg beam, and the regular character to the top choice beam, + // since non-dict words can start here too. 
+ PushInitialDawgIfBetter(code, unichar_id, permuter, word_start, true, + cert, cont, prev, step); + PushHeapIfBetter(kBeamWidths[0], code, unichar_id, permuter, false, + word_start, true, false, cert, prev, NULL, nodawg_heap); + } + } else { + delete updated_dawgs; + } +} + +// Adds a RecodeNode composed of the tuple (code, unichar_id, +// initial-dawg-state, prev, cert) to the given heap if/ there is room or if +// better than the current worst element if already full. +void RecodeBeamSearch::PushInitialDawgIfBetter(int code, int unichar_id, + PermuterType permuter, + bool start, bool end, float cert, + NodeContinuation cont, + const RecodeNode* prev, + RecodeBeam* step) { + RecodeNode* best_initial_dawg = &step->best_initial_dawgs_[cont]; + float score = cert; + if (prev != NULL) score += prev->score; + if (best_initial_dawg->code < 0 || score > best_initial_dawg->score) { + DawgPositionVector* initial_dawgs = new DawgPositionVector; + dict_->default_dawgs(initial_dawgs, false); + RecodeNode node(code, unichar_id, permuter, true, start, end, false, cert, + score, prev, initial_dawgs, + ComputeCodeHash(code, false, prev)); + *best_initial_dawg = node; + } +} + +// Adds a RecodeNode composed of the tuple (code, unichar_id, permuter, +// false, false, false, false, cert, prev, NULL) to heap if there is room +// or if better than the current worst element if already full. +/* static */ +void RecodeBeamSearch::PushDupOrNoDawgIfBetter( + int length, bool dup, int code, int unichar_id, float cert, + float worst_dict_cert, float dict_ratio, bool use_dawgs, + NodeContinuation cont, const RecodeNode* prev, RecodeBeam* step) { + int index = BeamIndex(use_dawgs, cont, length); + if (use_dawgs) { + if (cert > worst_dict_cert) { + PushHeapIfBetter(kBeamWidths[length], code, unichar_id, + prev ? 
prev->permuter : NO_PERM, false, false, false, + dup, cert, prev, nullptr, &step->beams_[index]); + } + } else { + cert *= dict_ratio; + if (cert >= kMinCertainty || code == null_char_) { + PushHeapIfBetter(kBeamWidths[length], code, unichar_id, + prev ? prev->permuter : TOP_CHOICE_PERM, false, false, + false, dup, cert, prev, nullptr, &step->beams_[index]); + } + } +} + +// Adds a RecodeNode composed of the tuple (code, unichar_id, permuter, +// dawg_start, word_start, end, dup, cert, prev, d) to heap if there is room +// or if better than the current worst element if already full. +void RecodeBeamSearch::PushHeapIfBetter(int max_size, int code, int unichar_id, + PermuterType permuter, bool dawg_start, + bool word_start, bool end, bool dup, + float cert, const RecodeNode* prev, + DawgPositionVector* d, + RecodeHeap* heap) { + float score = cert; + if (prev != NULL) score += prev->score; + if (heap->size() < max_size || score > heap->PeekTop().data.score) { + uinT64 hash = ComputeCodeHash(code, dup, prev); + RecodeNode node(code, unichar_id, permuter, dawg_start, word_start, end, + dup, cert, score, prev, d, hash); + if (UpdateHeapIfMatched(&node, heap)) return; + RecodePair entry(score, node); + heap->Push(&entry); + ASSERT_HOST(entry.data.dawgs == NULL); + if (heap->size() > max_size) heap->Pop(&entry); + } else { + delete d; + } +} + +// Adds a RecodeNode to heap if there is room +// or if better than the current worst element if already full. +void RecodeBeamSearch::PushHeapIfBetter(int max_size, RecodeNode* node, + RecodeHeap* heap) { + if (heap->size() < max_size || node->score > heap->PeekTop().data.score) { + if (UpdateHeapIfMatched(node, heap)) { + return; + } + RecodePair entry(node->score, *node); + heap->Push(&entry); + ASSERT_HOST(entry.data.dawgs == NULL); + if (heap->size() > max_size) heap->Pop(&entry); + } +} + +// Searches the heap for a matching entry, and updates the score with +// reshuffle if needed. Returns true if there was a match. 
+bool RecodeBeamSearch::UpdateHeapIfMatched(RecodeNode* new_node,
+                                           RecodeHeap* heap) {
+  // TODO(rays) consider hash map instead of linear search.
+  // It might not be faster because the hash map would have to be updated
+  // every time a heap reshuffle happens, and that would be a lot of overhead.
+  GenericVector<RecodePair>* nodes = heap->heap();
+  for (int i = 0; i < nodes->size(); ++i) {
+    RecodeNode& node = (*nodes)[i].data;
+    if (node.code == new_node->code && node.code_hash == new_node->code_hash &&
+        node.permuter == new_node->permuter &&
+        node.start_of_dawg == new_node->start_of_dawg) {
+      if (new_node->score > node.score) {
+        // The new one is better. Update the entire node in the heap and
+        // reshuffle.
+        node = *new_node;
+        (*nodes)[i].key = node.score;
+        heap->Reshuffle(&(*nodes)[i]);
+      }
+      return true;
+    }
+  }
+  return false;
+}
+
+// Computes and returns the code-hash for the given code and prev.
+uinT64 RecodeBeamSearch::ComputeCodeHash(int code, bool dup,
+                                         const RecodeNode* prev) const {
+  uinT64 hash = prev == nullptr ? 0 : prev->code_hash;
+  if (!dup && code != null_char_) {
+    int num_classes = recoder_.code_range();
+    uinT64 carry = (((hash >> 32) * num_classes) >> 32);
+    hash *= num_classes;
+    hash += carry;
+    hash += code;
+  }
+  return hash;
+}
+
+// Backtracks to extract the best path through the lattice that was built
+// during Decode. On return the best_nodes vector essentially contains the set
+// of code, score pairs that make the optimal path with the constraint that
+// the recoder can decode the code sequence back to a sequence of unichar-ids.
+void RecodeBeamSearch::ExtractBestPaths(
+    GenericVector<const RecodeNode*>* best_nodes,
+    GenericVector<const RecodeNode*>* second_nodes) const {
+  // Scan both beams to extract the best and second best paths.
+  const RecodeNode* best_node = NULL;
+  const RecodeNode* second_best_node = NULL;
+  const RecodeBeam* last_beam = beam_[beam_size_ - 1];
+  for (int c = 0; c < NC_COUNT; ++c) {
+    if (c == NC_ONLY_DUP) continue;
+    NodeContinuation cont = static_cast<NodeContinuation>(c);
+    for (int is_dawg = 0; is_dawg < 2; ++is_dawg) {
+      int beam_index = BeamIndex(is_dawg, cont, 0);
+      int heap_size = last_beam->beams_[beam_index].size();
+      for (int h = 0; h < heap_size; ++h) {
+        const RecodeNode* node = &last_beam->beams_[beam_index].get(h).data;
+        if (is_dawg) {
+          // dawg_node may be a null_char, or duplicate, so scan back to the
+          // last valid unichar_id.
+          const RecodeNode* dawg_node = node;
+          while (dawg_node != NULL &&
+                 (dawg_node->unichar_id == INVALID_UNICHAR_ID ||
+                  dawg_node->duplicate))
+            dawg_node = dawg_node->prev;
+          if (dawg_node == NULL || (!dawg_node->end_of_word &&
+                                    dawg_node->unichar_id != UNICHAR_SPACE)) {
+            // Dawg node is not valid.
+            continue;
+          }
+        }
+        if (best_node == NULL || node->score > best_node->score) {
+          second_best_node = best_node;
+          best_node = node;
+        } else if (second_best_node == NULL ||
+                   node->score > second_best_node->score) {
+          second_best_node = node;
+        }
+      }
+    }
+  }
+  if (second_nodes != NULL) ExtractPath(second_best_node, second_nodes);
+  ExtractPath(best_node, best_nodes);
+}
+
+// Helper backtracks through the lattice from the given node, storing the
+// path and reversing it.
+void RecodeBeamSearch::ExtractPath(
+    const RecodeNode* node, GenericVector<const RecodeNode*>* path) const {
+  path->truncate(0);
+  while (node != NULL) {
+    path->push_back(node);
+    node = node->prev;
+  }
+  path->reverse();
+}
+
+// Helper prints debug information on the given lattice path.
+void RecodeBeamSearch::DebugPath(
+    const UNICHARSET* unicharset,
+    const GenericVector<const RecodeNode*>& path) const {
+  for (int c = 0; c < path.size(); ++c) {
+    const RecodeNode& node = *path[c];
+    tprintf("%d ", c);
+    node.Print(null_char_, *unicharset, 1);
+  }
+}
+
+// Helper prints debug information on the given unichar path.
+void RecodeBeamSearch::DebugUnicharPath(
+    const UNICHARSET* unicharset, const GenericVector<const RecodeNode*>& path,
+    const GenericVector<int>& unichar_ids, const GenericVector<float>& certs,
+    const GenericVector<float>& ratings,
+    const GenericVector<int>& xcoords) const {
+  int num_ids = unichar_ids.size();
+  double total_rating = 0.0;
+  for (int c = 0; c < num_ids; ++c) {
+    int coord = xcoords[c];
+    tprintf("%d %d=%s r=%g, c=%g, s=%d, e=%d, perm=%d\n", coord, unichar_ids[c],
+            unicharset->debug_str(unichar_ids[c]).string(), ratings[c],
+            certs[c], path[coord]->start_of_word, path[coord]->end_of_word,
+            path[coord]->permuter);
+    total_rating += ratings[c];
+  }
+  tprintf("Path total rating = %g\n", total_rating);
+}
+
+}  // namespace tesseract.
diff --git a/lstm/recodebeam.h b/lstm/recodebeam.h
new file mode 100644
index 00000000..21959145
--- /dev/null
+++ b/lstm/recodebeam.h
@@ -0,0 +1,392 @@
+///////////////////////////////////////////////////////////////////////
+// File: recodebeam.h
+// Description: Beam search to decode from the re-encoded CJK as a sequence of
+// smaller numbers in place of a single large code.
+// Author: Ray Smith
+// Created: Fri Mar 13 09:12:01 PDT 2015
+//
+// (C) Copyright 2015, Google Inc.
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+// http://www.apache.org/licenses/LICENSE-2.0
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#ifndef THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_ +#define THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_ + +#include "dawg.h" +#include "dict.h" +#include "genericheap.h" +#include "kdpair.h" +#include "networkio.h" +#include "ratngs.h" +#include "unicharcompress.h" + +namespace tesseract { + +// Enum describing what can follow the current node. +// Consider the following softmax outputs: +// Timestep 0 1 2 3 4 5 6 7 8 +// X-score 0.01 0.55 0.98 0.42 0.01 0.01 0.40 0.95 0.01 +// Y-score 0.00 0.01 0.01 0.01 0.01 0.97 0.59 0.04 0.01 +// Null-score 0.99 0.44 0.01 0.57 0.98 0.02 0.01 0.01 0.98 +// Then the correct CTC decoding (in which adjacent equal classes are folded, +// and then all nulls are dropped) is clearly XYX, but simple decoding (taking +// the max at each timestep) leads to: +// Null@0.99 X@0.55 X@0.98 Null@0.57 Null@0.98 Y@0.97 Y@0.59 X@0.95 Null@0.98, +// which folds to the correct XYX. The conversion to Tesseract rating and +// certainty uses the sum of the log probs (log of the product of probabilities) +// for the Rating and the minimum log prob for the certainty, but that yields a +// minimum certainty of log(0.55), which is poor for such an obvious case. +// CTC says that the probability of the result is the SUM of the products of the +// probabilities over ALL PATHS that decode to the same result, which includes: +// NXXNNYYXN, NNXNNYYN, NXXXNYYXN, NNXXNYXXN, and others including XXXXXYYXX. +// That is intractable, so some compromise between simple and ideal is needed. +// Observing that evenly split timesteps rarely happen next to each other, we +// allow scores at a transition between classes to be added for decoding thus: +// N@0.99 (N+X)@0.99 X@0.98 (N+X)@0.99 N@0.98 Y@0.97 (X+Y+N)@1.00 X@0.95 N@0.98. 
+// This works because NNX and NXX both decode to X, so in the middle we can use +// N+X. Note that the classes either side of a sum must stand alone, i.e. use a +// single score, to force all paths to pass through them and decode to the same +// result. Also in the special case of a transition from X to Y, with only one +// timestep between, it is possible to add X+Y+N, since XXY, XYY, and XNY all +// decode to XY. +// An important condition is that we cannot combine X and Null between two +// stand-alone Xs, since that can decode as XNX->XX or XXX->X, so the scores for +// X and Null have to go in separate paths. Combining scores in this way +// provides a much better minimum certainty of log(0.95). +// In the implementation of the beam search, we have to place the possibilities +// X, X+N and X+Y+N in the beam under appropriate conditions of the previous +// node, and constrain what can follow, to enforce the rules explained above. +// We therefore have 3 different types of node determined by what can follow: +enum NodeContinuation { + NC_ANYTHING, // This node used just its own score, so anything can follow. + NC_ONLY_DUP, // The current node combined another score with the score for + // itself, without a stand-alone duplicate before, so must be + // followed by a stand-alone duplicate. + NC_NO_DUP, // The current node combined another score with the score for + // itself, after a stand-alone, so can only be followed by + // something other than a duplicate of the current node. + NC_COUNT +}; + +// Enum describing the top-n status of a code. +enum TopNState { + TN_TOP2, // Winner or 2nd. + TN_TOPN, // Runner up in top-n, but not 1st or 2nd. + TN_ALSO_RAN, // Not in the top-n. + TN_COUNT +}; + +// Lattice element for Re-encode beam search. 
+struct RecodeNode { + RecodeNode() + : code(-1), + unichar_id(INVALID_UNICHAR_ID), + permuter(TOP_CHOICE_PERM), + start_of_dawg(false), + start_of_word(false), + end_of_word(false), + duplicate(false), + certainty(0.0f), + score(0.0f), + prev(NULL), + dawgs(NULL), + code_hash(0) {} + RecodeNode(int c, int uni_id, PermuterType perm, bool dawg_start, + bool word_start, bool end, bool dup, float cert, float s, + const RecodeNode* p, DawgPositionVector* d, uinT64 hash) + : code(c), + unichar_id(uni_id), + permuter(perm), + start_of_dawg(dawg_start), + start_of_word(word_start), + end_of_word(end), + duplicate(dup), + certainty(cert), + score(s), + prev(p), + dawgs(d), + code_hash(hash) {} + // NOTE: If we could use C++11, then this would be a move constructor. + // Instead we have copy constructor that does a move!! This is because we + // don't want to copy the whole DawgPositionVector each time, and true + // copying isn't necessary for this struct. It does get moved around a lot + // though inside the heap and during heap push, hence the move semantics. + RecodeNode(RecodeNode& src) : dawgs(NULL) { + *this = src; + ASSERT_HOST(src.dawgs == NULL); + } + RecodeNode& operator=(RecodeNode& src) { + delete dawgs; + memcpy(this, &src, sizeof(src)); + src.dawgs = NULL; + return *this; + } + ~RecodeNode() { delete dawgs; } + // Prints details of the node. + void Print(int null_char, const UNICHARSET& unicharset, int depth) const; + + // The re-encoded code here = index to network output. + int code; + // The decoded unichar_id is only valid for the final code of a sequence. + int unichar_id; + // The type of permuter active at this point. Intervals between start_of_word + // and end_of_word make valid words of type given by permuter where + // end_of_word is true. These aren't necessarily delimited by spaces. + PermuterType permuter; + // True if this is the initial dawg state. May be attached to a space or, + // in a non-space-delimited lang, the end of the previous word. 
+  bool start_of_dawg;
+  // True if this is the first node in a dictionary word.
+  bool start_of_word;
+  // True if this represents a valid candidate end of word position. Does not
+  // necessarily mark the end of a word, since a word can be extended beyond a
+  // candidate end by a continuation, eg 'the' continues to 'these'.
+  bool end_of_word;
+  // True if this->code is a duplicate of prev->code. Some training modes
+  // allow the network to output duplicate characters and crush them with CTC,
+  // but that would mess up the dictionary search, so we just smash them
+  // together on the fly using the duplicate flag.
+  bool duplicate;
+  // Certainty (log prob) of (just) this position.
+  float certainty;
+  // Total certainty of the path to this position.
+  float score;
+  // The previous node in this chain. Borrowed pointer.
+  const RecodeNode* prev;
+  // The currently active dawgs at this position. Owned pointer.
+  DawgPositionVector* dawgs;
+  // A hash of all codes in the prefix and this->code as well. Used for
+  // duplicate path removal.
+  uinT64 code_hash;
+};
+
+typedef KDPairInc<double, RecodeNode> RecodePair;
+typedef GenericHeap<RecodePair> RecodeHeap;
+
+// Class that holds the entire beam search for recognition of a text line.
+class RecodeBeamSearch {
+ public:
+  // Borrows the pointer, which is expected to survive until *this is deleted.
+  RecodeBeamSearch(const UnicharCompress& recoder, int null_char,
+                   bool simple_text, Dict* dict);
+
+  // Decodes the set of network outputs, storing the lattice internally.
+  // If charset is not null, it enables detailed debugging of the beam search.
+  void Decode(const NetworkIO& output, double dict_ratio, double cert_offset,
+              double worst_dict_cert, const UNICHARSET* charset);
+  void Decode(const GENERIC_2D_ARRAY<float>& output, double dict_ratio,
+              double cert_offset, double worst_dict_cert,
+              const UNICHARSET* charset);
+
+  // Returns the best path as labels/scores/xcoords similar to simple CTC.
+  void ExtractBestPathAsLabels(GenericVector<int>* labels,
+                               GenericVector<int>* xcoords) const;
+  // Returns the best path as unichar-ids/certs/ratings/xcoords skipping
+  // duplicates, nulls and intermediate parts.
+  void ExtractBestPathAsUnicharIds(bool debug, const UNICHARSET* unicharset,
+                                   GenericVector<int>* unichar_ids,
+                                   GenericVector<float>* certs,
+                                   GenericVector<float>* ratings,
+                                   GenericVector<int>* xcoords) const;
+
+  // Returns the best path as a set of WERD_RES.
+  void ExtractBestPathAsWords(const TBOX& line_box, float scale_factor,
+                              bool debug, const UNICHARSET* unicharset,
+                              PointerVector<WERD_RES>* words);
+
+  // Generates debug output of the content of the beams after a Decode.
+  void DebugBeams(const UNICHARSET& unicharset) const;
+
+  // Clipping value for certainty inside Tesseract. Reflects the minimum value
+  // of certainty that will be returned by ExtractBestPathAsUnicharIds.
+  // Supposedly on a uniform scale that can be compared across languages and
+  // engines.
+  static const float kMinCertainty;
+  // Number of different code lengths for which we have a separate beam.
+  static const int kNumLengths = RecodedCharID::kMaxCodeLen + 1;
+  // Total number of beams: dawg/nodawg * number of NodeContinuation * number
+  // of different lengths.
+  static const int kNumBeams = 2 * NC_COUNT * kNumLengths;
+  // Returns the relevant factor in the beams_ index.
+  static int LengthFromBeamsIndex(int index) { return index % kNumLengths; }
+  static NodeContinuation ContinuationFromBeamsIndex(int index) {
+    return static_cast<NodeContinuation>((index / kNumLengths) % NC_COUNT);
+  }
+  static bool IsDawgFromBeamsIndex(int index) {
+    return index / (kNumLengths * NC_COUNT) > 0;
+  }
+  // Computes a beams_ index from the given factors.
+  static int BeamIndex(bool is_dawg, NodeContinuation cont, int length) {
+    return (is_dawg * NC_COUNT + cont) * kNumLengths + length;
+  }
+
+ private:
+  // Struct for the Re-encode beam search. This struct holds the data for
+  // a single time-step position of the output. Use a PointerVector<RecodeBeam>
+  // to hold all the timesteps and prevent reallocation of the individual heaps.
+  struct RecodeBeam {
+    // Resets to the initial state without deleting all the memory.
+    void Clear() {
+      for (int i = 0; i < kNumBeams; ++i) {
+        beams_[i].clear();
+      }
+      RecodeNode empty;
+      for (int i = 0; i < NC_COUNT; ++i) {
+        best_initial_dawgs_[i] = empty;
+      }
+    }
+
+    // A separate beam for each combination of code length,
+    // NodeContinuation, and dictionary flag. Separating out all these types
+    // allows the beam to be quite narrow, and yet still have a low chance of
+    // losing the best path.
+    // We have to keep all these beams separate, since the highest scoring paths
+    // come from the paths that are most likely to dead-end at any time, like
+    // dawg paths, NC_ONLY_DUP etc.
+    // Each heap is stored with the WORST result at the top, so we can quickly
+    // get the top-n values.
+    RecodeHeap beams_[kNumBeams];
+    // While the language model is only a single word dictionary, we can use
+    // word starts as a choke point in the beam, and keep only a single dict
+    // start node at each step (for each NodeContinuation type), so we find the
+    // best one here and push it on the heap, if it qualifies, after processing
+    // all of the step.
+    RecodeNode best_initial_dawgs_[NC_COUNT];
+  };
+  typedef KDPairInc<float, int> TopPair;
+
+  // Generates debug output of the content of a single beam position.
+  void DebugBeamPos(const UNICHARSET& unicharset, const RecodeHeap& heap) const;
+
+  // Returns the given best_nodes as unichar-ids/certs/ratings/xcoords skipping
+  // duplicates, nulls and intermediate parts.
+  static void ExtractPathAsUnicharIds(
+      const GenericVector<const RecodeNode*>& best_nodes,
+      GenericVector<int>* unichar_ids, GenericVector<float>* certs,
+      GenericVector<float>* ratings, GenericVector<int>* xcoords);
+
+  // Sets up a word with the ratings matrix and fake blobs with boxes in the
+  // right places.
+  WERD_RES* InitializeWord(bool leading_space, const TBOX& line_box,
+                           int word_start, int word_end, float space_certainty,
+                           const UNICHARSET* unicharset,
+                           const GenericVector<int>& xcoords,
+                           float scale_factor);
+
+  // Fills top_n_flags_ with bools that are true iff the corresponding output
+  // is one of the top_n.
+  void ComputeTopN(const float* outputs, int num_outputs, int top_n);
+
+  // Adds the computation for the current time-step to the beam. Call at each
+  // time-step in sequence from left to right. outputs is the activation vector
+  // for the current timestep.
+  void DecodeStep(const float* outputs, int t, double dict_ratio,
+                  double cert_offset, double worst_dict_cert,
+                  const UNICHARSET* charset);
+
+  // Adds to the appropriate beams the legal (according to recoder)
+  // continuations of context prev, which is from the given index to beams_,
+  // using the given network outputs to provide scores to the choices. Uses only
+  // those choices for which top_n_flags[code] == top_n_flag.
+  void ContinueContext(const RecodeNode* prev, int index, const float* outputs,
+                       TopNState top_n_flag, double dict_ratio,
+                       double cert_offset, double worst_dict_cert,
+                       RecodeBeam* step);
+  // Continues for a new unichar, using dawg or non-dawg as per flag.
+  void ContinueUnichar(int code, int unichar_id, float cert,
+                       float worst_dict_cert, float dict_ratio, bool use_dawgs,
+                       NodeContinuation cont, const RecodeNode* prev,
+                       RecodeBeam* step);
+  // Adds a RecodeNode composed of the args to the correct heap in step if
+  // unichar_id is a valid dictionary continuation of whatever is in prev.
+  void ContinueDawg(int code, int unichar_id, float cert, NodeContinuation cont,
+                    const RecodeNode* prev, RecodeBeam* step);
+  // Sets the correct best_initial_dawgs_ with a RecodeNode composed of the args
+  // if better than what is already there.
+ void PushInitialDawgIfBetter(int code, int unichar_id, PermuterType permuter, + bool start, bool end, float cert, + NodeContinuation cont, const RecodeNode* prev, + RecodeBeam* step); + // Adds a RecodeNode composed of the args to the correct heap in step for + // partial unichar or duplicate if there is room or if better than the + // current worst element if already full. + void PushDupOrNoDawgIfBetter(int length, bool dup, int code, int unichar_id, + float cert, float worst_dict_cert, + float dict_ratio, bool use_dawgs, + NodeContinuation cont, const RecodeNode* prev, + RecodeBeam* step); + // Adds a RecodeNode composed of the args to the correct heap in step if there + // is room or if better than the current worst element if already full. + void PushHeapIfBetter(int max_size, int code, int unichar_id, + PermuterType permuter, bool dawg_start, bool word_start, + bool end, bool dup, float cert, const RecodeNode* prev, + DawgPositionVector* d, RecodeHeap* heap); + // Adds a RecodeNode to heap if there is room + // or if better than the current worst element if already full. + void PushHeapIfBetter(int max_size, RecodeNode* node, RecodeHeap* heap); + // Searches the heap for an entry matching new_node, and updates the entry + // with reshuffle if needed. Returns true if there was a match. + bool UpdateHeapIfMatched(RecodeNode* new_node, RecodeHeap* heap); + // Computes and returns the code-hash for the given code and prev. + uinT64 ComputeCodeHash(int code, bool dup, const RecodeNode* prev) const; + // Backtracks to extract the best path through the lattice that was built + // during Decode. On return the best_nodes vector essentially contains the set + // of code, score pairs that make the optimal path with the constraint that + // the recoder can decode the code sequence back to a sequence of unichar-ids. 
+  void ExtractBestPaths(GenericVector<const RecodeNode*>* best_nodes,
+                        GenericVector<const RecodeNode*>* second_nodes) const;
+  // Helper backtracks through the lattice from the given node, storing the
+  // path and reversing it.
+  void ExtractPath(const RecodeNode* node,
+                   GenericVector<const RecodeNode*>* path) const;
+  // Helper prints debug information on the given lattice path.
+  void DebugPath(const UNICHARSET* unicharset,
+                 const GenericVector<const RecodeNode*>& path) const;
+  // Helper prints debug information on the given unichar path.
+  void DebugUnicharPath(const UNICHARSET* unicharset,
+                        const GenericVector<const RecodeNode*>& path,
+                        const GenericVector<int>& unichar_ids,
+                        const GenericVector<float>& certs,
+                        const GenericVector<float>& ratings,
+                        const GenericVector<int>& xcoords) const;
+
+  static const int kBeamWidths[RecodedCharID::kMaxCodeLen + 1];
+
+  // The encoder/decoder that we will be using.
+  const UnicharCompress& recoder_;
+  // The beam for each timestep in the output.
+  PointerVector<RecodeBeam> beam_;
+  // The number of timesteps valid in beam_;
+  int beam_size_;
+  // A flag to indicate which outputs are the top-n choices. Current timestep
+  // only.
+  GenericVector<TopNState> top_n_flags_;
+  // A record of the highest and second scoring codes.
+  int top_code_;
+  int second_code_;
+  // Heap used to compute the top_n_flags_.
+  GenericHeap<TopPair> top_heap_;
+  // Borrowed pointer to the dictionary to use in the search.
+  Dict* dict_;
+  // True if the language is space-delimited, which is true for most languages
+  // except chi*, jpn, tha.
+  bool space_delimited_;
+  // True if the input is simple text, ie adjacent equal chars are not to be
+  // eliminated.
+  bool is_simple_text_;
+  // The encoded (class label) of the null/reject character.
+  int null_char_;
+};
+
+}  // namespace tesseract.
+ +#endif // THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_ diff --git a/lstm/reconfig.cpp b/lstm/reconfig.cpp new file mode 100644 index 00000000..aa5e01b9 --- /dev/null +++ b/lstm/reconfig.cpp @@ -0,0 +1,128 @@ +/////////////////////////////////////////////////////////////////////// +// File: reconfig.cpp +// Description: Network layer that reconfigures the scaling vs feature +// depth. +// Author: Ray Smith +// Created: Wed Feb 26 15:42:25 PST 2014 +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// +#include "reconfig.h" +#include "tprintf.h" + +namespace tesseract { + +Reconfig::Reconfig(const STRING& name, int ni, int x_scale, int y_scale) + : Network(NT_RECONFIG, name, ni, ni * x_scale * y_scale), + x_scale_(x_scale), y_scale_(y_scale) { +} + +Reconfig::~Reconfig() { +} + +// Returns the shape output from the network given an input shape (which may +// be partially unknown ie zero). +StaticShape Reconfig::OutputShape(const StaticShape& input_shape) const { + StaticShape result = input_shape; + result.set_height(result.height() / y_scale_); + result.set_width(result.width() / x_scale_); + if (type_ != NT_MAXPOOL) + result.set_depth(result.depth() * y_scale_ * x_scale_); + return result; +} + +// Returns an integer reduction factor that the network applies to the +// time sequence. Assumes that any 2-d is already eliminated. 
Used for +// scaling bounding boxes of truth data. +// WARNING: if GlobalMinimax is used to vary the scale, this will return +// the last used scale factor. Call it before any forward, and it will return +// the minimum scale factor of the paths through the GlobalMinimax. +int Reconfig::XScaleFactor() const { + return x_scale_; +} + +// Writes to the given file. Returns false in case of error. +bool Reconfig::Serialize(TFile* fp) const { + if (!Network::Serialize(fp)) return false; + if (fp->FWrite(&x_scale_, sizeof(x_scale_), 1) != 1) return false; + if (fp->FWrite(&y_scale_, sizeof(y_scale_), 1) != 1) return false; + return true; +} + +// Reads from the given file. Returns false in case of error. +// If swap is true, assumes a big/little-endian swap is needed. +bool Reconfig::DeSerialize(bool swap, TFile* fp) { + if (fp->FRead(&x_scale_, sizeof(x_scale_), 1) != 1) return false; + if (fp->FRead(&y_scale_, sizeof(y_scale_), 1) != 1) return false; + if (swap) { + ReverseN(&x_scale_, sizeof(x_scale_)); + ReverseN(&y_scale_, sizeof(y_scale_)); + } + no_ = ni_ * x_scale_ * y_scale_; + return true; +} + +// Runs forward propagation of activations on the input line. +// See NetworkCpp for a detailed discussion of the arguments. +void Reconfig::Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output) { + output->ResizeScaled(input, x_scale_, y_scale_, no_); + back_map_ = input.stride_map(); + StrideMap::Index dest_index(output->stride_map()); + do { + int out_t = dest_index.t(); + StrideMap::Index src_index(input.stride_map(), dest_index.index(FD_BATCH), + dest_index.index(FD_HEIGHT) * y_scale_, + dest_index.index(FD_WIDTH) * x_scale_); + // Stack x_scale_ groups of y_scale_ inputs together. 
+ for (int x = 0; x < x_scale_; ++x) { + for (int y = 0; y < y_scale_; ++y) { + StrideMap::Index src_xy(src_index); + if (src_xy.AddOffset(x, FD_WIDTH) && src_xy.AddOffset(y, FD_HEIGHT)) { + output->CopyTimeStepGeneral(out_t, (x * y_scale_ + y) * ni_, ni_, + input, src_xy.t(), 0); + } + } + } + } while (dest_index.Increment()); +} + +// Runs backward propagation of errors on the deltas line. +// See NetworkCpp for a detailed discussion of the arguments. +bool Reconfig::Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas) { + back_deltas->ResizeToMap(fwd_deltas.int_mode(), back_map_, ni_); + StrideMap::Index src_index(fwd_deltas.stride_map()); + do { + int in_t = src_index.t(); + StrideMap::Index dest_index(back_deltas->stride_map(), + src_index.index(FD_BATCH), + src_index.index(FD_HEIGHT) * y_scale_, + src_index.index(FD_WIDTH) * x_scale_); + // Unstack x_scale_ groups of y_scale_ inputs that are together. + for (int x = 0; x < x_scale_; ++x) { + for (int y = 0; y < y_scale_; ++y) { + StrideMap::Index dest_xy(dest_index); + if (dest_xy.AddOffset(x, FD_WIDTH) && dest_xy.AddOffset(y, FD_HEIGHT)) { + back_deltas->CopyTimeStepGeneral(dest_xy.t(), 0, ni_, fwd_deltas, + in_t, (x * y_scale_ + y) * ni_); + } + } + } + } while (src_index.Increment()); + return needs_to_backprop_; +} + + +} // namespace tesseract. diff --git a/lstm/reconfig.h b/lstm/reconfig.h new file mode 100644 index 00000000..4409cf0a --- /dev/null +++ b/lstm/reconfig.h @@ -0,0 +1,86 @@ +/////////////////////////////////////////////////////////////////////// +// File: reconfig.h +// Description: Network layer that reconfigures the scaling vs feature +// depth. +// Author: Ray Smith +// Created: Wed Feb 26 15:37:42 PST 2014 +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// +#ifndef TESSERACT_LSTM_RECONFIG_H_ +#define TESSERACT_LSTM_RECONFIG_H_ + + +#include "genericvector.h" +#include "matrix.h" +#include "network.h" + +namespace tesseract { + +// Reconfigures (Shrinks) the inputs by concatenating an x_scale by y_scale tile +// of inputs together, producing a single, deeper output per tile. +// Note that fractional parts are truncated for efficiency, so make sure the +// input stride is a multiple of the y_scale factor! +class Reconfig : public Network { + public: + Reconfig(const STRING& name, int ni, int x_scale, int y_scale); + virtual ~Reconfig(); + + // Returns the shape output from the network given an input shape (which may + // be partially unknown ie zero). + virtual StaticShape OutputShape(const StaticShape& input_shape) const; + + virtual STRING spec() const { + STRING spec; + spec.add_str_int("S", y_scale_); + spec.add_str_int(",", x_scale_); + return spec; + } + + // Returns an integer reduction factor that the network applies to the + // time sequence. Assumes that any 2-d is already eliminated. Used for + // scaling bounding boxes of truth data. + // WARNING: if GlobalMinimax is used to vary the scale, this will return + // the last used scale factor. Call it before any forward, and it will return + // the minimum scale factor of the paths through the GlobalMinimax. + virtual int XScaleFactor() const; + + // Writes to the given file. Returns false in case of error. + virtual bool Serialize(TFile* fp) const; + // Reads from the given file. 
Returns false in case of error. + // If swap is true, assumes a big/little-endian swap is needed. + virtual bool DeSerialize(bool swap, TFile* fp); + + // Runs forward propagation of activations on the input line. + // See Network for a detailed discussion of the arguments. + virtual void Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output); + + // Runs backward propagation of errors on the deltas line. + // See Network for a detailed discussion of the arguments. + virtual bool Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas); + + protected: + // Non-serialized data used to store parameters between forward and back. + StrideMap back_map_; + // Serialized data. + inT32 x_scale_; + inT32 y_scale_; +}; + +} // namespace tesseract. + + +#endif // TESSERACT_LSTM_SUBSAMPLE_H_ diff --git a/lstm/reversed.cpp b/lstm/reversed.cpp new file mode 100644 index 00000000..9cdc4f96 --- /dev/null +++ b/lstm/reversed.cpp @@ -0,0 +1,91 @@ +/////////////////////////////////////////////////////////////////////// +// File: reversed.cpp +// Description: Runs a single network on time-reversed input, reversing output. +// Author: Ray Smith +// Created: Thu May 02 08:42:06 PST 2013 +// +// (C) Copyright 2013, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+/////////////////////////////////////////////////////////////////////// + +#include "reversed.h" + +#include + +#include "networkscratch.h" + +namespace tesseract { + +Reversed::Reversed(const STRING& name, NetworkType type) : Plumbing(name) { + type_ = type; +} +Reversed::~Reversed() { +} + +// Returns the shape output from the network given an input shape (which may +// be partially unknown ie zero). +StaticShape Reversed::OutputShape(const StaticShape& input_shape) const { + if (type_ == NT_XYTRANSPOSE) { + StaticShape x_shape(input_shape); + x_shape.set_width(input_shape.height()); + x_shape.set_height(input_shape.width()); + x_shape = stack_[0]->OutputShape(x_shape); + x_shape.SetShape(x_shape.batch(), x_shape.width(), x_shape.height(), + x_shape.depth()); + return x_shape; + } + return stack_[0]->OutputShape(input_shape); +} + +// Takes ownership of the given network to make it the reversed one. +void Reversed::SetNetwork(Network* network) { + stack_.clear(); + AddToStack(network); +} + +// Runs forward propagation of activations on the input line. +// See NetworkCpp for a detailed discussion of the arguments. +void Reversed::Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output) { + NetworkScratch::IO rev_input(input, scratch); + ReverseData(input, rev_input); + NetworkScratch::IO rev_output(input, scratch); + stack_[0]->Forward(debug, *rev_input, NULL, scratch, rev_output); + ReverseData(*rev_output, output); +} + +// Runs backward propagation of errors on the deltas line. +// See NetworkCpp for a detailed discussion of the arguments. 
+bool Reversed::Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas) { + NetworkScratch::IO rev_input(fwd_deltas, scratch); + ReverseData(fwd_deltas, rev_input); + NetworkScratch::IO rev_output(fwd_deltas, scratch); + if (stack_[0]->Backward(debug, *rev_input, scratch, rev_output)) { + ReverseData(*rev_output, back_deltas); + return true; + } + return false; +} + +// Copies src to *dest with the reversal according to type_. +void Reversed::ReverseData(const NetworkIO& src, NetworkIO* dest) const { + if (type_ == NT_XREVERSED) + dest->CopyWithXReversal(src); + else if (type_ == NT_YREVERSED) + dest->CopyWithYReversal(src); + else + dest->CopyWithXYTranspose(src); +} + +} // namespace tesseract. diff --git a/lstm/reversed.h b/lstm/reversed.h new file mode 100644 index 00000000..97c2aebb --- /dev/null +++ b/lstm/reversed.h @@ -0,0 +1,89 @@ +/////////////////////////////////////////////////////////////////////// +// File: reversed.h +// Description: Runs a single network on time-reversed input, reversing output. +// Author: Ray Smith +// Created: Thu May 02 08:38:06 PST 2013 +// +// (C) Copyright 2013, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_LSTM_REVERSED_H_ +#define TESSERACT_LSTM_REVERSED_H_ + +#include "matrix.h" +#include "plumbing.h" + +namespace tesseract { + +// C++ Implementation of the Reversed class from lstm.py. +class Reversed : public Plumbing { + public: + explicit Reversed(const STRING& name, NetworkType type); + virtual ~Reversed(); + + // Returns the shape output from the network given an input shape (which may + // be partially unknown ie zero). + virtual StaticShape OutputShape(const StaticShape& input_shape) const; + + virtual STRING spec() const { + STRING spec(type_ == NT_XREVERSED ? "Rx" + : (type_ == NT_YREVERSED ? "Ry" : "Txy")); + // For most simple cases, we will output Rx or Ry where is + // the network in stack_[0], but in the special case that is an + // LSTM, we will just output the LSTM's spec modified to take the reversal + // into account. This is because when the user specified Lfy64, we actually + // generated TxyLfx64, and if the user specified Lrx64 we actually + // generated RxLfx64, and we want to display what the user asked for. + STRING net_spec = stack_[0]->spec(); + if (net_spec[0] == 'L') { + // Setup a from and to character according to the type of the reversal + // such that the LSTM spec gets modified to the spec that the user + // asked for + char from = 'f'; + char to = 'r'; + if (type_ == NT_XYTRANSPOSE) { + from = 'x'; + to = 'y'; + } + // Change the from char to the to char. + for (int i = 0; i < net_spec.length(); ++i) { + if (net_spec[i] == from) net_spec[i] = to; + } + return net_spec; + } + spec += net_spec; + return spec; + } + + // Takes ownership of the given network to make it the reversed one. + void SetNetwork(Network* network); + + // Runs forward propagation of activations on the input line. + // See Network for a detailed discussion of the arguments. 
+ virtual void Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output); + + // Runs backward propagation of errors on the deltas line. + // See Network for a detailed discussion of the arguments. + virtual bool Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas); + + private: + // Copies src to *dest with the reversal according to type_. + void ReverseData(const NetworkIO& src, NetworkIO* dest) const; +}; + +} // namespace tesseract. + +#endif // TESSERACT_LSTM_REVERSED_H_ diff --git a/lstm/series.cpp b/lstm/series.cpp new file mode 100644 index 00000000..83d26cbf --- /dev/null +++ b/lstm/series.cpp @@ -0,0 +1,188 @@ +/////////////////////////////////////////////////////////////////////// +// File: series.cpp +// Description: Runs networks in series on the same input. +// Author: Ray Smith +// Created: Thu May 02 08:26:06 PST 2013 +// +// (C) Copyright 2013, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "series.h" + +#include "fullyconnected.h" +#include "networkscratch.h" +#include "scrollview.h" +#include "tprintf.h" + +namespace tesseract { + +// ni_ and no_ will be set by AddToStack. 
+Series::Series(const STRING& name) : Plumbing(name) { + type_ = NT_SERIES; +} + +Series::~Series() { +} + +// Returns the shape output from the network given an input shape (which may +// be partially unknown ie zero). +StaticShape Series::OutputShape(const StaticShape& input_shape) const { + StaticShape result(input_shape); + int stack_size = stack_.size(); + for (int i = 0; i < stack_size; ++i) { + result = stack_[i]->OutputShape(result); + } + return result; +} + +// Sets up the network for training. Initializes weights using weights of +// scale `range` picked according to the random number generator `randomizer`. +// Note that series has its own implementation just for debug purposes. +int Series::InitWeights(float range, TRand* randomizer) { + num_weights_ = 0; + tprintf("Num outputs,weights in serial:\n"); + for (int i = 0; i < stack_.size(); ++i) { + int weights = stack_[i]->InitWeights(range, randomizer); + tprintf(" %s:%d, %d\n", + stack_[i]->spec().string(), stack_[i]->NumOutputs(), weights); + num_weights_ += weights; + } + tprintf("Total weights = %d\n", num_weights_); + return num_weights_; +} + +// Sets needs_to_backprop_ to needs_backprop and returns true if +// needs_backprop || any weights in this network so the next layer forward +// can be told to produce backprop for this layer if needed. +bool Series::SetupNeedsBackprop(bool needs_backprop) { + needs_to_backprop_ = needs_backprop; + for (int i = 0; i < stack_.size(); ++i) + needs_backprop = stack_[i]->SetupNeedsBackprop(needs_backprop); + return needs_backprop; +} + +// Returns an integer reduction factor that the network applies to the +// time sequence. Assumes that any 2-d is already eliminated. Used for +// scaling bounding boxes of truth data. +// WARNING: if GlobalMinimax is used to vary the scale, this will return +// the last used scale factor. Call it before any forward, and it will return +// the minimum scale factor of the paths through the GlobalMinimax. 
+int Series::XScaleFactor() const { + int factor = 1; + for (int i = 0; i < stack_.size(); ++i) + factor *= stack_[i]->XScaleFactor(); + return factor; +} + +// Provides the (minimum) x scale factor to the network (of interest only to +// input units) so they can determine how to scale bounding boxes. +void Series::CacheXScaleFactor(int factor) { + stack_[0]->CacheXScaleFactor(factor); +} + +// Runs forward propagation of activations on the input line. +// See NetworkCpp for a detailed discussion of the arguments. +void Series::Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output) { + int stack_size = stack_.size(); + ASSERT_HOST(stack_size > 1); + // Revolving intermediate buffers. + NetworkScratch::IO buffer1(input, scratch); + NetworkScratch::IO buffer2(input, scratch); + // Run each network in turn, giving the output of n as the input to n + 1, + // with the final network providing the real output. + stack_[0]->Forward(debug, input, input_transpose, scratch, buffer1); + for (int i = 1; i < stack_size; i += 2) { + stack_[i]->Forward(debug, *buffer1, NULL, scratch, + i + 1 < stack_size ? buffer2 : output); + if (i + 1 == stack_size) return; + stack_[i + 1]->Forward(debug, *buffer2, NULL, scratch, + i + 2 < stack_size ? buffer1 : output); + } +} + +// Runs backward propagation of errors on the deltas line. +// See NetworkCpp for a detailed discussion of the arguments. +bool Series::Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas) { + if (!IsTraining()) return false; + int stack_size = stack_.size(); + ASSERT_HOST(stack_size > 1); + // Revolving intermediate buffers. + NetworkScratch::IO buffer1(fwd_deltas, scratch); + NetworkScratch::IO buffer2(fwd_deltas, scratch); + // Run each network in reverse order, giving the back_deltas output of n as + // the fwd_deltas input to n-1, with the 0 network providing the real output. 
+ if (!stack_.back()->IsTraining() || + !stack_.back()->Backward(debug, fwd_deltas, scratch, buffer1)) + return false; + for (int i = stack_size - 2; i >= 0; i -= 2) { + if (!stack_[i]->IsTraining() || + !stack_[i]->Backward(debug, *buffer1, scratch, + i > 0 ? buffer2 : back_deltas)) + return false; + if (i == 0) return needs_to_backprop_; + if (!stack_[i - 1]->IsTraining() || + !stack_[i - 1]->Backward(debug, *buffer2, scratch, + i > 1 ? buffer1 : back_deltas)) + return false; + } + return needs_to_backprop_; +} + +// Splits the series after the given index, returning the two parts and +// deletes itself. The first part, up to network with index last_start, goes +// into start, and the rest goes into end. +void Series::SplitAt(int last_start, Series** start, Series** end) { + *start = NULL; + *end = NULL; + if (last_start < 0 || last_start >= stack_.size()) { + tprintf("Invalid split index %d must be in range [0,%d]!\n", + last_start, stack_.size() - 1); + return; + } + Series* master_series = new Series("MasterSeries"); + Series* boosted_series = new Series("BoostedSeries"); + for (int s = 0; s <= last_start; ++s) { + if (s + 1 == stack_.size() && stack_[s]->type() == NT_SOFTMAX) { + // Change the softmax to a tanh. + FullyConnected* fc = reinterpret_cast(stack_[s]); + fc->ChangeType(NT_TANH); + } + master_series->AddToStack(stack_[s]); + stack_[s] = NULL; + } + for (int s = last_start + 1; s < stack_.size(); ++s) { + boosted_series->AddToStack(stack_[s]); + stack_[s] = NULL; + } + *start = master_series; + *end = boosted_series; + delete this; +} + +// Appends the elements of the src series to this, removing from src and +// deleting it. +void Series::AppendSeries(Network* src) { + ASSERT_HOST(src->type() == NT_SERIES); + Series* src_series = reinterpret_cast(src); + for (int s = 0; s < src_series->stack_.size(); ++s) { + AddToStack(src_series->stack_[s]); + src_series->stack_[s] = NULL; + } + delete src; +} + + +} // namespace tesseract. 
diff --git a/lstm/series.h b/lstm/series.h new file mode 100644 index 00000000..04799359 --- /dev/null +++ b/lstm/series.h @@ -0,0 +1,91 @@ +/////////////////////////////////////////////////////////////////////// +// File: series.h +// Description: Runs networks in series on the same input. +// Author: Ray Smith +// Created: Thu May 02 08:20:06 PST 2013 +// +// (C) Copyright 2013, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_LSTM_SERIES_H_ +#define TESSERACT_LSTM_SERIES_H_ + +#include "plumbing.h" + +namespace tesseract { + +// Runs two or more networks in series (layers) on the same input. +class Series : public Plumbing { + public: + // ni_ and no_ will be set by AddToStack. + explicit Series(const STRING& name); + virtual ~Series(); + + // Returns the shape output from the network given an input shape (which may + // be partially unknown ie zero). + virtual StaticShape OutputShape(const StaticShape& input_shape) const; + + virtual STRING spec() const { + STRING spec("["); + for (int i = 0; i < stack_.size(); ++i) + spec += stack_[i]->spec(); + spec += "]"; + return spec; + } + + // Sets up the network for training. Initializes weights using weights of + // scale `range` picked according to the random number generator `randomizer`. + // Returns the number of weights initialized. 
+ virtual int InitWeights(float range, TRand* randomizer); + + // Sets needs_to_backprop_ to needs_backprop and returns true if + // needs_backprop || any weights in this network so the next layer forward + // can be told to produce backprop for this layer if needed. + virtual bool SetupNeedsBackprop(bool needs_backprop); + + // Returns an integer reduction factor that the network applies to the + // time sequence. Assumes that any 2-d is already eliminated. Used for + // scaling bounding boxes of truth data. + // WARNING: if GlobalMinimax is used to vary the scale, this will return + // the last used scale factor. Call it before any forward, and it will return + // the minimum scale factor of the paths through the GlobalMinimax. + virtual int XScaleFactor() const; + + // Provides the (minimum) x scale factor to the network (of interest only to + // input units) so they can determine how to scale bounding boxes. + virtual void CacheXScaleFactor(int factor); + + // Runs forward propagation of activations on the input line. + // See Network for a detailed discussion of the arguments. + virtual void Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output); + + // Runs backward propagation of errors on the deltas line. + // See Network for a detailed discussion of the arguments. + virtual bool Backward(bool debug, const NetworkIO& fwd_deltas, + NetworkScratch* scratch, + NetworkIO* back_deltas); + + // Splits the series after the given index, returning the two parts and + // deletes itself. The first part, up to network with index last_start, goes + // into start, and the rest goes into end. + void SplitAt(int last_start, Series** start, Series** end); + + // Appends the elements of the src series to this, removing from src and + // deleting it. + void AppendSeries(Network* src); +}; + +} // namespace tesseract. 
+ +#endif // TESSERACT_LSTM_SERIES_H_ diff --git a/lstm/static_shape.h b/lstm/static_shape.h new file mode 100644 index 00000000..4822a5af --- /dev/null +++ b/lstm/static_shape.h @@ -0,0 +1,80 @@ +/////////////////////////////////////////////////////////////////////// +// File: static_shape.h +// Description: Defines the size of the 4-d tensor input/output from a network. +// Author: Ray Smith +// Created: Fri Oct 14 09:07:31 PST 2016 +// +// (C) Copyright 2016, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// +#ifndef TESSERACT_LSTM_STATIC_SHAPE_H_ +#define TESSERACT_LSTM_STATIC_SHAPE_H_ + +#include "tprintf.h" + +namespace tesseract { + +// Enum describing the loss function to apply during training and/or the +// decoding method to apply at runtime. +enum LossType { + LT_NONE, // Undefined. + LT_CTC, // Softmax with standard CTC for training/decoding. + LT_SOFTMAX, // Outputs sum to 1 in fixed positions. + LT_LOGISTIC, // Logistic outputs with independent values. +}; + +// Simple class to hold the tensor shape that is known at network build time +// and the LossType of the loss function. 
+class StaticShape { + public: + StaticShape() + : batch_(0), height_(0), width_(0), depth_(0), loss_type_(LT_NONE) {} + int batch() const { return batch_; } + void set_batch(int value) { batch_ = value; } + int height() const { return height_; } + void set_height(int value) { height_ = value; } + int width() const { return width_; } + void set_width(int value) { width_ = value; } + int depth() const { return depth_; } + void set_depth(int value) { depth_ = value; } + LossType loss_type() const { return loss_type_; } + void set_loss_type(LossType value) { loss_type_ = value; } + void SetShape(int batch, int height, int width, int depth) { + batch_ = batch; + height_ = height; + width_ = width; + depth_ = depth; + } + + void Print() const { + tprintf("Batch=%d, Height=%d, Width=%d, Depth=%d, loss=%d\n", batch_, + height_, width_, depth_, loss_type_); + } + + private: + // Size of the 4-D tensor input/output to a network. A value of zero is + // allowed for all except depth_ and means to be determined at runtime, and + // regarded as variable. + // Number of elements in a batch, or number of frames in a video stream. + int batch_; + // Height of the image. + int height_; + // Width of the image. + int width_; + // Depth of the image. (Number of "nodes"). + int depth_; + // How to train/interpret the output. + LossType loss_type_; +}; + +} // namespace tesseract + +#endif // TESSERACT_LSTM_STATIC_SHAPE_H_ diff --git a/lstm/stridemap.cpp b/lstm/stridemap.cpp new file mode 100644 index 00000000..b3e70bca --- /dev/null +++ b/lstm/stridemap.cpp @@ -0,0 +1,173 @@ +/////////////////////////////////////////////////////////////////////// +// File: stridemap.cpp +// Description: Indexing into a 4-d tensor held in a 2-d Array. +// Author: Ray Smith +// Created: Fri Sep 20 15:30:31 PST 2016 +// +// (C) Copyright 2016, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "stridemap.h" + +namespace tesseract { + +// Returns true if *this is a valid index. +bool StrideMap::Index::IsValid() const { + // Cheap check first. + for (int d = 0; d < FD_DIMSIZE; ++d) { + if (indices_[d] < 0) return false; + } + for (int d = 0; d < FD_DIMSIZE; ++d) { + if (indices_[d] > MaxIndexOfDim(static_cast(d))) + return false; + } + return true; +} + +// Returns true if the index of the given dimension is the last. +bool StrideMap::Index::IsLast(FlexDimensions dimension) const { + return MaxIndexOfDim(dimension) == indices_[dimension]; +} + +// Given that the dimensions up to and including dim-1 are valid, returns the +// maximum index for dimension dim. +int StrideMap::Index::MaxIndexOfDim(FlexDimensions dim) const { + int max_index = stride_map_->shape_[dim] - 1; + if (dim == FD_BATCH) return max_index; + int batch = indices_[FD_BATCH]; + if (dim == FD_HEIGHT) { + if (batch >= stride_map_->heights_.size() || + stride_map_->heights_[batch] > max_index) + return max_index; + return stride_map_->heights_[batch] - 1; + } + if (batch >= stride_map_->widths_.size() || + stride_map_->widths_[batch] > max_index) + return max_index; + return stride_map_->widths_[batch] - 1; +} + +// Adds the given offset to the given dimension. Returns true if the result +// makes a valid index. 
+bool StrideMap::Index::AddOffset(int offset, FlexDimensions dimension) { + indices_[dimension] += offset; + SetTFromIndices(); + return IsValid(); +} + +// Increments the index in some encapsulated way that guarantees to remain +// valid until it returns false, meaning that the iteration is complete. +bool StrideMap::Index::Increment() { + for (int d = FD_DIMSIZE - 1; d >= 0; --d) { + if (!IsLast(static_cast(d))) { + t_ += stride_map_->t_increments_[d]; + ++indices_[d]; + return true; + } + t_ -= stride_map_->t_increments_[d] * indices_[d]; + indices_[d] = 0; + // Now carry to the next dimension. + } + return false; +} + +// Decrements the index in some encapsulated way that guarantees to remain +// valid until it returns false, meaning that the iteration (that started +// with InitToLast()) is complete. +bool StrideMap::Index::Decrement() { + for (int d = FD_DIMSIZE - 1; d >= 0; --d) { + if (indices_[d] > 0) { + --indices_[d]; + if (d == FD_BATCH) { + // The upper limits of the other dimensions may have changed as a result + // of a different batch index, so they have to be reset. + InitToLastOfBatch(indices_[FD_BATCH]); + } else { + t_ -= stride_map_->t_increments_[d]; + } + return true; + } + indices_[d] = MaxIndexOfDim(static_cast(d)); + t_ += stride_map_->t_increments_[d] * indices_[d]; + // Now borrow from the next dimension. + } + return false; +} + +// Initializes the indices to the last valid location in the given batch +// index. +void StrideMap::Index::InitToLastOfBatch(int batch) { + indices_[FD_BATCH] = batch; + for (int d = FD_BATCH + 1; d < FD_DIMSIZE; ++d) { + indices_[d] = MaxIndexOfDim(static_cast(d)); + } + SetTFromIndices(); +} + +// Computes and sets t_ from the current indices_. +void StrideMap::Index::SetTFromIndices() { + t_ = 0; + for (int d = 0; d < FD_DIMSIZE; ++d) { + t_ += stride_map_->t_increments_[d] * indices_[d]; + } +} + +// Sets up the stride for the given array of height, width pairs. 
+void StrideMap::SetStride(const std::vector>& h_w_pairs) { + int max_height = 0; + int max_width = 0; + for (const std::pair& hw : h_w_pairs) { + int height = hw.first; + int width = hw.second; + heights_.push_back(height); + widths_.push_back(width); + if (height > max_height) max_height = height; + if (width > max_width) max_width = width; + } + shape_[FD_BATCH] = heights_.size(); + shape_[FD_HEIGHT] = max_height; + shape_[FD_WIDTH] = max_width; + ComputeTIncrements(); +} + +// Scales width and height dimensions by the given factors. +void StrideMap::ScaleXY(int x_factor, int y_factor) { + for (int& height : heights_) height /= y_factor; + for (int& width : widths_) width /= x_factor; + shape_[FD_HEIGHT] /= y_factor; + shape_[FD_WIDTH] /= x_factor; + ComputeTIncrements(); +} + +// Reduces width to 1, across the batch, whatever the input size. +void StrideMap::ReduceWidthTo1() { + widths_.assign(widths_.size(), 1); + shape_[FD_WIDTH] = 1; + ComputeTIncrements(); +} + +// Transposes the width and height dimensions. +void StrideMap::TransposeXY() { + std::swap(shape_[FD_HEIGHT], shape_[FD_WIDTH]); + std::swap(heights_, widths_); + ComputeTIncrements(); +} + +// Computes t_increments_ from shape_. +void StrideMap::ComputeTIncrements() { + t_increments_[FD_DIMSIZE - 1] = 1; + for (int d = FD_DIMSIZE - 2; d >= 0; --d) { + t_increments_[d] = t_increments_[d + 1] * shape_[d + 1]; + } +} + +} // namespace tesseract diff --git a/lstm/stridemap.h b/lstm/stridemap.h new file mode 100644 index 00000000..2dd9e49b --- /dev/null +++ b/lstm/stridemap.h @@ -0,0 +1,137 @@ +/////////////////////////////////////////////////////////////////////// +// File: stridemap.h +// Description: Indexing into a 4-d tensor held in a 2-d Array. +// Author: Ray Smith +// Created: Fri Sep 20 16:00:31 PST 2016 +// +// (C) Copyright 2016, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// +#ifndef TESSERACT_LSTM_STRIDEMAP_H_ +#define TESSERACT_LSTM_STRIDEMAP_H_ + +#include +#include +#include "tprintf.h" + +namespace tesseract { + +// Enum describing the dimensions of the 'Tensor' in a NetworkIO. +// A NetworkIO is analogous to a TF Tensor, except that the number of dimensions +// is fixed (4), and they always have the same meaning. The underlying +// representation is a 2-D array, for which the product batch*height*width +// is always dim1 and depth is always dim2. FlexDimensions is used only for +// batch, height, width with the StrideMap, and therefore represents the runtime +// shape. The build-time shape is defined by StaticShape. +enum FlexDimensions { + FD_BATCH, // Index of multiple images. + FD_HEIGHT, // y-coordinate in image. + FD_WIDTH, // x-coordinate in image. + FD_DIMSIZE, // Number of flexible non-depth dimensions. +}; + +// Encapsulation of information relating to the mapping from [batch][y][x] to +// the first index into the 2-d array underlying a NetworkIO. +class StrideMap { + public: + // Class holding the non-depth indices. + class Index { + public: + explicit Index(const StrideMap& stride_map) : stride_map_(&stride_map) { + InitToFirst(); + } + Index(const StrideMap& stride_map, int batch, int y, int x) + : stride_map_(&stride_map) { + indices_[FD_BATCH] = batch; + indices_[FD_HEIGHT] = y; + indices_[FD_WIDTH] = x; + SetTFromIndices(); + } + // Accesses the index to the underlying array. 
+ int t() const { return t_; } + int index(FlexDimensions dimension) const { return indices_[dimension]; } + // Initializes the indices to the first valid location. + void InitToFirst() { + memset(indices_, 0, sizeof(indices_)); + t_ = 0; + } + // Initializes the indices to the last valid location. + void InitToLast() { InitToLastOfBatch(MaxIndexOfDim(FD_BATCH)); } + // Returns true if *this is a valid index. + bool IsValid() const; + // Returns true if the index of the given dimension is the last. + bool IsLast(FlexDimensions dimension) const; + // Given that the dimensions up to and including dim-1 are valid, returns + // the maximum index for dimension dim. + int MaxIndexOfDim(FlexDimensions dim) const; + // Adds the given offset to the given dimension. Returns true if the result + // makes a valid index. + bool AddOffset(int offset, FlexDimensions dimension); + // Increments the index in some encapsulated way that guarantees to remain + // valid until it returns false, meaning that the iteration is complete. + bool Increment(); + // Decrements the index in some encapsulated way that guarantees to remain + // valid until it returns false, meaning that the iteration (that started + // with InitToLast()) is complete. + bool Decrement(); + + private: + // Initializes the indices to the last valid location in the given batch + // index. + void InitToLastOfBatch(int batch); + // Computes and sets t_ from the current indices_. + void SetTFromIndices(); + + // Map into which *this is an index. + const StrideMap* stride_map_; + // Index to the first dimension of the underlying array. + int t_; + // Indices into the individual dimensions. + int indices_[FD_DIMSIZE]; + }; + + StrideMap() { + memset(shape_, 0, sizeof(shape_)); + memset(t_increments_, 0, sizeof(t_increments_)); + } + // Default copy constructor and operator= are OK to use here! + + // Sets up the stride for the given array of height, width pairs. 
+ void SetStride(const std::vector>& h_w_pairs); + // Scales width and height dimensions by the given factors. + void ScaleXY(int x_factor, int y_factor); + // Reduces width to 1, across the batch, whatever the input size. + void ReduceWidthTo1(); + // Transposes the width and height dimensions. + void TransposeXY(); + // Returns the size of the given dimension. + int Size(FlexDimensions dimension) const { return shape_[dimension]; } + // Returns the total width required. + int Width() const { return t_increments_[FD_BATCH] * shape_[FD_BATCH]; } + + private: + // Computes t_increments_ from shape_. + void ComputeTIncrements(); + + // The size of each non-depth dimension. + int shape_[FD_DIMSIZE]; + // Precomputed 't' increments for each dimension. This is the value of + // the given dimension in the packed 3-d array that the shape_ represents. + int t_increments_[FD_DIMSIZE]; + // Vector of size shape_[FD_BATCH] holds the height of each image in a batch. + std::vector heights_; + // Vector of size shape_[FD_BATCH] holds the width of each image in a batch. + std::vector widths_; +}; + +} // namespace tesseract + +#endif // TESSERACT_LSTM_STRIDEMAP_H_ diff --git a/lstm/tfnetwork.cpp b/lstm/tfnetwork.cpp new file mode 100644 index 00000000..13d487a4 --- /dev/null +++ b/lstm/tfnetwork.cpp @@ -0,0 +1,146 @@ +/////////////////////////////////////////////////////////////////////// +// File: tfnetwork.h +// Description: Encapsulation of an entire tensorflow graph as a +// Tesseract Network. +// Author: Ray Smith +// Created: Fri Feb 26 09:35:29 PST 2016 +// +// (C) Copyright 2016, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// +#ifdef INCLUDE_TENSORFLOW + +#include "tfnetwork.h" + +#include "allheaders.h" +#include "input.h" +#include "networkscratch.h" + +using tensorflow::Status; +using tensorflow::Tensor; +using tensorflow::TensorShape; + +namespace tesseract { + +TFNetwork::TFNetwork(const STRING& name) : Network(NT_TENSORFLOW, name, 0, 0) {} + +TFNetwork::~TFNetwork() {} + +int TFNetwork::InitFromProtoStr(const string& proto_str) { + if (!model_proto_.ParseFromString(proto_str)) return 0; + return InitFromProto(); +} + +// Writes to the given file. Returns false in case of error. +// Should be overridden by subclasses, but called by their Serialize. +bool TFNetwork::Serialize(TFile* fp) const { + if (!Network::Serialize(fp)) return false; + string proto_str; + model_proto_.SerializeToString(&proto_str); + GenericVector data; + data.init_to_size(proto_str.size(), 0); + memcpy(&data[0], proto_str.data(), proto_str.size()); + if (!data.Serialize(fp)) return false; + return true; +} + +// Reads from the given file. Returns false in case of error. +// If swap is true, assumes a big/little-endian swap is needed. +// Should be overridden by subclasses, but NOT called by their DeSerialize. +bool TFNetwork::DeSerialize(bool swap, TFile* fp) { + GenericVector data; + if (!data.DeSerialize(swap, fp)) return false; + if (!model_proto_.ParseFromArray(&data[0], data.size())) { + return false; + } + return InitFromProto(); +} + +// Runs forward propagation of activations on the input line. 
+// See Network for a detailed discussion of the arguments. +void TFNetwork::Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output) { + std::vector> tf_inputs; + int depth = input_shape_.depth(); + ASSERT_HOST(depth == input.NumFeatures()); + // TODO(rays) Allow batching. For now batch_size = 1. + const StrideMap& stride_map = input.stride_map(); + // TF requires a tensor of shape float[batch, height, width, depth]. + TensorShape shape{1, stride_map.Size(FD_HEIGHT), stride_map.Size(FD_WIDTH), + depth}; + Tensor input_tensor(tensorflow::DT_FLOAT, shape); + // The flat() member gives a 1d array, with a data() member to get the data. + auto eigen_tensor = input_tensor.flat(); + memcpy(eigen_tensor.data(), input.f(0), + input.Width() * depth * sizeof(input.f(0)[0])); + // Add the tensor to the vector of inputs. + tf_inputs.emplace_back(model_proto_.image_input(), input_tensor); + + // Provide tensors giving the width and/or height of the image if they are + // required. Some tf ops require a separate tensor with knowledge of the + // size of the input as they cannot obtain it from the input tensor. This is + // usually true in the case of ops that process a batch of variable-sized + // objects. 
+ if (!model_proto_.image_widths().empty()) { + TensorShape size_shape{1}; + Tensor width_tensor(tensorflow::DT_INT32, size_shape); + auto eigen_wtensor = width_tensor.flat(); + *eigen_wtensor.data() = stride_map.Size(FD_WIDTH); + tf_inputs.emplace_back(model_proto_.image_widths(), width_tensor); + } + if (!model_proto_.image_heights().empty()) { + TensorShape size_shape{1}; + Tensor height_tensor(tensorflow::DT_INT32, size_shape); + auto eigen_htensor = height_tensor.flat(); + *eigen_htensor.data() = stride_map.Size(FD_HEIGHT); + tf_inputs.emplace_back(model_proto_.image_heights(), height_tensor); + } + std::vector target_layers = {model_proto_.output_layer()}; + std::vector outputs; + Status s = session_->Run(tf_inputs, target_layers, {}, &outputs); + ASSERT_HOST(s.ok()); + ASSERT_HOST(outputs.size() == 1); + const Tensor& output_tensor = outputs[0]; + // Check the dimensions of the output. + ASSERT_HOST(output_tensor.shape().dims() == 2); + int output_dim0 = output_tensor.shape().dim_size(0); + int output_dim1 = output_tensor.shape().dim_size(1); + ASSERT_HOST(output_dim1 == output_shape_.depth()); + output->Resize2d(false, output_dim0, output_dim1); + auto eigen_output = output_tensor.flat(); + memcpy(output->f(0), eigen_output.data(), + output_dim0 * output_dim1 * sizeof(output->f(0)[0])); +} + +int TFNetwork::InitFromProto() { + spec_ = model_proto_.spec(); + input_shape_.SetShape( + model_proto_.batch_size(), std::max(0, model_proto_.y_size()), + std::max(0, model_proto_.x_size()), model_proto_.depth()); + output_shape_.SetShape(model_proto_.batch_size(), 1, 0, + model_proto_.num_classes()); + output_shape_.set_loss_type(model_proto_.using_ctc() ? LT_CTC : LT_SOFTMAX); + ni_ = input_shape_.height(); + no_ = output_shape_.depth(); + // Initialize the session_ with the graph. 
Since we can't get the graph + // back from the session_, we have to keep the proto as well + tensorflow::SessionOptions options; + session_.reset(NewSession(options)); + Status s = session_->Create(model_proto_.graph()); + if (s.ok()) return model_proto_.global_step(); + tprintf("Session_->Create returned '%s'\n", s.error_message().c_str()); + return 0; +} + +} // namespace tesseract + +#endif // ifdef INCLUDE_TENSORFLOW diff --git a/lstm/tfnetwork.h b/lstm/tfnetwork.h new file mode 100644 index 00000000..749706cd --- /dev/null +++ b/lstm/tfnetwork.h @@ -0,0 +1,91 @@ +/////////////////////////////////////////////////////////////////////// +// File: tfnetwork.h +// Description: Encapsulation of an entire tensorflow graph as a +// Tesseract Network. +// Author: Ray Smith +// Created: Fri Feb 26 09:35:29 PST 2016 +// +// (C) Copyright 2016, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_LSTM_TFNETWORK_H_ +#define TESSERACT_LSTM_TFNETWORK_H_ + +#ifdef INCLUDE_TENSORFLOW + +#include +#include + +#include "network.h" +#include "static_shape.h" +#include "tfnetwork.proto.h" +#include "third_party/tensorflow/core/framework/graph.pb.h" +#include "third_party/tensorflow/core/public/session.h" + +namespace tesseract { + +class TFNetwork : public Network { + public: + explicit TFNetwork(const STRING& name); + virtual ~TFNetwork(); + + // Returns the required shape input to the network. + virtual StaticShape InputShape() const { return input_shape_; } + // Returns the shape output from the network given an input shape (which may + // be partially unknown ie zero). + virtual StaticShape OutputShape(const StaticShape& input_shape) const { + return output_shape_; + } + + virtual STRING spec() const { return spec_.c_str(); } + + // Deserializes *this from a serialized TFNetwork proto. Returns 0 if failed, + // otherwise the global step of the serialized graph. + int InitFromProtoStr(const string& proto_str); + // The number of classes in this network should be equal to those in the + // recoder_ in LSTMRecognizer. + int num_classes() const { return output_shape_.depth(); } + + // Writes to the given file. Returns false in case of error. + // Should be overridden by subclasses, but called by their Serialize. + virtual bool Serialize(TFile* fp) const; + // Reads from the given file. Returns false in case of error. + // If swap is true, assumes a big/little-endian swap is needed. + // Should be overridden by subclasses, but NOT called by their DeSerialize. + virtual bool DeSerialize(bool swap, TFile* fp); + + // Runs forward propagation of activations on the input line. + // See Network for a detailed discussion of the arguments. 
+ virtual void Forward(bool debug, const NetworkIO& input, + const TransposedArray* input_transpose, + NetworkScratch* scratch, NetworkIO* output); + + private: + int InitFromProto(); + + // The original network definition for reference. + string spec_; + // Input tensor parameters. + StaticShape input_shape_; + // Output tensor parameters. + StaticShape output_shape_; + // The tensor flow graph is contained in here. + std::unique_ptr<tensorflow::Session> session_; + // The serialized graph is also contained in here. + TFNetworkModel model_proto_; +}; + +} // namespace tesseract. + +#endif // ifdef INCLUDE_TENSORFLOW + +#endif // TESSERACT_LSTM_TFNETWORK_H_ diff --git a/lstm/tfnetwork.proto b/lstm/tfnetwork.proto new file mode 100644 index 00000000..0942fd27 --- /dev/null +++ b/lstm/tfnetwork.proto @@ -0,0 +1,61 @@ +syntax = "proto3"; + +package tesseract; + +// TODO(rays) How to make this usable both in Google and open source? +import "third_party/tensorflow/core/framework/graph.proto"; + +// This proto is the interface between a python TF graph builder/trainer and +// the C++ world. The writer of this proto must provide fields as documented +// by the comments below. +// The graph must have a placeholder for NetworkIO, Widths and Heights. The +// following python code creates the appropriate placeholders: +// +// input_layer = tf.placeholder(tf.float32, +// shape=[batch_size, xsize, ysize, depth_dim], +// name='NetworkIO') +// widths = tf.placeholder(tf.int32, shape=[batch_size], name='Widths') +// heights = tf.placeholder(tf.int32, shape=[batch_size], name='Heights') +// # Flip x and y to the TF convention. +// input_layer = tf.transpose(input_layer, [0, 2, 1, 3]) +// +// The widths and heights will be set to indicate the post-scaling size of the +// input image(s). +// For now batch_size is ignored and set to 1. 
+// The graph should return a 2-dimensional float32 tensor called 'softmax' of +// shape [sequence_length, num_classes], where sequence_length is allowed to +// be variable, given by the tensor itself. +// TODO(rays) determine whether it is worth providing for batch_size >1 and if +// so, how. +message TFNetworkModel { + // The TF graph definition. Required. + tensorflow.GraphDef graph = 1; + // The training index. Required to be > 0. + int64 global_step = 2; + // The original network definition for reference. Optional + string spec = 3; + // Input tensor parameters. + // Values per pixel. Required to be 1 or 3. Inputs assumed to be float32. + int32 depth = 4; + // Image size. Required. Zero implies flexible sizes, fixed if non-zero. + // If x_size > 0, images will be cropped/padded to the given size, after + // any scaling required by the y_size. + // If y_size > 0, images will be scaled isotropically to the given height. + int32 x_size = 5; + int32 y_size = 6; + // Number of images in a batch. Optional. + int32 batch_size = 8; + // Output tensor parameters. + // Number of output classes. Required to match the depth of the softmax. + int32 num_classes = 9; + // True if this network needs CTC-like decoding, dropping duplicated labels. + // The decoder always drops the null character. + bool using_ctc = 10; + // Name of input image tensor. + string image_input = 11; + // Name of image height and width tensors. + string image_widths = 12; + string image_heights = 13; + // Name of output (softmax) tensor. + string output_layer = 14; +} diff --git a/lstm/weightmatrix.cpp b/lstm/weightmatrix.cpp new file mode 100644 index 00000000..477de466 --- /dev/null +++ b/lstm/weightmatrix.cpp @@ -0,0 +1,382 @@ +/////////////////////////////////////////////////////////////////////// +// File: weightmatrix.h +// Description: Hides distinction between float/int implementations. +// Author: Ray Smith +// Created: Tue Jun 17 11:46:20 PST 2014 +// +// (C) Copyright 2014, Google Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "weightmatrix.h" + +#include "dotproductavx.h" +#include "dotproductsse.h" +#include "simddetect.h" +#include "statistc.h" +#include "tprintf.h" + +namespace tesseract { + +// Copies the whole input transposed, converted to double, into *this. +void TransposedArray::Transpose(const GENERIC_2D_ARRAY& input) { + int width = input.dim1(); + int num_features = input.dim2(); + ResizeNoInit(num_features, width); + for (int t = 0; t < width; ++t) WriteStrided(t, input[t]); +} + +// Sets up the network for training. Initializes weights using weights of +// scale `range` picked according to the random number generator `randomizer`. +int WeightMatrix::InitWeightsFloat(int no, int ni, bool ada_grad, + float weight_range, TRand* randomizer) { + int_mode_ = false; + wf_.Resize(no, ni, 0.0); + if (randomizer != NULL) { + for (int i = 0; i < no; ++i) { + for (int j = 0; j < ni; ++j) { + wf_[i][j] = randomizer->SignedRand(weight_range); + } + } + } + InitBackward(ada_grad); + return ni * no; +} + +// Converts a float network to an int network. Each set of input weights that +// corresponds to a single output weight is converted independently: +// Compute the max absolute value of the weight set. +// Scale so the max absolute value becomes MAX_INT8. +// Round to integer. 
+// Store a multiplicative scale factor (as a double) that will reproduce +// the original value, subject to rounding errors. +void WeightMatrix::ConvertToInt() { + wi_.ResizeNoInit(wf_.dim1(), wf_.dim2()); + scales_.init_to_size(wi_.dim1(), 0.0); + int dim2 = wi_.dim2(); + for (int t = 0; t < wi_.dim1(); ++t) { + double* f_line = wf_[t]; + inT8* i_line = wi_[t]; + double max_abs = 0.0; + for (int f = 0; f < dim2; ++f) { + double abs_val = fabs(f_line[f]); + if (abs_val > max_abs) max_abs = abs_val; + } + double scale = max_abs / MAX_INT8; + scales_[t] = scale; + if (scale == 0.0) scale = 1.0; + for (int f = 0; f < dim2; ++f) { + i_line[f] = IntCastRounded(f_line[f] / scale); + } + } + wf_.Resize(1, 1, 0.0); + int_mode_ = true; +} + +// Allocates any needed memory for running Backward, and zeroes the deltas, +// thus eliminating any existing momentum. +void WeightMatrix::InitBackward(bool ada_grad) { + int no = int_mode_ ? wi_.dim1() : wf_.dim1(); + int ni = int_mode_ ? wi_.dim2() : wf_.dim2(); + use_ada_grad_ = ada_grad; + dw_.Resize(no, ni, 0.0); + updates_.Resize(no, ni, 0.0); + wf_t_.Transpose(wf_); + if (use_ada_grad_) dw_sq_sum_.Resize(no, ni, 0.0); +} + +// Flag on mode to indicate that this weightmatrix uses inT8. +const int kInt8Flag = 1; +// Flag on mode to indicate that this weightmatrix uses ada grad. +const int kAdaGradFlag = 4; +// Flag on mode to indicate that this weightmatrix uses double. Set +// independently of kInt8Flag as even in int mode the scales can +// be float or double. +const int kDoubleFlag = 128; + +// Writes to the given file. Returns false in case of error. +bool WeightMatrix::Serialize(bool training, TFile* fp) const { + // For backward compatibility, add kDoubleFlag to mode to indicate the doubles + // format, without errs, so we can detect and read old format weight matrices. + uinT8 mode = (int_mode_ ? kInt8Flag : 0) | + (use_ada_grad_ ? 
kAdaGradFlag : 0) | kDoubleFlag; + if (fp->FWrite(&mode, sizeof(mode), 1) != 1) return false; + if (int_mode_) { + if (!wi_.Serialize(fp)) return false; + if (!scales_.Serialize(fp)) return false; + } else { + if (!wf_.Serialize(fp)) return false; + if (training && !updates_.Serialize(fp)) return false; + if (training && use_ada_grad_ && !dw_sq_sum_.Serialize(fp)) return false; + } + return true; +} + +// Reads from the given file. Returns false in case of error. +// If swap is true, assumes a big/little-endian swap is needed. +bool WeightMatrix::DeSerialize(bool training, bool swap, TFile* fp) { + uinT8 mode = 0; + if (fp->FRead(&mode, sizeof(mode), 1) != 1) return false; + int_mode_ = (mode & kInt8Flag) != 0; + use_ada_grad_ = (mode & kAdaGradFlag) != 0; + if ((mode & kDoubleFlag) == 0) return DeSerializeOld(training, swap, fp); + if (int_mode_) { + if (!wi_.DeSerialize(swap, fp)) return false; + if (!scales_.DeSerialize(swap, fp)) return false; + } else { + if (!wf_.DeSerialize(swap, fp)) return false; + if (training) { + InitBackward(use_ada_grad_); + if (!updates_.DeSerialize(swap, fp)) return false; + if (use_ada_grad_ && !dw_sq_sum_.DeSerialize(swap, fp)) return false; + } + } + return true; +} + +// As DeSerialize, but reads an old (float) format WeightMatrix for +// backward compatibility. 
+bool WeightMatrix::DeSerializeOld(bool training, bool swap, TFile* fp) { + GENERIC_2D_ARRAY float_array; + if (int_mode_) { + if (!wi_.DeSerialize(swap, fp)) return false; + GenericVector old_scales; + if (!old_scales.DeSerialize(swap, fp)) return false; + scales_.init_to_size(old_scales.size(), 0.0); + for (int i = 0; i < old_scales.size(); ++i) scales_[i] = old_scales[i]; + } else { + if (!float_array.DeSerialize(swap, fp)) return false; + FloatToDouble(float_array, &wf_); + } + if (training) { + InitBackward(use_ada_grad_); + if (!float_array.DeSerialize(swap, fp)) return false; + FloatToDouble(float_array, &updates_); + // Errs was only used in int training, which is now dead. + if (!float_array.DeSerialize(swap, fp)) return false; + } + return true; +} + +// Computes matrix.vector v = Wu. +// u is of size W.dim2() - 1 and the output v is of size W.dim1(). +// u is imagined to have an extra element at the end with value 1, to +// implement the bias, but it doesn't actually have it. +// Asserts that the call matches what we have. +void WeightMatrix::MatrixDotVector(const double* u, double* v) const { + ASSERT_HOST(!int_mode_); + MatrixDotVectorInternal(wf_, true, false, u, v); +} + +void WeightMatrix::MatrixDotVector(const inT8* u, double* v) const { + ASSERT_HOST(int_mode_); + int num_out = wi_.dim1(); + int num_in = wi_.dim2() - 1; + for (int i = 0; i < num_out; ++i) { + const inT8* Wi = wi_[i]; + int total = 0; + if (SIMDDetect::IsSSEAvailable()) { + total = IntDotProductSSE(u, Wi, num_in); + } else { + for (int j = 0; j < num_in; ++j) total += Wi[j] * u[j]; + } + // Add in the bias and correct for integer values. + v[i] = (static_cast(total) / MAX_INT8 + Wi[num_in]) * scales_[i]; + } +} + +// MatrixDotVector for peep weights, MultiplyAccumulate adds the +// component-wise products of *this[0] and v to inout. 
+void WeightMatrix::MultiplyAccumulate(const double* v, double* inout) { + ASSERT_HOST(!int_mode_); + ASSERT_HOST(wf_.dim1() == 1); + int n = wf_.dim2(); + const double* u = wf_[0]; + for (int i = 0; i < n; ++i) { + inout[i] += u[i] * v[i]; + } +} + +// Computes vector.matrix v = uW. +// u is of size W.dim1() and the output v is of size W.dim2() - 1. +// The last result is discarded, as v is assumed to have an imaginary +// last value of 1, as with MatrixDotVector. +void WeightMatrix::VectorDotMatrix(const double* u, double* v) const { + ASSERT_HOST(!int_mode_); + MatrixDotVectorInternal(wf_t_, false, true, u, v); +} + +// Fills dw_[i][j] with the dot product u[i][] . v[j][], using elements from +// u and v. In terms of the neural network, u is the gradients and v is the +// inputs. +// Note that (matching MatrixDotVector) v[last][] is missing, presumed 1.0. +// Runs parallel if requested. Note that u and v must be transposed. +void WeightMatrix::SumOuterTransposed(const TransposedArray& u, + const TransposedArray& v, + bool in_parallel) { + ASSERT_HOST(!int_mode_); + int num_outputs = dw_.dim1(); + ASSERT_HOST(u.dim1() == num_outputs); + ASSERT_HOST(u.dim2() == v.dim2()); + int num_inputs = dw_.dim2() - 1; + int num_samples = u.dim2(); + // v is missing the last element in dim1. + ASSERT_HOST(v.dim1() == num_inputs); +#ifdef _OPENMP +#pragma omp parallel for num_threads(4) if (in_parallel) +#endif + for (int i = 0; i < num_outputs; ++i) { + double* dwi = dw_[i]; + const double* ui = u[i]; + for (int j = 0; j < num_inputs; ++j) { + dwi[j] = DotProduct(ui, v[j], num_samples); + } + // The last element of v is missing, presumed 1.0f. + double total = 0.0; + for (int k = 0; k < num_samples; ++k) total += ui[k]; + dwi[num_inputs] = total; + } +} + +// Updates the weights using the given learning rate and momentum. +// num_samples is the quotient to be used in the adagrad computation iff +// use_ada_grad_ is true. 
+void WeightMatrix::Update(double learning_rate, double momentum, + int num_samples) { + ASSERT_HOST(!int_mode_); + if (use_ada_grad_ && num_samples > 0) { + dw_sq_sum_.SumSquares(dw_); + dw_.AdaGradScaling(dw_sq_sum_, num_samples); + } + dw_ *= learning_rate; + updates_ += dw_; + if (momentum > 0.0) wf_ += updates_; + if (momentum >= 0.0) updates_ *= momentum; + wf_t_.Transpose(wf_); +} + +// Adds the dw_ in other to the dw_ in *this. +void WeightMatrix::AddDeltas(const WeightMatrix& other) { + ASSERT_HOST(dw_.dim1() == other.dw_.dim1()); + ASSERT_HOST(dw_.dim2() == other.dw_.dim2()); + dw_ += other.dw_; +} + +// Sums the products of weight updates in *this and other, splitting into +// positive (same direction) in *same and negative (different direction) in +// *changed. +void WeightMatrix::CountAlternators(const WeightMatrix& other, double* same, + double* changed) const { + int num_outputs = updates_.dim1(); + int num_inputs = updates_.dim2(); + ASSERT_HOST(num_outputs == other.updates_.dim1()); + ASSERT_HOST(num_inputs == other.updates_.dim2()); + for (int i = 0; i < num_outputs; ++i) { + const double* this_i = updates_[i]; + const double* other_i = other.updates_[i]; + for (int j = 0; j < num_inputs; ++j) { + double product = this_i[j] * other_i[j]; + if (product < 0.0) + *changed -= product; + else + *same += product; + } + } +} + +// Helper computes an integer histogram bucket for a weight and adds it +// to the histogram. 
+const int kHistogramBuckets = 16; +static void HistogramWeight(double weight, STATS* histogram) { + int bucket = kHistogramBuckets - 1; + if (weight != 0.0) { + double logval = -log2(fabs(weight)); + bucket = ClipToRange(IntCastRounded(logval), 0, kHistogramBuckets - 1); + } + histogram->add(bucket, 1); +} + +void WeightMatrix::Debug2D(const char* msg) { + STATS histogram(0, kHistogramBuckets); + if (int_mode_) { + for (int i = 0; i < wi_.dim1(); ++i) { + for (int j = 0; j < wi_.dim2(); ++j) { + HistogramWeight(wi_[i][j] * scales_[i], &histogram); + } + } + } else { + for (int i = 0; i < wf_.dim1(); ++i) { + for (int j = 0; j < wf_.dim2(); ++j) { + HistogramWeight(wf_[i][j], &histogram); + } + } + } + tprintf("%s\n", msg); + histogram.print(); +} + +// Computes and returns the dot product of the two n-vectors u and v. +/* static */ +double WeightMatrix::DotProduct(const double* u, const double* v, int n) { + // Note: because the order of addition is different among the 3 DotProduct + // functions, the results can (and do) vary slightly (although they agree + // to within about 4e-15). This produces different results when running + // training, despite all random inputs being precisely equal. + // To get consistent results, use just one of these DotProduct functions. + // On a test multi-layer network, serial is 57% slower than sse, and avx + // is about 8% faster than sse. This suggests that the time is memory + // bandwidth constrained and could benefit from holding the reused vector + // in AVX registers. + if (SIMDDetect::IsAVXAvailable()) return DotProductAVX(u, v, n); + if (SIMDDetect::IsSSEAvailable()) return DotProductSSE(u, v, n); + double total = 0.0; + for (int k = 0; k < n; ++k) total += u[k] * v[k]; + return total; +} + +// Utility function converts an array of float to the corresponding array +// of double. 
+/* static */ +void WeightMatrix::FloatToDouble(const GENERIC_2D_ARRAY<float>& wf, + GENERIC_2D_ARRAY<double>* wd) { + int dim1 = wf.dim1(); + int dim2 = wf.dim2(); + wd->ResizeNoInit(dim1, dim2); + for (int i = 0; i < dim1; ++i) { + const float* wfi = wf[i]; + double* wdi = (*wd)[i]; + for (int j = 0; j < dim2; ++j) wdi[j] = static_cast<double>(wfi[j]); + } +} + +// Computes matrix.vector v = Wu. +// u is of size W.dim2() - add_bias_fwd and the output v is of size +// W.dim1() - skip_bias_back. +// If add_bias_fwd, u is imagined to have an extra element at the end with value +// 1, to implement the bias weight. +// If skip_bias_back, we are actually performing the backwards product on a +// transposed matrix, so we need to drop the v output corresponding to the last +// element in dim1. +void WeightMatrix::MatrixDotVectorInternal(const GENERIC_2D_ARRAY<double>& w, + bool add_bias_fwd, + bool skip_bias_back, const double* u, + double* v) { + int num_results = w.dim1() - skip_bias_back; + int extent = w.dim2() - add_bias_fwd; + for (int i = 0; i < num_results; ++i) { + const double* wi = w[i]; + double total = DotProduct(wi, u, extent); + if (add_bias_fwd) total += wi[extent]; // The bias value. + v[i] = total; + } +} + +} // namespace tesseract. diff --git a/lstm/weightmatrix.h b/lstm/weightmatrix.h new file mode 100644 index 00000000..635c6618 --- /dev/null +++ b/lstm/weightmatrix.h @@ -0,0 +1,183 @@ +/////////////////////////////////////////////////////////////////////// +// File: weightmatrix.h +// Description: Hides distinction between float/int implementations. +// Author: Ray Smith +// Created: Tue Jun 17 09:05:39 PST 2014 +// +// (C) Copyright 2014, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_LSTM_WEIGHTMATRIX_H_ +#define TESSERACT_LSTM_WEIGHTMATRIX_H_ + +#include "genericvector.h" +#include "matrix.h" +#include "tprintf.h" + +namespace tesseract { + +// Convenience instantiation of GENERIC_2D_ARRAY with additional +// operations to write a strided vector, so the transposed form of the input +// is memory-contiguous. +class TransposedArray : public GENERIC_2D_ARRAY { + public: + // Copies the whole input transposed, converted to double, into *this. + void Transpose(const GENERIC_2D_ARRAY& input); + // Writes a vector of data representing a timestep (gradients or sources). + // The data is assumed to be of size1 in size (the strided dimension). + void WriteStrided(int t, const float* data) { + int size1 = dim1(); + for (int i = 0; i < size1; ++i) put(i, t, data[i]); + } + void WriteStrided(int t, const double* data) { + int size1 = dim1(); + for (int i = 0; i < size1; ++i) put(i, t, data[i]); + } + // Prints the first and last num elements of the un-transposed array. + void PrintUnTransposed(int num) { + int num_features = dim1(); + int width = dim2(); + for (int y = 0; y < num_features; ++y) { + for (int t = 0; t < width; ++t) { + if (num == 0 || t < num || t + num >= width) { + tprintf(" %g", (*this)(y, t)); + } + } + tprintf("\n"); + } + } +}; // class TransposedArray + +// Generic weight matrix for network layers. Can store the matrix as either +// an array of floats or inT8. 
Provides functions to compute the forward and +// backward steps with the matrix and updates to the weights. +class WeightMatrix { + public: + WeightMatrix() : int_mode_(false), use_ada_grad_(false) {} + // Sets up the network for training. Initializes weights using weights of + // scale `range` picked according to the random number generator `randomizer`. + // Note the order is outputs, inputs, as this is the order of indices to + // the matrix, so the adjacent elements are multiplied by the input during + // a forward operation. + int InitWeightsFloat(int no, int ni, bool ada_grad, float weight_range, + TRand* randomizer); + + // Converts a float network to an int network. Each set of input weights that + // corresponds to a single output weight is converted independently: + // Compute the max absolute value of the weight set. + // Scale so the max absolute value becomes MAX_INT8. + // Round to integer. + // Store a multiplicative scale factor (as a float) that will reproduce + // the original value, subject to rounding errors. + void ConvertToInt(); + + // Accessors. + bool is_int_mode() const { + return int_mode_; + } + int NumOutputs() const { return int_mode_ ? wi_.dim1() : wf_.dim1(); } + // Provides one set of weights. Only used by peep weight maxpool. + const double* GetWeights(int index) const { return wf_[index]; } + // Provides access to the deltas (dw_). + double GetDW(int i, int j) const { return dw_(i, j); } + + // Allocates any needed memory for running Backward, and zeroes the deltas, + // thus eliminating any existing momentum. + void InitBackward(bool ada_grad); + + // Writes to the given file. Returns false in case of error. + bool Serialize(bool training, TFile* fp) const; + // Reads from the given file. Returns false in case of error. + // If swap is true, assumes a big/little-endian swap is needed. 
+ bool DeSerialize(bool training, bool swap, TFile* fp); + // As DeSerialize, but reads an old (float) format WeightMatrix for + // backward compatibility. + bool DeSerializeOld(bool training, bool swap, TFile* fp); + + // Computes matrix.vector v = Wu. + // u is of size W.dim2() - 1 and the output v is of size W.dim1(). + // u is imagined to have an extra element at the end with value 1, to + // implement the bias, but it doesn't actually have it. + // Asserts that the call matches what we have. + void MatrixDotVector(const double* u, double* v) const; + void MatrixDotVector(const inT8* u, double* v) const; + // MatrixDotVector for peep weights, MultiplyAccumulate adds the + // component-wise products of *this[0] and v to inout. + void MultiplyAccumulate(const double* v, double* inout); + // Computes vector.matrix v = uW. + // u is of size W.dim1() and the output v is of size W.dim2() - 1. + // The last result is discarded, as v is assumed to have an imaginary + // last value of 1, as with MatrixDotVector. + void VectorDotMatrix(const double* u, double* v) const; + // Fills dw_[i][j] with the dot product u[i][] . v[j][], using elements + // from u and v, starting with u[i][offset] and v[j][offset]. + // Note that (matching MatrixDotVector) v[last][] is missing, presumed 1.0. + // Runs parallel if requested. Note that inputs must be transposed. + void SumOuterTransposed(const TransposedArray& u, const TransposedArray& v, + bool parallel); + // Updates the weights using the given learning rate and momentum. + // num_samples is the quotient to be used in the adagrad computation iff + // use_ada_grad_ is true. + void Update(double learning_rate, double momentum, int num_samples); + // Adds the dw_ in other to the dw_ is *this. + void AddDeltas(const WeightMatrix& other); + // Sums the products of weight updates in *this and other, splitting into + // positive (same direction) in *same and negative (different direction) in + // *changed. 
+ void CountAlternators(const WeightMatrix& other, double* same, + double* changed) const; + + void Debug2D(const char* msg); + + // Computes and returns the dot product of the two n-vectors u and v. + static double DotProduct(const double* u, const double* v, int n); + // Utility function converts an array of float to the corresponding array + // of double. + static void FloatToDouble(const GENERIC_2D_ARRAY& wf, + GENERIC_2D_ARRAY* wd); + + private: + // Computes matrix.vector v = Wu. + // u is of size starts.back()+extents.back() and the output v is of size + // starts.size(). + // The weight matrix w, is of size starts.size()xMAX(extents)+add_bias_fwd. + // If add_bias_fwd, an extra element at the end of w[i] is the bias weight + // and is added to v[i]. + static void MatrixDotVectorInternal(const GENERIC_2D_ARRAY& w, + bool add_bias_fwd, bool skip_bias_back, + const double* u, double* v); + + private: + // Choice between float and 8 bit int implementations. + GENERIC_2D_ARRAY wf_; + GENERIC_2D_ARRAY wi_; + // Transposed copy of wf_, used only for Backward, and set with each Update. + TransposedArray wf_t_; + // Which of wf_ and wi_ are we actually using. + bool int_mode_; + // True if we are running adagrad in this weight matrix. + bool use_ada_grad_; + // If we are using wi_, then scales_ is a factor to restore the row product + // with a vector to the correct range. + GenericVector scales_; + // Weight deltas. dw_ is the new delta, and updates_ the momentum-decaying + // amount to be added to wf_/wi_. + GENERIC_2D_ARRAY dw_; + GENERIC_2D_ARRAY updates_; + // Iff use_ada_grad_, the sum of squares of dw_. The number of samples is + // given to Update(). Serialized iff use_ada_grad_. + GENERIC_2D_ARRAY dw_sq_sum_; +}; + +} // namespace tesseract. 
+ +#endif // TESSERACT_LSTM_WEIGHTMATRIX_H_ diff --git a/neural_networks/runtime/Makefile.am b/neural_networks/runtime/Makefile.am deleted file mode 100644 index de43768f..00000000 --- a/neural_networks/runtime/Makefile.am +++ /dev/null @@ -1,25 +0,0 @@ -AM_CPPFLAGS += \ - -DUSE_STD_NAMESPACE \ - -I$(top_srcdir)/cutil -I$(top_srcdir)/ccutil \ - -I$(top_srcdir)/ccstruct -I$(top_srcdir)/dict \ - -I$(top_srcdir)/image -I$(top_srcdir)/viewer - -if VISIBILITY -AM_CPPFLAGS += -DTESS_EXPORTS \ - -fvisibility=hidden -fvisibility-inlines-hidden -endif - -noinst_HEADERS = \ - input_file_buffer.h neural_net.h neuron.h - -if !USING_MULTIPLELIBS -noinst_LTLIBRARIES = libtesseract_neural.la -else -lib_LTLIBRARIES = libtesseract_neural.la -libtesseract_neural_la_LDFLAGS = -version-info $(GENERIC_LIBRARY_VERSION) -endif - -libtesseract_neural_la_SOURCES = \ - input_file_buffer.cpp neural_net.cpp neuron.cpp sigmoid_table.cpp - - diff --git a/neural_networks/runtime/input_file_buffer.cpp b/neural_networks/runtime/input_file_buffer.cpp deleted file mode 100644 index c3ca67b6..00000000 --- a/neural_networks/runtime/input_file_buffer.cpp +++ /dev/null @@ -1,36 +0,0 @@ -// Copyright 2008 Google Inc. -// All Rights Reserved. -// Author: ahmadab@google.com (Ahmad Abdulkader) -// -// input_file_buffer.h: Declarations of a class for an object that -// represents an input file buffer. 
- -#include -#include "input_file_buffer.h" - -namespace tesseract { -// default and only contsructor -InputFileBuffer::InputFileBuffer(const string &file_name) - : file_name_(file_name) { - fp_ = NULL; -} - -// virtual destructor -InputFileBuffer::~InputFileBuffer() { - if (fp_ != NULL) { - fclose(fp_); - } -} - -// Read the specified number of bytes to the specified input buffer -int InputFileBuffer::Read(void *buffer, int bytes_to_read) { - // open the file if necessary - if (fp_ == NULL) { - fp_ = fopen(file_name_.c_str(), "rb"); - if (fp_ == NULL) { - return 0; - } - } - return fread(buffer, 1, bytes_to_read, fp_); -} -} diff --git a/neural_networks/runtime/input_file_buffer.h b/neural_networks/runtime/input_file_buffer.h deleted file mode 100644 index 5aa7465c..00000000 --- a/neural_networks/runtime/input_file_buffer.h +++ /dev/null @@ -1,31 +0,0 @@ -// Copyright 2008 Google Inc. -// All Rights Reserved. -// Author: ahmadab@google.com (Ahmad Abdulkader) -// -// input_file_buffer.h: Declarations of a class for an object that -// represents an input file buffer. -// - -#ifndef INPUT_FILE_BUFFER_H -#define INPUT_FILE_BUFFER_H - -#include -#include -#ifdef USE_STD_NAMESPACE -using std::string; -#endif - -namespace tesseract { -class InputFileBuffer { - public: - explicit InputFileBuffer(const string &file_name); - virtual ~InputFileBuffer(); - int Read(void *buffer, int bytes_to_read); - - protected: - string file_name_; - FILE *fp_; -}; -} - -#endif // INPUT_FILE_BUFFER_H__ diff --git a/neural_networks/runtime/neural_net.cpp b/neural_networks/runtime/neural_net.cpp deleted file mode 100644 index fd2c65af..00000000 --- a/neural_networks/runtime/neural_net.cpp +++ /dev/null @@ -1,305 +0,0 @@ -// Copyright 2008 Google Inc. -// All Rights Reserved. 
-// Author: ahmadab@google.com (Ahmad Abdulkader) -// -// neural_net.cpp: Declarations of a class for an object that -// represents an arbitrary network of neurons -// -#include -#include -#include "neural_net.h" -#include "input_file_buffer.h" - -namespace tesseract { - -NeuralNet::NeuralNet() { - Init(); -} - -NeuralNet::~NeuralNet() { - // clean up the wts chunks vector - for (int vec = 0; vec < static_cast(wts_vec_.size()); vec++) { - delete wts_vec_[vec]; - } - // clean up neurons - delete []neurons_; - // clean up nodes - for (int node_idx = 0; node_idx < neuron_cnt_; node_idx++) { - delete []fast_nodes_[node_idx].inputs; - } - -} - -// Initiaization function -void NeuralNet::Init() { - read_only_ = true; - auto_encoder_ = false; - alloc_wgt_cnt_ = 0; - wts_cnt_ = 0; - neuron_cnt_ = 0; - in_cnt_ = 0; - out_cnt_ = 0; - wts_vec_.clear(); - neurons_ = NULL; - inputs_mean_.clear(); - inputs_std_dev_.clear(); - inputs_min_.clear(); - inputs_max_.clear(); -} - -// Does a fast feedforward for read_only nets -// Templatized for float and double Types -template bool NeuralNet::FastFeedForward(const Type *inputs, - Type *outputs) { - int node_idx = 0; - Node *node = &fast_nodes_[0]; - // feed inputs in and offset them by the pre-computed bias - for (node_idx = 0; node_idx < in_cnt_; node_idx++, node++) { - node->out = inputs[node_idx] - node->bias; - } - // compute nodes activations and outputs - for (;node_idx < neuron_cnt_; node_idx++, node++) { - double activation = -node->bias; - for (int fan_in_idx = 0; fan_in_idx < node->fan_in_cnt; fan_in_idx++) { - activation += (node->inputs[fan_in_idx].input_weight * - node->inputs[fan_in_idx].input_node->out); - } - node->out = Neuron::Sigmoid(activation); - } - // copy the outputs to the output buffers - node = &fast_nodes_[neuron_cnt_ - out_cnt_]; - for (node_idx = 0; node_idx < out_cnt_; node_idx++, node++) { - outputs[node_idx] = node->out; - } - return true; -} - -// Performs a feedforward for general nets. 
Used mainly in training mode -// Templatized for float and double Types -template bool NeuralNet::FeedForward(const Type *inputs, - Type *outputs) { - // call the fast version in case of readonly nets - if (read_only_) { - return FastFeedForward(inputs, outputs); - } - // clear all neurons - Clear(); - // for auto encoders, apply no input normalization - if (auto_encoder_) { - for (int in = 0; in < in_cnt_; in++) { - neurons_[in].set_output(inputs[in]); - } - } else { - // Input normalization : subtract mean and divide by stddev - for (int in = 0; in < in_cnt_; in++) { - neurons_[in].set_output((inputs[in] - inputs_min_[in]) / - (inputs_max_[in] - inputs_min_[in])); - neurons_[in].set_output((neurons_[in].output() - inputs_mean_[in]) / - inputs_std_dev_[in]); - } - } - // compute the net outputs: follow a pull model each output pulls the - // outputs of its input nodes and so on - for (int out = neuron_cnt_ - out_cnt_; out < neuron_cnt_; out++) { - neurons_[out].FeedForward(); - // copy the values to the output buffer - outputs[out] = neurons_[out].output(); - } - return true; -} - -// Sets a connection between two neurons -bool NeuralNet::SetConnection(int from, int to) { - // allocate the wgt - float *wts = AllocWgt(1); - if (wts == NULL) { - return false; - } - // register the connection - neurons_[to].AddFromConnection(neurons_ + from, wts, 1); - return true; -} - -// Create a fast readonly version of the net -bool NeuralNet::CreateFastNet() { - fast_nodes_.resize(neuron_cnt_); - // build the node structures - int wts_cnt = 0; - for (int node_idx = 0; node_idx < neuron_cnt_; node_idx++) { - Node *node = &fast_nodes_[node_idx]; - if (neurons_[node_idx].node_type() == Neuron::Input) { - // Input neurons have no fan-in - node->fan_in_cnt = 0; - node->inputs = NULL; - // Input bias is the normalization offset computed from - // training input stats - if (fabs(inputs_max_[node_idx] - inputs_min_[node_idx]) < - kMinInputRange) { - // if the range approaches zero, the 
stdev is not defined, - // this indicates that this input does not change. - // Set the bias to zero - node->bias = 0.0f; - } else { - node->bias = inputs_min_[node_idx] + (inputs_mean_[node_idx] * - (inputs_max_[node_idx] - inputs_min_[node_idx])); - } - } else { - node->bias = neurons_[node_idx].bias(); - node->fan_in_cnt = neurons_[node_idx].fan_in_cnt(); - // allocate memory for fan-in nodes - node->inputs = new WeightedNode[node->fan_in_cnt]; - if (node->inputs == NULL) { - return false; - } - for (int fan_in = 0; fan_in < node->fan_in_cnt; fan_in++) { - // identify fan-in neuron - const int id = neurons_[node_idx].fan_in(fan_in)->id(); - // Feedback connections are not allowed and should never happen - if (id >= node_idx) { - return false; - } - // add the the fan-in neuron and its wgt - node->inputs[fan_in].input_node = &fast_nodes_[id]; - float wgt_val = neurons_[node_idx].fan_in_wts(fan_in); - // for input neurons normalize the wgt by the input scaling - // values to save time during feedforward - if (neurons_[node_idx].fan_in(fan_in)->node_type() == Neuron::Input) { - // if the range approaches zero, the stdev is not defined, - // this indicates that this input does not change. 
- // Set the weight to zero - if (fabs(inputs_max_[id] - inputs_min_[id]) < kMinInputRange) { - wgt_val = 0.0f; - } else { - wgt_val /= ((inputs_max_[id] - inputs_min_[id]) * - inputs_std_dev_[id]); - } - } - node->inputs[fan_in].input_weight = wgt_val; - } - // incr wgt count to validate against at the end - wts_cnt += node->fan_in_cnt; - } - } - // sanity check - return wts_cnt_ == wts_cnt; -} - -// returns a pointer to the requested set of weights -// Allocates in chunks -float * NeuralNet::AllocWgt(int wgt_cnt) { - // see if need to allocate a new chunk of wts - if (wts_vec_.size() == 0 || (alloc_wgt_cnt_ + wgt_cnt) > kWgtChunkSize) { - // add the new chunck to the wts_chunks vector - wts_vec_.push_back(new vector (kWgtChunkSize)); - alloc_wgt_cnt_ = 0; - } - float *ret_ptr = &((*wts_vec_.back())[alloc_wgt_cnt_]); - // incr usage counts - alloc_wgt_cnt_ += wgt_cnt; - wts_cnt_ += wgt_cnt; - return ret_ptr; -} - -// create a new net object using an input file as a source -NeuralNet *NeuralNet::FromFile(const string file_name) { - // open the file - InputFileBuffer input_buff(file_name); - // create a new net object using input buffer - NeuralNet *net_obj = FromInputBuffer(&input_buff); - return net_obj; -} - -// create a net object from an input buffer -NeuralNet *NeuralNet::FromInputBuffer(InputFileBuffer *ib) { - // create a new net object - NeuralNet *net_obj = new NeuralNet(); - if (net_obj == NULL) { - return NULL; - } - // load the net - if (!net_obj->ReadBinary(ib)) { - delete net_obj; - net_obj = NULL; - } - return net_obj; -} - -// Compute the output of a specific output node. 
-// This function is useful for application that are interested in a single -// output of the net and do not want to waste time on the rest -// This is the fast-read-only version of this function -template bool NeuralNet::FastGetNetOutput(const Type *inputs, - int output_id, - Type *output) { - // feed inputs in and offset them by the pre-computed bias - int node_idx = 0; - Node *node = &fast_nodes_[0]; - for (node_idx = 0; node_idx < in_cnt_; node_idx++, node++) { - node->out = inputs[node_idx] - node->bias; - } - - // compute nodes' activations and outputs for hidden nodes if any - int hidden_node_cnt = neuron_cnt_ - out_cnt_; - for (;node_idx < hidden_node_cnt; node_idx++, node++) { - double activation = -node->bias; - for (int fan_in_idx = 0; fan_in_idx < node->fan_in_cnt; fan_in_idx++) { - activation += (node->inputs[fan_in_idx].input_weight * - node->inputs[fan_in_idx].input_node->out); - } - node->out = Neuron::Sigmoid(activation); - } - - // compute the output of the required output node - node += output_id; - double activation = -node->bias; - for (int fan_in_idx = 0; fan_in_idx < node->fan_in_cnt; fan_in_idx++) { - activation += (node->inputs[fan_in_idx].input_weight * - node->inputs[fan_in_idx].input_node->out); - } - (*output) = Neuron::Sigmoid(activation); - return true; -} - -// Performs a feedforward for general nets. 
Used mainly in training mode -// Templatized for float and double Types -template bool NeuralNet::GetNetOutput(const Type *inputs, - int output_id, - Type *output) { - // validate output id - if (output_id < 0 || output_id >= out_cnt_) { - return false; - } - - // call the fast version in case of readonly nets - if (read_only_) { - return FastGetNetOutput(inputs, output_id, output); - } - - // For the slow version, we'll just call FeedForward and return the - // appropriate output - vector outputs(out_cnt_); - if (!FeedForward(inputs, &outputs[0])) { - return false; - } - (*output) = outputs[output_id]; - - return true; -} - -// Instantiate all supported templates now that the functions have been defined. -template bool NeuralNet::FeedForward(const float *inputs, float *outputs); -template bool NeuralNet::FeedForward(const double *inputs, double *outputs); -template bool NeuralNet::FastFeedForward(const float *inputs, float *outputs); -template bool NeuralNet::FastFeedForward(const double *inputs, - double *outputs); -template bool NeuralNet::GetNetOutput(const float *inputs, int output_id, - float *output); -template bool NeuralNet::GetNetOutput(const double *inputs, int output_id, - double *output); -template bool NeuralNet::FastGetNetOutput(const float *inputs, int output_id, - float *output); -template bool NeuralNet::FastGetNetOutput(const double *inputs, int output_id, - double *output); -template bool NeuralNet::ReadBinary(InputFileBuffer *input_buffer); - -} diff --git a/neural_networks/runtime/neural_net.h b/neural_networks/runtime/neural_net.h deleted file mode 100644 index 91d0d68a..00000000 --- a/neural_networks/runtime/neural_net.h +++ /dev/null @@ -1,246 +0,0 @@ -// Copyright 2008 Google Inc. -// All Rights Reserved. 
-// Author: ahmadab@google.com (Ahmad Abdulkader) -// -// neural_net.h: Declarations of a class for an object that -// represents an arbitrary network of neurons -// - -#ifndef NEURAL_NET_H -#define NEURAL_NET_H - -#include -#include -#include "neuron.h" -#include "input_file_buffer.h" - -namespace tesseract { - -// Minimum input range below which we set the input weight to zero -static const float kMinInputRange = 1e-6f; - -class NeuralNet { - public: - NeuralNet(); - virtual ~NeuralNet(); - // create a net object from a file. Uses stdio - static NeuralNet *FromFile(const string file_name); - // create a net object from an input buffer - static NeuralNet *FromInputBuffer(InputFileBuffer *ib); - // Different flavors of feed forward function - template bool FeedForward(const Type *inputs, - Type *outputs); - // Compute the output of a specific output node. - // This function is useful for application that are interested in a single - // output of the net and do not want to waste time on the rest - template bool GetNetOutput(const Type *inputs, - int output_id, - Type *output); - // Accessor functions - int in_cnt() const { return in_cnt_; } - int out_cnt() const { return out_cnt_; } - - protected: - struct Node; - // A node-weight pair - struct WeightedNode { - Node *input_node; - float input_weight; - }; - // node struct used for fast feedforward in - // Read only nets - struct Node { - float out; - float bias; - int fan_in_cnt; - WeightedNode *inputs; - }; - // Read-Only flag (no training: On by default) - // will presumeably be set to false by - // the inherting TrainableNeuralNet class - bool read_only_; - // input count - int in_cnt_; - // output count - int out_cnt_; - // Total neuron count (including inputs) - int neuron_cnt_; - // count of unique weights - int wts_cnt_; - // Neuron vector - Neuron *neurons_; - // size of allocated weight chunk (in weights) - // This is basically the size of the biggest network - // that I have trained. 
However, the class will allow - // a bigger sized net if desired - static const int kWgtChunkSize = 0x10000; - // Magic number expected at the beginning of the NN - // binary file - static const unsigned int kNetSignature = 0xFEFEABD0; - // count of allocated wgts in the last chunk - int alloc_wgt_cnt_; - // vector of weights buffers - vector *>wts_vec_; - // Is the net an auto-encoder type - bool auto_encoder_; - // vector of input max values - vector inputs_max_; - // vector of input min values - vector inputs_min_; - // vector of input mean values - vector inputs_mean_; - // vector of input standard deviation values - vector inputs_std_dev_; - // vector of input offsets used by fast read-only - // feedforward function - vector fast_nodes_; - // Network Initialization function - void Init(); - // Clears all neurons - void Clear() { - for (int node = 0; node < neuron_cnt_; node++) { - neurons_[node].Clear(); - } - } - // Reads the net from an input buffer - template bool ReadBinary(ReadBuffType *input_buff) { - // Init vars - Init(); - // is this an autoencoder - unsigned int read_val; - unsigned int auto_encode; - // read and verify signature - if (input_buff->Read(&read_val, sizeof(read_val)) != sizeof(read_val)) { - return false; - } - if (read_val != kNetSignature) { - return false; - } - if (input_buff->Read(&auto_encode, sizeof(auto_encode)) != - sizeof(auto_encode)) { - return false; - } - auto_encoder_ = auto_encode; - // read and validate total # of nodes - if (input_buff->Read(&read_val, sizeof(read_val)) != sizeof(read_val)) { - return false; - } - neuron_cnt_ = read_val; - if (neuron_cnt_ <= 0) { - return false; - } - // set the size of the neurons vector - neurons_ = new Neuron[neuron_cnt_]; - if (neurons_ == NULL) { - return false; - } - // read & validate inputs - if (input_buff->Read(&read_val, sizeof(read_val)) != sizeof(read_val)) { - return false; - } - in_cnt_ = read_val; - if (in_cnt_ <= 0) { - return false; - } - // read outputs - if 
(input_buff->Read(&read_val, sizeof(read_val)) != sizeof(read_val)) { - return false; - } - out_cnt_ = read_val; - if (out_cnt_ <= 0) { - return false; - } - // set neuron ids and types - for (int idx = 0; idx < neuron_cnt_; idx++) { - neurons_[idx].set_id(idx); - // input type - if (idx < in_cnt_) { - neurons_[idx].set_node_type(Neuron::Input); - } else if (idx >= (neuron_cnt_ - out_cnt_)) { - neurons_[idx].set_node_type(Neuron::Output); - } else { - neurons_[idx].set_node_type(Neuron::Hidden); - } - } - // read the connections - for (int node_idx = 0; node_idx < neuron_cnt_; node_idx++) { - // read fanout - if (input_buff->Read(&read_val, sizeof(read_val)) != sizeof(read_val)) { - return false; - } - // read the neuron's info - int fan_out_cnt = read_val; - for (int fan_out_idx = 0; fan_out_idx < fan_out_cnt; fan_out_idx++) { - // read the neuron id - if (input_buff->Read(&read_val, sizeof(read_val)) != sizeof(read_val)) { - return false; - } - // create the connection - if (!SetConnection(node_idx, read_val)) { - return false; - } - } - } - // read all the neurons' fan-in connections - for (int node_idx = 0; node_idx < neuron_cnt_; node_idx++) { - // read - if (!neurons_[node_idx].ReadBinary(input_buff)) { - return false; - } - } - // size input stats vector to expected input size - inputs_mean_.resize(in_cnt_); - inputs_std_dev_.resize(in_cnt_); - inputs_min_.resize(in_cnt_); - inputs_max_.resize(in_cnt_); - // read stats - if (input_buff->Read(&(inputs_mean_.front()), - sizeof(inputs_mean_[0]) * in_cnt_) != - sizeof(inputs_mean_[0]) * in_cnt_) { - return false; - } - if (input_buff->Read(&(inputs_std_dev_.front()), - sizeof(inputs_std_dev_[0]) * in_cnt_) != - sizeof(inputs_std_dev_[0]) * in_cnt_) { - return false; - } - if (input_buff->Read(&(inputs_min_.front()), - sizeof(inputs_min_[0]) * in_cnt_) != - sizeof(inputs_min_[0]) * in_cnt_) { - return false; - } - if (input_buff->Read(&(inputs_max_.front()), - sizeof(inputs_max_[0]) * in_cnt_) != - 
sizeof(inputs_max_[0]) * in_cnt_) { - return false; - } - // create a readonly version for fast feedforward - if (read_only_) { - return CreateFastNet(); - } - return true; - } - - // creates a connection between two nodes - bool SetConnection(int from, int to); - // Create a read only version of the net that - // has faster feedforward performance - bool CreateFastNet(); - // internal function to allocate a new set of weights - // Centralized weight allocation attempts to increase - // weights locality of reference making it more cache friendly - float *AllocWgt(int wgt_cnt); - // different flavors read-only feedforward function - template bool FastFeedForward(const Type *inputs, - Type *outputs); - // Compute the output of a specific output node. - // This function is useful for application that are interested in a single - // output of the net and do not want to waste time on the rest - // This is the fast-read-only version of this function - template bool FastGetNetOutput(const Type *inputs, - int output_id, - Type *output); -}; -} - -#endif // NEURAL_NET_H__ diff --git a/neural_networks/runtime/neuron.cpp b/neural_networks/runtime/neuron.cpp deleted file mode 100644 index 36309082..00000000 --- a/neural_networks/runtime/neuron.cpp +++ /dev/null @@ -1,94 +0,0 @@ -// Copyright 2008 Google Inc. -// All Rights Reserved. 
-// Author: ahmadab@google.com (Ahmad Abdulkader) -// -// neuron.cpp: The implementation of a class for an object -// that represents a single neuron in a neural network - -#include "neuron.h" -#include "input_file_buffer.h" - -namespace tesseract { - -// Instantiate all supported templates -template bool Neuron::ReadBinary(InputFileBuffer *input_buffer); - -// default and only constructor -Neuron::Neuron() { - Init(); -} - -// virtual destructor -Neuron::~Neuron() { -} - -// Initializer -void Neuron::Init() { - id_ = -1; - frwd_dirty_ = false; - fan_in_.clear(); - fan_in_weights_.clear(); - activation_ = 0.0f; - output_ = 0.0f; - bias_ = 0.0f; - node_type_ = Unknown; -} - -// Computes the activation and output of the neuron if not fresh -// by pulling the outputs of all fan-in neurons -void Neuron::FeedForward() { - if (!frwd_dirty_ ) { - return; - } - // nothing to do for input nodes: just pass the input to the o/p - // otherwise, pull the output of all fan-in neurons - if (node_type_ != Input) { - int fan_in_cnt = fan_in_.size(); - // sum out the activation - activation_ = -bias_; - for (int in = 0; in < fan_in_cnt; in++) { - if (fan_in_[in]->frwd_dirty_) { - fan_in_[in]->FeedForward(); - } - activation_ += ((*(fan_in_weights_[in])) * fan_in_[in]->output_); - } - // sigmoid it - output_ = Sigmoid(activation_); - } - frwd_dirty_ = false; -} - -// set the type of the neuron -void Neuron::set_node_type(NeuronTypes Type) { - node_type_ = Type; -} - -// Adds new connections *to* this neuron *From* -// a target neuron using specfied params -// Note that what is actually copied in this function are pointers to the -// specified Neurons and weights and not the actualt values. This is by -// design to centralize the alloction of neurons and weights and so -// increase the locality of reference and improve cache-hits resulting -// in a faster net. 
This technique resulted in a 2X-10X speedup -// (depending on network size and processor) -void Neuron::AddFromConnection(Neuron *neurons, - float *wts_offset, - int from_cnt) { - for (int in = 0; in < from_cnt; in++) { - fan_in_.push_back(neurons + in); - fan_in_weights_.push_back(wts_offset + in); - } -} - -// fast computation of sigmoid function using a lookup table -// defined in sigmoid_table.cpp -float Neuron::Sigmoid(float activation) { - if (activation <= -10.0f) { - return 0.0f; - } else if (activation >= 10.0f) { - return 1.0f; - } else { - return kSigmoidTable[static_cast(100 * (activation + 10.0))]; - } -} -} diff --git a/neural_networks/runtime/neuron.h b/neural_networks/runtime/neuron.h deleted file mode 100644 index a13d4a2e..00000000 --- a/neural_networks/runtime/neuron.h +++ /dev/null @@ -1,147 +0,0 @@ -// Copyright 2008 Google Inc. -// All Rights Reserved. -// Author: ahmadab@google.com (Ahmad Abdulkader) -// -// neuron.h: Declarations of a class for an object that -// represents a single neuron in a neural network -// - -#ifndef NEURON_H -#define NEURON_H - -#include -#include - -#ifdef USE_STD_NAMESPACE -using std::vector; -#endif - -namespace tesseract { - -// Input Node bias values -static const float kInputNodeBias = 0.0f; - -class Neuron { - public: - // Types of nodes - enum NeuronTypes { - Unknown = 0, - Input, - Hidden, - Output - }; - Neuron(); - ~Neuron(); - // set the forward dirty flag indicating that the - // activation of the net is not fresh - void Clear() { - frwd_dirty_ = true; - } - // Read a binary representation of the neuron info from - // an input buffer. 
- template bool ReadBinary(BuffType *input_buff) { - float val; - if (input_buff->Read(&val, sizeof(val)) != sizeof(val)) { - return false; - } - // input nodes should have no biases - if (node_type_ == Input) { - bias_ = kInputNodeBias; - } else { - bias_ = val; - } - // read fanin count - int fan_in_cnt; - if (input_buff->Read(&fan_in_cnt, sizeof(fan_in_cnt)) != - sizeof(fan_in_cnt)) { - return false; - } - // validate fan-in cnt - if (fan_in_cnt != fan_in_.size()) { - return false; - } - // read the weights - for (int in = 0; in < fan_in_cnt; in++) { - if (input_buff->Read(&val, sizeof(val)) != sizeof(val)) { - return false; - } - *(fan_in_weights_[in]) = val; - } - return true; - } - - // Add a new connection from this neuron *From* - // a target neuron using specfied params - // Note that what is actually copied in this function are pointers to the - // specified Neurons and weights and not the actualt values. This is by - // design to centralize the alloction of neurons and weights and so - // increase the locality of reference and improve cache-hits resulting - // in a faster net. 
This technique resulted in a 2X-10X speedup - // (depending on network size and processor) - void AddFromConnection(Neuron *neuron_vec, - float *wts_offset, - int from_cnt); - // Set the type of a neuron - void set_node_type(NeuronTypes type); - // Computes the output of the node by - // "pulling" the output of the fan-in nodes - void FeedForward(); - // fast computation of sigmoid function using a lookup table - // defined in sigmoid_table.cpp - static float Sigmoid(float activation); - // Accessor functions - float output() const { - return output_; - } - void set_output(float out_val) { - output_ = out_val; - } - int id() const { - return id_; - } - int fan_in_cnt() const { - return fan_in_.size(); - } - Neuron * fan_in(int idx) const { - return fan_in_[idx]; - } - float fan_in_wts(int idx) const { - return *(fan_in_weights_[idx]); - } - void set_id(int id) { - id_ = id; - } - float bias() const { - return bias_; - } - Neuron::NeuronTypes node_type() const { - return node_type_; - } - - protected: - // Type of Neuron - NeuronTypes node_type_; - // unqique id of the neuron - int id_; - // node bias - float bias_; - // node net activation - float activation_; - // node output - float output_; - // pointers to fanin nodes - vector fan_in_; - // pointers to fanin weights - vector fan_in_weights_; - // Sigmoid function lookup table used for fast computation - // of sigmoid function - static const float kSigmoidTable[]; - // flag determining if the activation of the node - // is fresh or not (dirty) - bool frwd_dirty_; - // Initializer - void Init(); -}; -} - -#endif // NEURON_H__ diff --git a/neural_networks/runtime/sigmoid_table.cpp b/neural_networks/runtime/sigmoid_table.cpp deleted file mode 100644 index f170a108..00000000 --- a/neural_networks/runtime/sigmoid_table.cpp +++ /dev/null @@ -1,514 +0,0 @@ -// Copyright 2007 Google Inc. -// All Rights Reserved. 
-// Author: ahmadab@google.com (Ahmad Abdulkader) -// -// sigmoid_table.cpp: Sigmoid function lookup table - -#include "neuron.h" - -namespace tesseract { - -const float Neuron::kSigmoidTable[] = { - 4.53979E-05f, 4.58541E-05f, 4.63149E-05f, 4.67804E-05f, - 4.72505E-05f, 4.77254E-05f, 4.8205E-05f, 4.86894E-05f, - 4.91787E-05f, 4.9673E-05f, 5.01722E-05f, 5.06764E-05f, - 5.11857E-05f, 5.17001E-05f, 5.22196E-05f, 5.27444E-05f, - 5.32745E-05f, 5.38099E-05f, 5.43506E-05f, 5.48968E-05f, - 5.54485E-05f, 5.60058E-05f, 5.65686E-05f, 5.71371E-05f, - 5.77113E-05f, 5.82913E-05f, 5.88771E-05f, 5.94688E-05f, - 6.00664E-05f, 6.067E-05f, 6.12797E-05f, 6.18956E-05f, - 6.25176E-05f, 6.31459E-05f, 6.37805E-05f, 6.44214E-05f, - 6.50688E-05f, 6.57227E-05f, 6.63832E-05f, 6.70503E-05f, - 6.77241E-05f, 6.84047E-05f, 6.90922E-05f, 6.97865E-05f, - 7.04878E-05f, 7.11962E-05f, 7.19117E-05f, 7.26343E-05f, - 7.33643E-05f, 7.41016E-05f, 7.48462E-05f, 7.55984E-05f, - 7.63581E-05f, 7.71255E-05f, 7.79005E-05f, 7.86834E-05f, - 7.94741E-05f, 8.02728E-05f, 8.10794E-05f, 8.18942E-05f, - 8.27172E-05f, 8.35485E-05f, 8.43881E-05f, 8.52361E-05f, - 8.60927E-05f, 8.69579E-05f, 8.78317E-05f, 8.87144E-05f, - 8.96059E-05f, 9.05064E-05f, 9.14159E-05f, 9.23345E-05f, - 9.32624E-05f, 9.41996E-05f, 9.51463E-05f, 9.61024E-05f, - 9.70682E-05f, 9.80436E-05f, 9.90289E-05f, 0.000100024f, - 0.000101029f, 0.000102044f, 0.00010307f, 0.000104106f, - 0.000105152f, 0.000106209f, 0.000107276f, 0.000108354f, - 0.000109443f, 0.000110542f, 0.000111653f, 0.000112775f, - 0.000113909f, 0.000115053f, 0.000116209f, 0.000117377f, - 0.000118557f, 0.000119748f, 0.000120951f, 0.000122167f, - 0.000123395f, 0.000124635f, 0.000125887f, 0.000127152f, - 0.00012843f, 0.00012972f, 0.000131024f, 0.000132341f, - 0.00013367f, 0.000135014f, 0.00013637f, 0.000137741f, - 0.000139125f, 0.000140523f, 0.000141935f, 0.000143361f, - 0.000144802f, 0.000146257f, 0.000147727f, 0.000149211f, - 0.00015071f, 0.000152225f, 0.000153754f, 0.000155299f, - 
0.00015686f, 0.000158436f, 0.000160028f, 0.000161636f, - 0.000163261f, 0.000164901f, 0.000166558f, 0.000168232f, - 0.000169922f, 0.00017163f, 0.000173354f, 0.000175096f, - 0.000176856f, 0.000178633f, 0.000180428f, 0.000182241f, - 0.000184072f, 0.000185922f, 0.00018779f, 0.000189677f, - 0.000191583f, 0.000193508f, 0.000195452f, 0.000197416f, - 0.0001994f, 0.000201403f, 0.000203427f, 0.000205471f, - 0.000207536f, 0.000209621f, 0.000211727f, 0.000213855f, - 0.000216003f, 0.000218174f, 0.000220366f, 0.00022258f, - 0.000224817f, 0.000227076f, 0.000229357f, 0.000231662f, - 0.00023399f, 0.000236341f, 0.000238715f, 0.000241114f, - 0.000243537f, 0.000245984f, 0.000248455f, 0.000250951f, - 0.000253473f, 0.00025602f, 0.000258592f, 0.00026119f, - 0.000263815f, 0.000266465f, 0.000269143f, 0.000271847f, - 0.000274578f, 0.000277337f, 0.000280123f, 0.000282938f, - 0.000285781f, 0.000288652f, 0.000291552f, 0.000294481f, - 0.00029744f, 0.000300429f, 0.000303447f, 0.000306496f, - 0.000309575f, 0.000312685f, 0.000315827f, 0.000319f, - 0.000322205f, 0.000325442f, 0.000328712f, 0.000332014f, - 0.00033535f, 0.000338719f, 0.000342122f, 0.00034556f, - 0.000349031f, 0.000352538f, 0.00035608f, 0.000359657f, - 0.00036327f, 0.00036692f, 0.000370606f, 0.000374329f, - 0.00037809f, 0.000381888f, 0.000385725f, 0.0003896f, - 0.000393514f, 0.000397467f, 0.00040146f, 0.000405494f, - 0.000409567f, 0.000413682f, 0.000417838f, 0.000422035f, - 0.000426275f, 0.000430557f, 0.000434882f, 0.000439251f, - 0.000443664f, 0.000448121f, 0.000452622f, 0.000457169f, - 0.000461762f, 0.0004664f, 0.000471085f, 0.000475818f, - 0.000480597f, 0.000485425f, 0.000490301f, 0.000495226f, - 0.000500201f, 0.000505226f, 0.000510301f, 0.000515427f, - 0.000520604f, 0.000525833f, 0.000531115f, 0.00053645f, - 0.000541839f, 0.000547281f, 0.000552779f, 0.000558331f, - 0.000563939f, 0.000569604f, 0.000575325f, 0.000581104f, - 0.00058694f, 0.000592836f, 0.00059879f, 0.000604805f, - 0.000610879f, 0.000617015f, 0.000623212f, 
0.000629472f, - 0.000635794f, 0.00064218f, 0.00064863f, 0.000655144f, - 0.000661724f, 0.00066837f, 0.000675083f, 0.000681863f, - 0.000688711f, 0.000695628f, 0.000702614f, 0.00070967f, - 0.000716798f, 0.000723996f, 0.000731267f, 0.000738611f, - 0.000746029f, 0.000753521f, 0.000761088f, 0.000768731f, - 0.000776451f, 0.000784249f, 0.000792124f, 0.000800079f, - 0.000808113f, 0.000816228f, 0.000824425f, 0.000832703f, - 0.000841065f, 0.000849511f, 0.000858041f, 0.000866657f, - 0.00087536f, 0.000884149f, 0.000893027f, 0.000901994f, - 0.000911051f, 0.000920199f, 0.000929439f, 0.000938771f, - 0.000948197f, 0.000957717f, 0.000967333f, 0.000977045f, - 0.000986855f, 0.000996763f, 0.001006771f, 0.001016879f, - 0.001027088f, 0.0010374f, 0.001047815f, 0.001058334f, - 0.00106896f, 0.001079691f, 0.00109053f, 0.001101478f, - 0.001112536f, 0.001123705f, 0.001134985f, 0.001146379f, - 0.001157887f, 0.00116951f, 0.00118125f, 0.001193108f, - 0.001205084f, 0.001217181f, 0.001229399f, 0.001241739f, - 0.001254203f, 0.001266792f, 0.001279507f, 0.00129235f, - 0.001305321f, 0.001318423f, 0.001331655f, 0.001345021f, - 0.00135852f, 0.001372155f, 0.001385926f, 0.001399835f, - 0.001413884f, 0.001428073f, 0.001442405f, 0.00145688f, - 0.001471501f, 0.001486267f, 0.001501182f, 0.001516247f, - 0.001531462f, 0.001546829f, 0.001562351f, 0.001578028f, - 0.001593862f, 0.001609855f, 0.001626008f, 0.001642323f, - 0.001658801f, 0.001675444f, 0.001692254f, 0.001709233f, - 0.001726381f, 0.001743701f, 0.001761195f, 0.001778864f, - 0.00179671f, 0.001814734f, 0.001832939f, 0.001851326f, - 0.001869898f, 0.001888655f, 0.0019076f, 0.001926735f, - 0.001946061f, 0.001965581f, 0.001985296f, 0.002005209f, - 0.00202532f, 0.002045634f, 0.00206615f, 0.002086872f, - 0.002107801f, 0.00212894f, 0.00215029f, 0.002171854f, - 0.002193633f, 0.002215631f, 0.002237849f, 0.002260288f, - 0.002282953f, 0.002305844f, 0.002328964f, 0.002352316f, - 0.002375901f, 0.002399721f, 0.002423781f, 0.00244808f, - 0.002472623f, 0.002497411f, 
0.002522447f, 0.002547734f, - 0.002573273f, 0.002599068f, 0.00262512f, 0.002651433f, - 0.002678009f, 0.002704851f, 0.002731961f, 0.002759342f, - 0.002786996f, 0.002814927f, 0.002843137f, 0.002871629f, - 0.002900406f, 0.00292947f, 0.002958825f, 0.002988472f, - 0.003018416f, 0.003048659f, 0.003079205f, 0.003110055f, - 0.003141213f, 0.003172683f, 0.003204467f, 0.003236568f, - 0.00326899f, 0.003301735f, 0.003334807f, 0.00336821f, - 0.003401946f, 0.003436018f, 0.003470431f, 0.003505187f, - 0.00354029f, 0.003575744f, 0.003611551f, 0.003647715f, - 0.00368424f, 0.003721129f, 0.003758387f, 0.003796016f, - 0.00383402f, 0.003872403f, 0.00391117f, 0.003950322f, - 0.003989865f, 0.004029802f, 0.004070138f, 0.004110875f, - 0.004152019f, 0.004193572f, 0.00423554f, 0.004277925f, - 0.004320734f, 0.004363968f, 0.004407633f, 0.004451734f, - 0.004496273f, 0.004541256f, 0.004586687f, 0.004632571f, - 0.004678911f, 0.004725713f, 0.00477298f, 0.004820718f, - 0.004868931f, 0.004917624f, 0.004966802f, 0.005016468f, - 0.005066629f, 0.005117289f, 0.005168453f, 0.005220126f, - 0.005272312f, 0.005325018f, 0.005378247f, 0.005432006f, - 0.005486299f, 0.005541132f, 0.005596509f, 0.005652437f, - 0.005708921f, 0.005765966f, 0.005823577f, 0.005881761f, - 0.005940522f, 0.005999867f, 0.006059801f, 0.006120331f, - 0.006181461f, 0.006243198f, 0.006305547f, 0.006368516f, - 0.006432108f, 0.006496332f, 0.006561193f, 0.006626697f, - 0.006692851f, 0.006759661f, 0.006827132f, 0.006895273f, - 0.006964089f, 0.007033587f, 0.007103774f, 0.007174656f, - 0.00724624f, 0.007318533f, 0.007391541f, 0.007465273f, - 0.007539735f, 0.007614933f, 0.007690876f, 0.00776757f, - 0.007845023f, 0.007923242f, 0.008002235f, 0.008082009f, - 0.008162571f, 0.00824393f, 0.008326093f, 0.008409068f, - 0.008492863f, 0.008577485f, 0.008662944f, 0.008749246f, - 0.0088364f, 0.008924415f, 0.009013299f, 0.009103059f, - 0.009193705f, 0.009285246f, 0.009377689f, 0.009471044f, - 0.009565319f, 0.009660523f, 0.009756666f, 0.009853756f, - 
0.009951802f, 0.010050814f, 0.010150801f, 0.010251772f, - 0.010353738f, 0.010456706f, 0.010560688f, 0.010665693f, - 0.01077173f, 0.01087881f, 0.010986943f, 0.011096138f, - 0.011206406f, 0.011317758f, 0.011430203f, 0.011543752f, - 0.011658417f, 0.011774206f, 0.011891132f, 0.012009204f, - 0.012128435f, 0.012248835f, 0.012370415f, 0.012493186f, - 0.012617161f, 0.012742349f, 0.012868764f, 0.012996417f, - 0.013125318f, 0.013255481f, 0.013386918f, 0.01351964f, - 0.013653659f, 0.013788989f, 0.01392564f, 0.014063627f, - 0.014202961f, 0.014343656f, 0.014485724f, 0.014629178f, - 0.014774032f, 0.014920298f, 0.01506799f, 0.015217121f, - 0.015367706f, 0.015519757f, 0.015673288f, 0.015828314f, - 0.015984848f, 0.016142905f, 0.016302499f, 0.016463645f, - 0.016626356f, 0.016790648f, 0.016956536f, 0.017124033f, - 0.017293157f, 0.01746392f, 0.01763634f, 0.017810432f, - 0.01798621f, 0.018163691f, 0.018342891f, 0.018523825f, - 0.01870651f, 0.018890962f, 0.019077197f, 0.019265233f, - 0.019455085f, 0.01964677f, 0.019840306f, 0.020035709f, - 0.020232997f, 0.020432187f, 0.020633297f, 0.020836345f, - 0.021041347f, 0.021248323f, 0.02145729f, 0.021668266f, - 0.021881271f, 0.022096322f, 0.022313439f, 0.022532639f, - 0.022753943f, 0.02297737f, 0.023202938f, 0.023430668f, - 0.023660578f, 0.023892689f, 0.024127021f, 0.024363594f, - 0.024602428f, 0.024843544f, 0.025086962f, 0.025332703f, - 0.025580788f, 0.025831239f, 0.026084075f, 0.02633932f, - 0.026596994f, 0.026857119f, 0.027119717f, 0.027384811f, - 0.027652422f, 0.027922574f, 0.028195288f, 0.028470588f, - 0.028748496f, 0.029029036f, 0.029312231f, 0.029598104f, - 0.02988668f, 0.030177981f, 0.030472033f, 0.030768859f, - 0.031068484f, 0.031370932f, 0.031676228f, 0.031984397f, - 0.032295465f, 0.032609455f, 0.032926395f, 0.033246309f, - 0.033569223f, 0.033895164f, 0.034224158f, 0.03455623f, - 0.034891409f, 0.035229719f, 0.035571189f, 0.035915846f, - 0.036263716f, 0.036614828f, 0.036969209f, 0.037326887f, - 0.037687891f, 0.038052247f, 0.038419986f, 
0.038791134f, - 0.039165723f, 0.03954378f, 0.039925334f, 0.040310415f, - 0.040699054f, 0.041091278f, 0.041487119f, 0.041886607f, - 0.042289772f, 0.042696644f, 0.043107255f, 0.043521635f, - 0.043939815f, 0.044361828f, 0.044787703f, 0.045217473f, - 0.045651171f, 0.046088827f, 0.046530475f, 0.046976146f, - 0.047425873f, 0.04787969f, 0.048337629f, 0.048799723f, - 0.049266006f, 0.049736512f, 0.050211273f, 0.050690325f, - 0.051173701f, 0.051661435f, 0.052153563f, 0.052650118f, - 0.053151136f, 0.053656652f, 0.0541667f, 0.054681317f, - 0.055200538f, 0.055724398f, 0.056252934f, 0.056786181f, - 0.057324176f, 0.057866955f, 0.058414556f, 0.058967013f, - 0.059524366f, 0.06008665f, 0.060653903f, 0.061226163f, - 0.061803466f, 0.062385851f, 0.062973356f, 0.063566018f, - 0.064163876f, 0.064766969f, 0.065375333f, 0.065989009f, - 0.066608036f, 0.067232451f, 0.067862294f, 0.068497604f, - 0.06913842f, 0.069784783f, 0.070436731f, 0.071094304f, - 0.071757542f, 0.072426485f, 0.073101173f, 0.073781647f, - 0.074467945f, 0.075160109f, 0.07585818f, 0.076562197f, - 0.077272202f, 0.077988235f, 0.078710337f, 0.079438549f, - 0.080172912f, 0.080913467f, 0.081660255f, 0.082413318f, - 0.083172696f, 0.083938432f, 0.084710566f, 0.085489139f, - 0.086274194f, 0.087065772f, 0.087863915f, 0.088668663f, - 0.089480059f, 0.090298145f, 0.091122961f, 0.09195455f, - 0.092792953f, 0.093638212f, 0.094490369f, 0.095349465f, - 0.096215542f, 0.097088641f, 0.097968804f, 0.098856073f, - 0.099750489f, 0.100652094f, 0.101560928f, 0.102477033f, - 0.103400451f, 0.104331223f, 0.10526939f, 0.106214992f, - 0.10716807f, 0.108128667f, 0.109096821f, 0.110072574f, - 0.111055967f, 0.112047039f, 0.11304583f, 0.114052381f, - 0.115066732f, 0.116088922f, 0.117118991f, 0.118156978f, - 0.119202922f, 0.120256862f, 0.121318838f, 0.122388887f, - 0.123467048f, 0.124553358f, 0.125647857f, 0.12675058f, - 0.127861566f, 0.128980852f, 0.130108474f, 0.131244469f, - 0.132388874f, 0.133541723f, 0.134703052f, 0.135872897f, - 0.137051293f, 
0.138238273f, 0.139433873f, 0.140638126f, - 0.141851065f, 0.143072723f, 0.144303134f, 0.145542329f, - 0.14679034f, 0.148047198f, 0.149312935f, 0.15058758f, - 0.151871164f, 0.153163716f, 0.154465265f, 0.15577584f, - 0.157095469f, 0.158424179f, 0.159761997f, 0.16110895f, - 0.162465063f, 0.163830361f, 0.16520487f, 0.166588614f, - 0.167981615f, 0.169383897f, 0.170795482f, 0.172216392f, - 0.173646647f, 0.175086268f, 0.176535275f, 0.177993686f, - 0.179461519f, 0.180938793f, 0.182425524f, 0.183921727f, - 0.185427419f, 0.186942614f, 0.188467325f, 0.190001566f, - 0.191545349f, 0.193098684f, 0.194661584f, 0.196234056f, - 0.197816111f, 0.199407757f, 0.201009f, 0.202619846f, - 0.204240302f, 0.205870372f, 0.207510059f, 0.209159365f, - 0.210818293f, 0.212486844f, 0.214165017f, 0.215852811f, - 0.217550224f, 0.219257252f, 0.220973892f, 0.222700139f, - 0.224435986f, 0.226181426f, 0.227936451f, 0.229701051f, - 0.231475217f, 0.233258936f, 0.235052196f, 0.236854984f, - 0.238667285f, 0.240489083f, 0.242320361f, 0.244161101f, - 0.246011284f, 0.247870889f, 0.249739894f, 0.251618278f, - 0.253506017f, 0.255403084f, 0.257309455f, 0.259225101f, - 0.261149994f, 0.263084104f, 0.265027401f, 0.266979851f, - 0.268941421f, 0.270912078f, 0.272891784f, 0.274880502f, - 0.276878195f, 0.278884822f, 0.280900343f, 0.282924715f, - 0.284957894f, 0.286999837f, 0.289050497f, 0.291109827f, - 0.293177779f, 0.295254302f, 0.297339346f, 0.299432858f, - 0.301534784f, 0.30364507f, 0.30576366f, 0.307890496f, - 0.310025519f, 0.312168669f, 0.314319886f, 0.316479106f, - 0.318646266f, 0.320821301f, 0.323004144f, 0.325194727f, - 0.327392983f, 0.32959884f, 0.331812228f, 0.334033073f, - 0.336261303f, 0.338496841f, 0.340739612f, 0.342989537f, - 0.345246539f, 0.347510538f, 0.349781451f, 0.352059198f, - 0.354343694f, 0.356634854f, 0.358932594f, 0.361236825f, - 0.36354746f, 0.365864409f, 0.368187582f, 0.370516888f, - 0.372852234f, 0.375193526f, 0.377540669f, 0.379893568f, - 0.382252125f, 0.384616244f, 0.386985824f, 
0.389360766f, - 0.391740969f, 0.394126332f, 0.39651675f, 0.398912121f, - 0.40131234f, 0.403717301f, 0.406126897f, 0.408541022f, - 0.410959566f, 0.413382421f, 0.415809477f, 0.418240623f, - 0.420675748f, 0.423114739f, 0.425557483f, 0.428003867f, - 0.430453776f, 0.432907095f, 0.435363708f, 0.437823499f, - 0.440286351f, 0.442752145f, 0.445220765f, 0.44769209f, - 0.450166003f, 0.452642382f, 0.455121108f, 0.457602059f, - 0.460085115f, 0.462570155f, 0.465057055f, 0.467545694f, - 0.470035948f, 0.472527696f, 0.475020813f, 0.477515175f, - 0.48001066f, 0.482507142f, 0.485004498f, 0.487502604f, - 0.490001333f, 0.492500562f, 0.495000167f, 0.497500021f, - 0.5f, 0.502499979f, 0.504999833f, 0.507499438f, - 0.509998667f, 0.512497396f, 0.514995502f, 0.517492858f, - 0.51998934f, 0.522484825f, 0.524979187f, 0.527472304f, - 0.529964052f, 0.532454306f, 0.534942945f, 0.537429845f, - 0.539914885f, 0.542397941f, 0.544878892f, 0.547357618f, - 0.549833997f, 0.55230791f, 0.554779235f, 0.557247855f, - 0.559713649f, 0.562176501f, 0.564636292f, 0.567092905f, - 0.569546224f, 0.571996133f, 0.574442517f, 0.576885261f, - 0.579324252f, 0.581759377f, 0.584190523f, 0.586617579f, - 0.589040434f, 0.591458978f, 0.593873103f, 0.596282699f, - 0.59868766f, 0.601087879f, 0.60348325f, 0.605873668f, - 0.608259031f, 0.610639234f, 0.613014176f, 0.615383756f, - 0.617747875f, 0.620106432f, 0.622459331f, 0.624806474f, - 0.627147766f, 0.629483112f, 0.631812418f, 0.634135591f, - 0.63645254f, 0.638763175f, 0.641067406f, 0.643365146f, - 0.645656306f, 0.647940802f, 0.650218549f, 0.652489462f, - 0.654753461f, 0.657010463f, 0.659260388f, 0.661503159f, - 0.663738697f, 0.665966927f, 0.668187772f, 0.67040116f, - 0.672607017f, 0.674805273f, 0.676995856f, 0.679178699f, - 0.681353734f, 0.683520894f, 0.685680114f, 0.687831331f, - 0.689974481f, 0.692109504f, 0.69423634f, 0.69635493f, - 0.698465216f, 0.700567142f, 0.702660654f, 0.704745698f, - 0.706822221f, 0.708890173f, 0.710949503f, 0.713000163f, - 0.715042106f, 0.717075285f, 
0.719099657f, 0.721115178f, - 0.723121805f, 0.725119498f, 0.727108216f, 0.729087922f, - 0.731058579f, 0.733020149f, 0.734972599f, 0.736915896f, - 0.738850006f, 0.740774899f, 0.742690545f, 0.744596916f, - 0.746493983f, 0.748381722f, 0.750260106f, 0.752129111f, - 0.753988716f, 0.755838899f, 0.757679639f, 0.759510917f, - 0.761332715f, 0.763145016f, 0.764947804f, 0.766741064f, - 0.768524783f, 0.770298949f, 0.772063549f, 0.773818574f, - 0.775564014f, 0.777299861f, 0.779026108f, 0.780742748f, - 0.782449776f, 0.784147189f, 0.785834983f, 0.787513156f, - 0.789181707f, 0.790840635f, 0.792489941f, 0.794129628f, - 0.795759698f, 0.797380154f, 0.798991f, 0.800592243f, - 0.802183889f, 0.803765944f, 0.805338416f, 0.806901316f, - 0.808454651f, 0.809998434f, 0.811532675f, 0.813057386f, - 0.814572581f, 0.816078273f, 0.817574476f, 0.819061207f, - 0.820538481f, 0.822006314f, 0.823464725f, 0.824913732f, - 0.826353353f, 0.827783608f, 0.829204518f, 0.830616103f, - 0.832018385f, 0.833411386f, 0.83479513f, 0.836169639f, - 0.837534937f, 0.83889105f, 0.840238003f, 0.841575821f, - 0.842904531f, 0.84422416f, 0.845534735f, 0.846836284f, - 0.848128836f, 0.84941242f, 0.850687065f, 0.851952802f, - 0.85320966f, 0.854457671f, 0.855696866f, 0.856927277f, - 0.858148935f, 0.859361874f, 0.860566127f, 0.861761727f, - 0.862948707f, 0.864127103f, 0.865296948f, 0.866458277f, - 0.867611126f, 0.868755531f, 0.869891526f, 0.871019148f, - 0.872138434f, 0.87324942f, 0.874352143f, 0.875446642f, - 0.876532952f, 0.877611113f, 0.878681162f, 0.879743138f, - 0.880797078f, 0.881843022f, 0.882881009f, 0.883911078f, - 0.884933268f, 0.885947619f, 0.88695417f, 0.887952961f, - 0.888944033f, 0.889927426f, 0.890903179f, 0.891871333f, - 0.89283193f, 0.893785008f, 0.89473061f, 0.895668777f, - 0.896599549f, 0.897522967f, 0.898439072f, 0.899347906f, - 0.900249511f, 0.901143927f, 0.902031196f, 0.902911359f, - 0.903784458f, 0.904650535f, 0.905509631f, 0.906361788f, - 0.907207047f, 0.90804545f, 0.908877039f, 0.909701855f, - 
0.910519941f, 0.911331337f, 0.912136085f, 0.912934228f, - 0.913725806f, 0.914510861f, 0.915289434f, 0.916061568f, - 0.916827304f, 0.917586682f, 0.918339745f, 0.919086533f, - 0.919827088f, 0.920561451f, 0.921289663f, 0.922011765f, - 0.922727798f, 0.923437803f, 0.92414182f, 0.924839891f, - 0.925532055f, 0.926218353f, 0.926898827f, 0.927573515f, - 0.928242458f, 0.928905696f, 0.929563269f, 0.930215217f, - 0.93086158f, 0.931502396f, 0.932137706f, 0.932767549f, - 0.933391964f, 0.934010991f, 0.934624667f, 0.935233031f, - 0.935836124f, 0.936433982f, 0.937026644f, 0.937614149f, - 0.938196534f, 0.938773837f, 0.939346097f, 0.93991335f, - 0.940475634f, 0.941032987f, 0.941585444f, 0.942133045f, - 0.942675824f, 0.943213819f, 0.943747066f, 0.944275602f, - 0.944799462f, 0.945318683f, 0.9458333f, 0.946343348f, - 0.946848864f, 0.947349882f, 0.947846437f, 0.948338565f, - 0.948826299f, 0.949309675f, 0.949788727f, 0.950263488f, - 0.950733994f, 0.951200277f, 0.951662371f, 0.95212031f, - 0.952574127f, 0.953023854f, 0.953469525f, 0.953911173f, - 0.954348829f, 0.954782527f, 0.955212297f, 0.955638172f, - 0.956060185f, 0.956478365f, 0.956892745f, 0.957303356f, - 0.957710228f, 0.958113393f, 0.958512881f, 0.958908722f, - 0.959300946f, 0.959689585f, 0.960074666f, 0.96045622f, - 0.960834277f, 0.961208866f, 0.961580014f, 0.961947753f, - 0.962312109f, 0.962673113f, 0.963030791f, 0.963385172f, - 0.963736284f, 0.964084154f, 0.964428811f, 0.964770281f, - 0.965108591f, 0.96544377f, 0.965775842f, 0.966104836f, - 0.966430777f, 0.966753691f, 0.967073605f, 0.967390545f, - 0.967704535f, 0.968015603f, 0.968323772f, 0.968629068f, - 0.968931516f, 0.969231141f, 0.969527967f, 0.969822019f, - 0.97011332f, 0.970401896f, 0.970687769f, 0.970970964f, - 0.971251504f, 0.971529412f, 0.971804712f, 0.972077426f, - 0.972347578f, 0.972615189f, 0.972880283f, 0.973142881f, - 0.973403006f, 0.97366068f, 0.973915925f, 0.974168761f, - 0.974419212f, 0.974667297f, 0.974913038f, 0.975156456f, - 0.975397572f, 0.975636406f, 
0.975872979f, 0.976107311f, - 0.976339422f, 0.976569332f, 0.976797062f, 0.97702263f, - 0.977246057f, 0.977467361f, 0.977686561f, 0.977903678f, - 0.978118729f, 0.978331734f, 0.97854271f, 0.978751677f, - 0.978958653f, 0.979163655f, 0.979366703f, 0.979567813f, - 0.979767003f, 0.979964291f, 0.980159694f, 0.98035323f, - 0.980544915f, 0.980734767f, 0.980922803f, 0.981109038f, - 0.98129349f, 0.981476175f, 0.981657109f, 0.981836309f, - 0.98201379f, 0.982189568f, 0.98236366f, 0.98253608f, - 0.982706843f, 0.982875967f, 0.983043464f, 0.983209352f, - 0.983373644f, 0.983536355f, 0.983697501f, 0.983857095f, - 0.984015152f, 0.984171686f, 0.984326712f, 0.984480243f, - 0.984632294f, 0.984782879f, 0.98493201f, 0.985079702f, - 0.985225968f, 0.985370822f, 0.985514276f, 0.985656344f, - 0.985797039f, 0.985936373f, 0.98607436f, 0.986211011f, - 0.986346341f, 0.98648036f, 0.986613082f, 0.986744519f, - 0.986874682f, 0.987003583f, 0.987131236f, 0.987257651f, - 0.987382839f, 0.987506814f, 0.987629585f, 0.987751165f, - 0.987871565f, 0.987990796f, 0.988108868f, 0.988225794f, - 0.988341583f, 0.988456248f, 0.988569797f, 0.988682242f, - 0.988793594f, 0.988903862f, 0.989013057f, 0.98912119f, - 0.98922827f, 0.989334307f, 0.989439312f, 0.989543294f, - 0.989646262f, 0.989748228f, 0.989849199f, 0.989949186f, - 0.990048198f, 0.990146244f, 0.990243334f, 0.990339477f, - 0.990434681f, 0.990528956f, 0.990622311f, 0.990714754f, - 0.990806295f, 0.990896941f, 0.990986701f, 0.991075585f, - 0.9911636f, 0.991250754f, 0.991337056f, 0.991422515f, - 0.991507137f, 0.991590932f, 0.991673907f, 0.99175607f, - 0.991837429f, 0.991917991f, 0.991997765f, 0.992076758f, - 0.992154977f, 0.99223243f, 0.992309124f, 0.992385067f, - 0.992460265f, 0.992534727f, 0.992608459f, 0.992681467f, - 0.99275376f, 0.992825344f, 0.992896226f, 0.992966413f, - 0.993035911f, 0.993104727f, 0.993172868f, 0.993240339f, - 0.993307149f, 0.993373303f, 0.993438807f, 0.993503668f, - 0.993567892f, 0.993631484f, 0.993694453f, 0.993756802f, - 0.993818539f, 
0.993879669f, 0.993940199f, 0.994000133f, - 0.994059478f, 0.994118239f, 0.994176423f, 0.994234034f, - 0.994291079f, 0.994347563f, 0.994403491f, 0.994458868f, - 0.994513701f, 0.994567994f, 0.994621753f, 0.994674982f, - 0.994727688f, 0.994779874f, 0.994831547f, 0.994882711f, - 0.994933371f, 0.994983532f, 0.995033198f, 0.995082376f, - 0.995131069f, 0.995179282f, 0.99522702f, 0.995274287f, - 0.995321089f, 0.995367429f, 0.995413313f, 0.995458744f, - 0.995503727f, 0.995548266f, 0.995592367f, 0.995636032f, - 0.995679266f, 0.995722075f, 0.99576446f, 0.995806428f, - 0.995847981f, 0.995889125f, 0.995929862f, 0.995970198f, - 0.996010135f, 0.996049678f, 0.99608883f, 0.996127597f, - 0.99616598f, 0.996203984f, 0.996241613f, 0.996278871f, - 0.99631576f, 0.996352285f, 0.996388449f, 0.996424256f, - 0.99645971f, 0.996494813f, 0.996529569f, 0.996563982f, - 0.996598054f, 0.99663179f, 0.996665193f, 0.996698265f, - 0.99673101f, 0.996763432f, 0.996795533f, 0.996827317f, - 0.996858787f, 0.996889945f, 0.996920795f, 0.996951341f, - 0.996981584f, 0.997011528f, 0.997041175f, 0.99707053f, - 0.997099594f, 0.997128371f, 0.997156863f, 0.997185073f, - 0.997213004f, 0.997240658f, 0.997268039f, 0.997295149f, - 0.997321991f, 0.997348567f, 0.99737488f, 0.997400932f, - 0.997426727f, 0.997452266f, 0.997477553f, 0.997502589f, - 0.997527377f, 0.99755192f, 0.997576219f, 0.997600279f, - 0.997624099f, 0.997647684f, 0.997671036f, 0.997694156f, - 0.997717047f, 0.997739712f, 0.997762151f, 0.997784369f, - 0.997806367f, 0.997828146f, 0.99784971f, 0.99787106f, - 0.997892199f, 0.997913128f, 0.99793385f, 0.997954366f, - 0.99797468f, 0.997994791f, 0.998014704f, 0.998034419f, - 0.998053939f, 0.998073265f, 0.9980924f, 0.998111345f, - 0.998130102f, 0.998148674f, 0.998167061f, 0.998185266f, - 0.99820329f, 0.998221136f, 0.998238805f, 0.998256299f, - 0.998273619f, 0.998290767f, 0.998307746f, 0.998324556f, - 0.998341199f, 0.998357677f, 0.998373992f, 0.998390145f, - 0.998406138f, 0.998421972f, 0.998437649f, 0.998453171f, - 
0.998468538f, 0.998483753f, 0.998498818f, 0.998513733f, - 0.998528499f, 0.99854312f, 0.998557595f, 0.998571927f, - 0.998586116f, 0.998600165f, 0.998614074f, 0.998627845f, - 0.99864148f, 0.998654979f, 0.998668345f, 0.998681577f, - 0.998694679f, 0.99870765f, 0.998720493f, 0.998733208f, - 0.998745797f, 0.998758261f, 0.998770601f, 0.998782819f, - 0.998794916f, 0.998806892f, 0.99881875f, 0.99883049f, - 0.998842113f, 0.998853621f, 0.998865015f, 0.998876295f, - 0.998887464f, 0.998898522f, 0.99890947f, 0.998920309f, - 0.99893104f, 0.998941666f, 0.998952185f, 0.9989626f, - 0.998972912f, 0.998983121f, 0.998993229f, 0.999003237f, - 0.999013145f, 0.999022955f, 0.999032667f, 0.999042283f, - 0.999051803f, 0.999061229f, 0.999070561f, 0.999079801f, - 0.999088949f, 0.999098006f, 0.999106973f, 0.999115851f, - 0.99912464f, 0.999133343f, 0.999141959f, 0.999150489f, - 0.999158935f, 0.999167297f, 0.999175575f, 0.999183772f, - 0.999191887f, 0.999199921f, 0.999207876f, 0.999215751f, - 0.999223549f, 0.999231269f, 0.999238912f, 0.999246479f, - 0.999253971f, 0.999261389f, 0.999268733f, 0.999276004f, - 0.999283202f, 0.99929033f, 0.999297386f, 0.999304372f, - 0.999311289f, 0.999318137f, 0.999324917f, 0.99933163f, - 0.999338276f, 0.999344856f, 0.99935137f, 0.99935782f, - 0.999364206f, 0.999370528f, 0.999376788f, 0.999382985f, - 0.999389121f, 0.999395195f, 0.99940121f, 0.999407164f, - 0.99941306f, 0.999418896f, 0.999424675f, 0.999430396f, - 0.999436061f, 0.999441669f, 0.999447221f, 0.999452719f, - 0.999458161f, 0.99946355f, 0.999468885f, 0.999474167f, - 0.999479396f, 0.999484573f, 0.999489699f, 0.999494774f, - 0.999499799f, 0.999504774f, 0.999509699f, 0.999514575f, - 0.999519403f, 0.999524182f, 0.999528915f, 0.9995336f, - 0.999538238f, 0.999542831f, 0.999547378f, 0.999551879f, - 0.999556336f, 0.999560749f, 0.999565118f, 0.999569443f, - 0.999573725f, 0.999577965f, 0.999582162f, 0.999586318f, - 0.999590433f, 0.999594506f, 0.99959854f, 0.999602533f, - 0.999606486f, 0.9996104f, 0.999614275f, 
0.999618112f, - 0.99962191f, 0.999625671f, 0.999629394f, 0.99963308f, - 0.99963673f, 0.999640343f, 0.99964392f, 0.999647462f, - 0.999650969f, 0.99965444f, 0.999657878f, 0.999661281f, - 0.99966465f, 0.999667986f, 0.999671288f, 0.999674558f, - 0.999677795f, 0.999681f, 0.999684173f, 0.999687315f, - 0.999690425f, 0.999693504f, 0.999696553f, 0.999699571f, - 0.99970256f, 0.999705519f, 0.999708448f, 0.999711348f, - 0.999714219f, 0.999717062f, 0.999719877f, 0.999722663f, - 0.999725422f, 0.999728153f, 0.999730857f, 0.999733535f, - 0.999736185f, 0.99973881f, 0.999741408f, 0.99974398f, - 0.999746527f, 0.999749049f, 0.999751545f, 0.999754016f, - 0.999756463f, 0.999758886f, 0.999761285f, 0.999763659f, - 0.99976601f, 0.999768338f, 0.999770643f, 0.999772924f, - 0.999775183f, 0.99977742f, 0.999779634f, 0.999781826f, - 0.999783997f, 0.999786145f, 0.999788273f, 0.999790379f, - 0.999792464f, 0.999794529f, 0.999796573f, 0.999798597f, - 0.9998006f, 0.999802584f, 0.999804548f, 0.999806492f, - 0.999808417f, 0.999810323f, 0.99981221f, 0.999814078f, - 0.999815928f, 0.999817759f, 0.999819572f, 0.999821367f, - 0.999823144f, 0.999824904f, 0.999826646f, 0.99982837f, - 0.999830078f, 0.999831768f, 0.999833442f, 0.999835099f, - 0.999836739f, 0.999838364f, 0.999839972f, 0.999841564f, - 0.99984314f, 0.999844701f, 0.999846246f, 0.999847775f, - 0.99984929f, 0.999850789f, 0.999852273f, 0.999853743f, - 0.999855198f, 0.999856639f, 0.999858065f, 0.999859477f, - 0.999860875f, 0.999862259f, 0.99986363f, 0.999864986f, - 0.99986633f, 0.999867659f, 0.999868976f, 0.99987028f, - 0.99987157f, 0.999872848f, 0.999874113f, 0.999875365f, - 0.999876605f, 0.999877833f, 0.999879049f, 0.999880252f, - 0.999881443f, 0.999882623f, 0.999883791f, 0.999884947f, - 0.999886091f, 0.999887225f, 0.999888347f, 0.999889458f, - 0.999890557f, 0.999891646f, 0.999892724f, 0.999893791f, - 0.999894848f, 0.999895894f, 0.99989693f, 0.999897956f, - 0.999898971f, 0.999899976f, 0.999900971f, 0.999901956f, - 0.999902932f, 0.999903898f, 
0.999904854f, 0.9999058f, - 0.999906738f, 0.999907665f, 0.999908584f, 0.999909494f, - 0.999910394f, 0.999911286f, 0.999912168f, 0.999913042f, - 0.999913907f, 0.999914764f, 0.999915612f, 0.999916452f, - 0.999917283f, 0.999918106f, 0.999918921f, 0.999919727f, - 0.999920526f, 0.999921317f, 0.999922099f, 0.999922875f, - 0.999923642f, 0.999924402f, 0.999925154f, 0.999925898f, - 0.999926636f, 0.999927366f, 0.999928088f, 0.999928804f, - 0.999929512f, 0.999930213f, 0.999930908f, 0.999931595f, - 0.999932276f, 0.99993295f, 0.999933617f, 0.999934277f, - 0.999934931f, 0.999935579f, 0.99993622f, 0.999936854f, - 0.999937482f, 0.999938104f, 0.99993872f, 0.99993933f, - 0.999939934f, 0.999940531f, 0.999941123f, 0.999941709f, - 0.999942289f, 0.999942863f, 0.999943431f, 0.999943994f, - 0.999944551f, 0.999945103f, 0.999945649f, 0.99994619f, - 0.999946726f, 0.999947256f, 0.99994778f, 0.9999483f, - 0.999948814f, 0.999949324f, 0.999949828f, 0.999950327f, - 0.999950821f, 0.999951311f, 0.999951795f, 0.999952275f, - 0.999952749f, 0.99995322f, 0.999953685f, 0.999954146f, - 0.999954602f -}; -} // namespace tesseract diff --git a/opencl/oclkernels.h b/opencl/oclkernels.h index e22b2104..a36b664e 100644 --- a/opencl/oclkernels.h +++ b/opencl/oclkernels.h @@ -1,6 +1,16 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef TESSERACT_OPENCL_OCLKERNELS_H_ +#define TESSERACT_OPENCL_OCLKERNELS_H_ -#ifndef _OCL_KERNEL_H_ -#define _OCL_KERNEL_H_ #ifndef USE_EXTERNAL_KERNEL #define KERNEL( ... 
)# __VA_ARGS__ "\n" // Double precision is a default of spreadsheets @@ -50,7 +60,7 @@ KERNEL( ) KERNEL( -\n__kernel void pixSubtract(__global int *dword, __global int *sword, +\n__kernel void pixSubtract(__global int *dword, __global int *sword, const int wpl, const int h, __global int *outword) { const unsigned int row = get_global_id(1); @@ -105,15 +115,15 @@ KERNEL( unsigned int prevword, nextword, currword,tempword; unsigned int destword; const int col = pos % wpl; - + //Ignore the execss if (pos >= (wpl * h)) return; - - - currword = *(sword + pos); + + + currword = *(sword + pos); destword = currword; - + //Handle boundary conditions if(col==0) prevword=0; @@ -124,9 +134,9 @@ KERNEL( nextword=0; else nextword = *(sword + pos + 1); - + //Loop unrolled - + //1 bit to left and 1 bit to right //Get the max value on LHS of every pixel tempword = (prevword << (31)) | ((currword >> 1)); @@ -142,10 +152,10 @@ KERNEL( //Get max value on RHS of every pixel tempword = (currword << 2) | (nextword >> (30)); destword |= tempword; - - + + *(dword + pos) = destword; - + }\n ) @@ -200,7 +210,7 @@ KERNEL( unsigned int destword, tempword, lastword, currword; unsigned int lnextword, lprevword, rnextword, rprevword, firstword, secondword; int i, j, siter, eiter; - + //Ignore the execss if (pos >= (wpl*h) || (xn < 1 && xp < 1)) return; @@ -225,7 +235,7 @@ KERNEL( firstword = 0x0; else firstword = *(sword + pos - 1); - + //Get next word if (col == (wpl - 1)) secondword = 0x0; @@ -237,7 +247,7 @@ KERNEL( { //Get the max value on LHS of every pixel tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 
0x0 : (firstword << (32-i)) | ((currword >> i)); - + destword |= tempword; //Get max value on RHS of every pixel @@ -266,11 +276,11 @@ KERNEL( else firstword = *(sword + row*wpl + siter); - if (eiter >= wpl) + if (eiter >= wpl) lastword = 0x0; else lastword = *(sword + row*wpl + eiter); - + for ( i = 1; i < nwords; i++) { //Gets LHS words @@ -280,14 +290,14 @@ KERNEL( secondword = *(sword + row*wpl + siter + i); lprevword = firstword << (32 - parbitsxn) | secondword >> parbitsxn; - + firstword = secondword; if ((siter + i + 1) < 0) secondword = 0x0; else secondword = *(sword + row*wpl + siter + i + 1); - + lnextword = firstword << (32 - parbitsxn) | secondword >> parbitsxn; //Gets RHS words @@ -295,7 +305,7 @@ KERNEL( firstword = 0x0; else firstword = *(sword + row*wpl + eiter - i); - + rnextword = firstword << parbitsxp | lastword >> (32 - parbitsxp); lastword = firstword; @@ -325,7 +335,7 @@ KERNEL( lastword = firstword; firstword = secondword; } - + *(dword + pos) = destword; }\n ) @@ -342,14 +352,14 @@ KERNEL( unsigned int prevword, nextword, currword,tempword; unsigned int destword; int i; - + //Ignore the execss if (pos >= (wpl * h)) return; - currword = *(sword + pos); + currword = *(sword + pos); destword = currword; - + //Handle boundary conditions if(col==0) prevword=0; @@ -360,7 +370,7 @@ KERNEL( nextword=0; else nextword = *(sword + pos + 1); - + for (i = 1; i <= halfwidth; i++) { //Get the max value on LHS of every pixel @@ -377,7 +387,7 @@ KERNEL( //Get max value on RHS of every pixel tempword = (currword << i) | (nextword >> (32 - i)); - + destword |= tempword; } @@ -397,7 +407,7 @@ KERNEL( unsigned int tempword; unsigned int destword; int i, siter, eiter; - + //Ignore the execss if (row >= h || col >= wpl) return; @@ -427,27 +437,27 @@ KERNEL( unsigned int prevword, nextword, currword,tempword; unsigned int destword; const int col = pos % wpl; - + //Ignore the execss if (pos >= (wpl * h)) return; - - currword = *(sword + pos); + + currword = *(sword 
+ pos); destword = currword; - + //Handle boundary conditions if(col==0) prevword=0xffffffff; else prevword = *(sword + pos - 1); - + if(col==(wpl - 1)) nextword=0xffffffff; else nextword = *(sword + pos + 1); - + //Loop unrolled - + //1 bit to left and 1 bit to right //Get the min value on LHS of every pixel tempword = (prevword << (31)) | ((currword >> 1)); @@ -463,10 +473,10 @@ KERNEL( //Get min value on RHS of every pixel tempword = (currword << 2) | (nextword >> (30)); destword &= tempword; - - + + *(dword + pos) = destword; - + }\n ) @@ -491,7 +501,7 @@ KERNEL( if (row < 2 || row >= (h - 2)) { destword = 0x0; - } + } else { //2 words above @@ -518,7 +528,7 @@ KERNEL( tempword = *(sword + i*wpl + col); destword &= tempword; - if (col == 0) + if (col == 0) { destword &= fwmask; } @@ -534,7 +544,7 @@ KERNEL( ) KERNEL( -\n__kernel void morphoErodeHor(__global int *sword,__global int *dword, const int xp, const int xn, const int wpl, +\n__kernel void morphoErodeHor(__global int *sword,__global int *dword, const int xp, const int xn, const int wpl, const int h, const char isAsymmetric, const int rwmask, const int lwmask) { const int col = get_global_id(0); @@ -569,7 +579,7 @@ KERNEL( firstword = 0xffffffff; else firstword = *(sword + pos - 1); - + //Get next word if (col == (wpl - 1)) secondword = 0xffffffff; @@ -585,7 +595,7 @@ KERNEL( //Get max value on RHS of every pixel tempword = ((i == parbitsxp) && (parbitsxp != parbitsxn)) ? 
0xffffffff : (currword << i) | (secondword >> (32 - i)); - + //tempword = (currword << i) | (secondword >> (32 - i)); destword &= tempword; } @@ -614,18 +624,18 @@ KERNEL( *(dword + pos) = destword; return; } - + if (siter < 0) firstword = 0xffffffff; else firstword = *(sword + row*wpl + siter); - if (eiter >= wpl) + if (eiter >= wpl) lastword = 0xffffffff; else lastword = *(sword + row*wpl + eiter); - - + + for ( i = 1; i < nwords; i++) { //Gets LHS words @@ -635,14 +645,14 @@ KERNEL( secondword = *(sword + row*wpl + siter + i); lprevword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp); - + firstword = secondword; if ((siter + i + 1) < 0) secondword = 0xffffffff; else secondword = *(sword + row*wpl + siter + i + 1); - + lnextword = firstword << (32 - parbitsxp) | secondword >> (parbitsxp); //Gets RHS words @@ -650,7 +660,7 @@ KERNEL( firstword = 0xffffffff; else firstword = *(sword + row*wpl + eiter - i); - + rnextword = firstword << parbitsxn | lastword >> (32 - parbitsxn); lastword = firstword; @@ -680,7 +690,7 @@ KERNEL( lastword = firstword; firstword = secondword; } - + if (isAsymmetric) { //Clear boundary pixels @@ -700,8 +710,8 @@ KERNEL( KERNEL( \n__kernel void morphoErodeHor_32word(__global int *sword,__global int *dword, - const int halfwidth, const int wpl, - const int h, const char clearBoundPixH, + const int halfwidth, const int wpl, + const int h, const char clearBoundPixH, const int rwmask, const int lwmask, const char isEven) { @@ -715,25 +725,25 @@ KERNEL( if (pos >= (wpl * h)) return; - currword = *(sword + pos); + currword = *(sword + pos); destword = currword; - + //Handle boundary conditions if(col==0) prevword=0xffffffff; else prevword = *(sword + pos - 1); - + if(col==(wpl - 1)) nextword=0xffffffff; else nextword = *(sword + pos + 1); - + for (i = 1; i <= halfwidth; i++) { //Get the min value on LHS of every pixel tempword = (prevword << (32-i)) | ((currword >> i)); - + destword &= tempword; //Get min value on RHS of every pixel 
@@ -751,7 +761,7 @@ KERNEL( if (clearBoundPixH) { - if (col == 0) + if (col == 0) { destword &= rwmask; } @@ -767,7 +777,7 @@ KERNEL( KERNEL( \n__kernel void morphoErodeVer(__global int *sword,__global int *dword, - const int yp, + const int yp, const int wpl, const int h, const char clearBoundPixV, const int yn) { @@ -776,7 +786,7 @@ KERNEL( const unsigned int pos = row * wpl + col; unsigned int tempword, destword; int i, siter, eiter; - + //Ignore the execss if (row >= h || col >= wpl) return; @@ -796,7 +806,7 @@ KERNEL( //Clear boundary pixels if (clearBoundPixV && ((row < yp) || ((h - row) <= yn))) - { + { destword = 0x0; } @@ -884,23 +894,23 @@ KERNEL( \n __global const uchar* data, \n uint numPixels, \n __global uint *histBuffer) { // each wg will write HIST_SIZE*NUM_CHANNELS into this result; cpu will accumulate across wg's -\n +\n \n /* declare variables */ -\n +\n \n // work indices \n size_t groupId = get_group_id(0); \n size_t localId = get_local_id(0); // 0 -> 256-1 \n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1 \n uint numThreads = get_global_size(0); -\n +\n \n /* accumulate in global memory */ \n for ( uint pc = get_global_id(0); pc < numPixels; pc += get_global_size(0) ) { \n uchar value = data[ pc ]; \n int idx = value * get_global_size(0) + get_global_id(0); \n histBuffer[ idx ]++; -\n +\n \n } -\n +\n \n } // kernel_HistogramRectAllChannels_Grey ) @@ -993,35 +1003,35 @@ void kernel_HistogramRectOneChannelReduction( KERNEL( // unused - // each work group (x256) handles a histogram bin + // each work group (x256) handles a histogram bin \n __attribute__((reqd_work_group_size(256, 1, 1))) \n __kernel \n void kernel_HistogramRectAllChannelsReduction_Grey( \n int n, // pixel redundancy that needs to be accumulated \n __global uint *histBuffer, \n __global uint* histResult) { // each wg accumulates 1 bin -\n +\n \n /* declare variables */ -\n +\n \n // work indices \n size_t groupId = get_group_id(0); \n size_t localId = 
get_local_id(0); // 0 -> 256-1 \n size_t globalId = get_global_id(0); // 0 -> 8*10*256-1=20480-1 \n uint numThreads = get_global_size(0); \n unsigned int hist = 0; -\n +\n \n /* accumulate in global memory */ \n for ( uint p = 0; p < n; p+=GROUP_SIZE) { \n hist += histBuffer[ (get_group_id(0)*n + p)]; \n } -\n +\n \n /* reduction in local memory */ \n // populate local memory \n __local unsigned int localHist[GROUP_SIZE]; \n localHist[localId] = hist; \n barrier(CLK_LOCAL_MEM_FENCE); -\n +\n \n for (int stride = GROUP_SIZE/2; stride >= 1; stride /= 2) { \n if (localId < stride) { \n hist = localHist[ (localId+stride)]; @@ -1032,12 +1042,11 @@ KERNEL( \n } \n barrier(CLK_LOCAL_MEM_FENCE); \n } -\n +\n \n if (localId == 0) \n histResult[get_group_id(0)] = localHist[0]; -\n +\n \n } // kernel_HistogramRectAllChannelsReduction_Grey - ) // ThresholdRectToPix Kernel @@ -1066,7 +1075,6 @@ void kernel_ThresholdRectToPix( __global int *pix) { // declare variables - uint pad = PIXELS_PER_WORD * wpl - width;//number of padding bits at the end of each output line int pThresholds[NUM_CHANNELS]; int pHi_Values[NUM_CHANNELS]; for ( int i = 0; i < NUM_CHANNELS; i++) { @@ -1077,22 +1085,24 @@ void kernel_ThresholdRectToPix( // for each word (32 pixels) in output image for ( uint w = get_global_id(0); w < wpl*height; w += get_global_size(0) ) { unsigned int word = 0; // all bits start at zero - //decrease the pixel index for the padding at the end of each output line (=number of lines * padding) - uint pxIdxOffset = ( w / wpl) * pad;// = ( ( PIXELS_PER_WORD * w) / ( width + pad)) * pad; // for each burst in word for ( int b = 0; b < BURSTS_PER_WORD; b++) { // load burst charVec pixels; - for ( int i = 0; i < (PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH; i++ ) { - pixels.v[i] = imageData[w*(BURSTS_PER_WORD*(PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + b*((PIXELS_PER_BURST*NUM_CHANNELS)/CHAR_VEC_WIDTH) + i - pxIdxOffset]; - } + int offset = (w / wpl) * width; + offset += (w % 
wpl) * PIXELS_PER_WORD; + offset += b * PIXELS_PER_BURST; + + for (int i = 0; i < PIXELS_PER_BURST; ++i) + pixels.v[i] = imageData[offset + i]; // for each pixel in burst for ( int p = 0; p < PIXELS_PER_BURST; p++) { for ( int c = 0; c < NUM_CHANNELS; c++) { unsigned char pixChan = pixels.s[p*NUM_CHANNELS + c]; if (pHi_Values[c] >= 0 && (pixChan > pThresholds[c]) == (pHi_Values[c] == 0)) { - word |= (((uint)0x80000000) >> ((b*PIXELS_PER_BURST+p)&31)); + const uint kTopBit = 0x80000000; + word |= (kTopBit >> ((b*PIXELS_PER_BURST+p)&31)); } } } @@ -1146,10 +1156,10 @@ void kernel_ThresholdRectToPix_OneChan( // for each pixel in burst for ( int p = 0; p < PIXELS_PER_BURST; p++) { - + //int littleEndianIdx = p ^ 3; //int bigEndianIdx = p; - int idx = + int idx = \n#ifdef __ENDIAN_LITTLE__\n p ^ 3; \n#else\n @@ -1157,52 +1167,18 @@ void kernel_ThresholdRectToPix_OneChan( \n#endif\n unsigned char pixChan = pixels.s[idx]; if (pHi_Values[0] >= 0 && (pixChan > pThresholds[0]) == (pHi_Values[0] == 0)) { - word |= (0x80000000 >> ((b*PIXELS_PER_BURST+p)&31)); + const uint kTopBit = 0x80000000; + word |= (kTopBit >> ((b*PIXELS_PER_BURST+p)&31)); } } } pix[w] = word; } } - ) - -KERNEL( -\n#define RED_SHIFT 24\n -\n#define GREEN_SHIFT 16\n -\n#define BLUE_SHIFT 8\n -\n#define SET_DATA_BYTE( pdata, n, val ) (*(l_uint8 *)((l_uintptr_t)((l_uint8 *)(pdata) + (n)) ^ 3) = (val))\n -\n -\n__attribute__((reqd_work_group_size(256, 1, 1)))\n -\n__kernel\n -\nvoid kernel_RGBToGray( - __global const unsigned int *srcData, - __global unsigned char *dstData, - int srcWPL, - int dstWPL, - int height, - int width, - float rwt, - float gwt, - float bwt ) { - - // pixel index - int pixelIdx = get_global_id(0); - if (pixelIdx >= height*width) return; - - unsigned int word = srcData[pixelIdx]; - int output = (rwt * ((word >> RED_SHIFT) & 0xff) + - gwt * ((word >> GREEN_SHIFT) & 0xff) + - bwt * ((word >> BLUE_SHIFT) & 0xff) + 0.5); - // SET_DATA_BYTE - dstData[pixelIdx] = output; -} -) -#endif - ; // 
close char* -#endif // USE_EXTERNAL_KERNEL -//#endif //_OCL_KERNEL_H_ +#endif // USE_EXTERNAL_KERNEL +#endif // TESSERACT_OPENCL_OCLKERNELS_H_ /* vim:set shiftwidth=4 softtabstop=4 expandtab: */ diff --git a/opencl/opencl_device_selection.h b/opencl/opencl_device_selection.h index 74272b35..08506b30 100644 --- a/opencl/opencl_device_selection.h +++ b/opencl/opencl_device_selection.h @@ -1,7 +1,17 @@ -#ifdef USE_OPENCL +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + #ifndef DEVICE_SELECTION_H #define DEVICE_SELECTION_H +#ifdef USE_OPENCL #ifdef _MSC_VER #define _CRT_SECURE_NO_WARNINGS @@ -17,29 +27,12 @@ #include #endif -#define DS_DEVICE_NAME_LENGTH 256 - -typedef enum { - DS_SUCCESS = 0, - DS_INVALID_PROFILE = 1000, - DS_MEMORY_ERROR, - DS_INVALID_PERF_EVALUATOR_TYPE, - DS_INVALID_PERF_EVALUATOR, - DS_PERF_EVALUATOR_ERROR, - DS_FILE_ERROR, - DS_UNKNOWN_DEVICE_TYPE, - DS_PROFILE_FILE_ERROR, - DS_SCORE_SERIALIZER_ERROR, - DS_SCORE_DESERIALIZER_ERROR -} ds_status; - // device type typedef enum { DS_DEVICE_NATIVE_CPU = 0, - DS_DEVICE_OPENCL_DEVICE + DS_DEVICE_OPENCL_DEVICE } ds_device_type; - typedef struct { ds_device_type type; cl_device_id oclDeviceID; @@ -49,548 +42,5 @@ typedef struct { void* score; } ds_device; -typedef struct { - unsigned int numDevices; - ds_device* devices; - const char* version; -} ds_profile; - -// deallocate memory used by score -typedef ds_status (*ds_score_release)(void* score); -static ds_status releaseDSProfile(ds_profile* profile, 
ds_score_release sr) { - ds_status status = DS_SUCCESS; - if (profile!=NULL) { - if (profile->devices!=NULL && sr!=NULL) { - unsigned int i; - for (i = 0; i < profile->numDevices; i++) { - if (profile->devices[i].oclDeviceName) free(profile->devices[i].oclDeviceName); - if (profile->devices[i].oclDriverVersion) free(profile->devices[i].oclDriverVersion); - status = sr(profile->devices[i].score); - if (status != DS_SUCCESS) - break; - } - free(profile->devices); - } - free(profile); - } - return status; -} - - -static ds_status initDSProfile(ds_profile** p, const char* version) { - int numDevices; - cl_uint numPlatforms; - cl_platform_id* platforms = NULL; - cl_device_id* devices = NULL; - ds_status status = DS_SUCCESS; - ds_profile* profile = NULL; - unsigned int next; - unsigned int i; - - if (p == NULL) - return DS_INVALID_PROFILE; - - profile = (ds_profile*)malloc(sizeof(ds_profile)); - if (profile == NULL) - return DS_MEMORY_ERROR; - - memset(profile, 0, sizeof(ds_profile)); - - clGetPlatformIDs(0, NULL, &numPlatforms); - if (numPlatforms == 0) - goto cleanup; - - platforms = (cl_platform_id*)malloc(numPlatforms*sizeof(cl_platform_id)); - if (platforms == NULL) { - status = DS_MEMORY_ERROR; - goto cleanup; - } - clGetPlatformIDs(numPlatforms, platforms, NULL); - - numDevices = 0; - for (i = 0; i < (unsigned int)numPlatforms; i++) { - cl_uint num; - clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, NULL, &num); - numDevices+=num; - } - if (numDevices == 0) - goto cleanup; - - devices = (cl_device_id*)malloc(numDevices*sizeof(cl_device_id)); - if (devices == NULL) { - status = DS_MEMORY_ERROR; - goto cleanup; - } - - profile->numDevices = numDevices+1; // +1 to numDevices to include the native CPU - profile->devices = (ds_device*)malloc(profile->numDevices*sizeof(ds_device)); - if (profile->devices == NULL) { - profile->numDevices = 0; - status = DS_MEMORY_ERROR; - goto cleanup; - } - memset(profile->devices, 0, profile->numDevices*sizeof(ds_device)); - - next 
= 0; - for (i = 0; i < (unsigned int)numPlatforms; i++) { - cl_uint num; - unsigned j; - clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, devices, &num); - for (j = 0; j < num; j++, next++) { - char buffer[DS_DEVICE_NAME_LENGTH]; - size_t length; - - profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE; - profile->devices[next].oclDeviceID = devices[j]; - - clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME - , DS_DEVICE_NAME_LENGTH, &buffer, NULL); - length = strlen(buffer); - profile->devices[next].oclDeviceName = (char*)malloc(length+1); - memcpy(profile->devices[next].oclDeviceName, buffer, length+1); - - clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION - , DS_DEVICE_NAME_LENGTH, &buffer, NULL); - length = strlen(buffer); - profile->devices[next].oclDriverVersion = (char*)malloc(length+1); - memcpy(profile->devices[next].oclDriverVersion, buffer, length+1); - } - } - profile->devices[next].type = DS_DEVICE_NATIVE_CPU; - profile->version = version; - -cleanup: - if (platforms) free(platforms); - if (devices) free(devices); - if (status == DS_SUCCESS) { - *p = profile; - } - else { - if (profile) { - if (profile->devices) - free(profile->devices); - free(profile); - } - } - return status; -} - -// Pointer to a function that calculates the score of a device (ex: -// device->score) update the data size of score. The encoding and the format -// of the score data is implementation defined. The function should return -// DS_SUCCESS if there's no error to be reported. 
-typedef ds_status (*ds_perf_evaluator)(ds_device* device, void* data); - -typedef enum { - DS_EVALUATE_ALL - ,DS_EVALUATE_NEW_ONLY -} ds_evaluation_type; - -static ds_status profileDevices(ds_profile* profile, - const ds_evaluation_type type, - ds_perf_evaluator evaluator, - void* evaluatorData, unsigned int* numUpdates) { - ds_status status = DS_SUCCESS; - unsigned int i; - unsigned int updates = 0; - - if (profile == NULL) { - return DS_INVALID_PROFILE; - } - if (evaluator == NULL) { - return DS_INVALID_PERF_EVALUATOR; - } - - for (i = 0; i < profile->numDevices; i++) { - ds_status evaluatorStatus; - - switch (type) { - case DS_EVALUATE_NEW_ONLY: - if (profile->devices[i].score != NULL) - break; - // else fall through - case DS_EVALUATE_ALL: - evaluatorStatus = evaluator(profile->devices+i, evaluatorData); - if (evaluatorStatus != DS_SUCCESS) { - status = evaluatorStatus; - return status; - } - updates++; - break; - default: - return DS_INVALID_PERF_EVALUATOR_TYPE; - break; - }; - } - if (numUpdates) - *numUpdates = updates; - return status; -} - - -#define DS_TAG_VERSION "" -#define DS_TAG_VERSION_END "" -#define DS_TAG_DEVICE "" -#define DS_TAG_DEVICE_END "" -#define DS_TAG_SCORE "" -#define DS_TAG_SCORE_END "" -#define DS_TAG_DEVICE_TYPE "" -#define DS_TAG_DEVICE_TYPE_END "" -#define DS_TAG_DEVICE_NAME "" -#define DS_TAG_DEVICE_NAME_END "" -#define DS_TAG_DEVICE_DRIVER_VERSION "" -#define DS_TAG_DEVICE_DRIVER_VERSION_END "" - -#define DS_DEVICE_NATIVE_CPU_STRING "native_cpu" - - - -typedef ds_status (*ds_score_serializer)(ds_device* device, - void** serializedScore, - unsigned int* serializedScoreSize); -static ds_status writeProfileToFile(ds_profile* profile, - ds_score_serializer serializer, - const char* file) { - ds_status status = DS_SUCCESS; - FILE* profileFile = NULL; - - - if (profile == NULL) - return DS_INVALID_PROFILE; - - profileFile = fopen(file, "wb"); - if (profileFile==NULL) { - status = DS_FILE_ERROR; - } - else { - unsigned int i; - - // 
write version string - fwrite(DS_TAG_VERSION, sizeof(char), strlen(DS_TAG_VERSION), profileFile); - fwrite(profile->version, sizeof(char), strlen(profile->version), profileFile); - fwrite(DS_TAG_VERSION_END, sizeof(char), strlen(DS_TAG_VERSION_END), profileFile); - fwrite("\n", sizeof(char), 1, profileFile); - - for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) { - void* serializedScore; - unsigned int serializedScoreSize; - - fwrite(DS_TAG_DEVICE, sizeof(char), strlen(DS_TAG_DEVICE), profileFile); - - fwrite(DS_TAG_DEVICE_TYPE, sizeof(char), strlen(DS_TAG_DEVICE_TYPE), - profileFile); - fwrite(&profile->devices[i].type,sizeof(ds_device_type),1, profileFile); - fwrite(DS_TAG_DEVICE_TYPE_END, sizeof(char), - strlen(DS_TAG_DEVICE_TYPE_END), profileFile); - - switch(profile->devices[i].type) { - case DS_DEVICE_NATIVE_CPU: - { - // There's no need to emit a device name for the native CPU device. - /* - fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME), - profileFile); - fwrite(DS_DEVICE_NATIVE_CPU_STRING,sizeof(char), - strlen(DS_DEVICE_NATIVE_CPU_STRING), profileFile); - fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char), - strlen(DS_TAG_DEVICE_NAME_END), profileFile); - */ - } - break; - case DS_DEVICE_OPENCL_DEVICE: - { - fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME), - profileFile); - fwrite(profile->devices[i].oclDeviceName, - sizeof(char),strlen(profile->devices[i].oclDeviceName), profileFile); - fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char), - strlen(DS_TAG_DEVICE_NAME_END), profileFile); - - fwrite(DS_TAG_DEVICE_DRIVER_VERSION, sizeof(char), - strlen(DS_TAG_DEVICE_DRIVER_VERSION), profileFile); - fwrite(profile->devices[i].oclDriverVersion, sizeof(char), - strlen(profile->devices[i].oclDriverVersion), profileFile); - fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END, sizeof(char), - strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile); - } - break; - default: - status = DS_UNKNOWN_DEVICE_TYPE; - break; - }; - - 
fwrite(DS_TAG_SCORE, sizeof(char), strlen(DS_TAG_SCORE), profileFile); - status = serializer(profile->devices+i, &serializedScore, - &serializedScoreSize); - if (status == DS_SUCCESS && serializedScore!=NULL && serializedScoreSize > 0) { - fwrite(serializedScore, sizeof(char), serializedScoreSize, profileFile); - free(serializedScore); - } - fwrite(DS_TAG_SCORE_END, sizeof(char), strlen(DS_TAG_SCORE_END), profileFile); - fwrite(DS_TAG_DEVICE_END, sizeof(char), strlen(DS_TAG_DEVICE_END), profileFile); - fwrite("\n",sizeof(char),1,profileFile); - } - fclose(profileFile); - } - return status; -} - - -static ds_status readProFile(const char* fileName, char** content, - size_t* contentSize) { - FILE * input = NULL; - size_t size = 0; - char* binary = NULL; - - *contentSize = 0; - *content = NULL; - - input = fopen(fileName, "rb"); - if(input == NULL) { - return DS_FILE_ERROR; - } - - fseek(input, 0L, SEEK_END); - size = ftell(input); - rewind(input); - binary = (char*)malloc(size); - if(binary == NULL) { - fclose(input); - return DS_FILE_ERROR; - } - fread(binary, sizeof(char), size, input); - fclose(input); - - *contentSize = size; - *content = binary; - return DS_SUCCESS; -} - - -static const char* findString(const char* contentStart, const char* contentEnd, - const char* string) { - size_t stringLength; - const char* currentPosition; - const char* found; - found = NULL; - stringLength = strlen(string); - currentPosition = contentStart; - for(currentPosition = contentStart; currentPosition < contentEnd; currentPosition++) { - if (*currentPosition == string[0]) { - if (currentPosition+stringLength < contentEnd) { - if (strncmp(currentPosition, string, stringLength) == 0) { - found = currentPosition; - break; - } - } - } - } - return found; -} - - -typedef ds_status (*ds_score_deserializer)(ds_device* device, - const unsigned char* serializedScore, - unsigned int serializedScoreSize); -static ds_status readProfileFromFile(ds_profile* profile, - ds_score_deserializer 
deserializer, - const char* file) { - - ds_status status = DS_SUCCESS; - char* contentStart = NULL; - const char* contentEnd = NULL; - size_t contentSize; - - if (profile==NULL) - return DS_INVALID_PROFILE; - - status = readProFile(file, &contentStart, &contentSize); - if (status == DS_SUCCESS) { - const char* currentPosition; - const char* dataStart; - const char* dataEnd; - size_t versionStringLength; - - contentEnd = contentStart + contentSize; - currentPosition = contentStart; - - - // parse the version string - dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION); - if (dataStart == NULL) { - status = DS_PROFILE_FILE_ERROR; - goto cleanup; - } - dataStart += strlen(DS_TAG_VERSION); - - dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END); - if (dataEnd==NULL) { - status = DS_PROFILE_FILE_ERROR; - goto cleanup; - } - - versionStringLength = strlen(profile->version); - if (versionStringLength!=(dataEnd-dataStart) - || strncmp(profile->version, dataStart, versionStringLength)!=0) { - // version mismatch - status = DS_PROFILE_FILE_ERROR; - goto cleanup; - } - currentPosition = dataEnd+strlen(DS_TAG_VERSION_END); - - // parse the device information - while (1) { - unsigned int i; - - const char* deviceTypeStart; - const char* deviceTypeEnd; - ds_device_type deviceType; - - const char* deviceNameStart; - const char* deviceNameEnd; - - const char* deviceScoreStart; - const char* deviceScoreEnd; - - const char* deviceDriverStart; - const char* deviceDriverEnd; - - dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE); - if (dataStart==NULL) { - // nothing useful remain, quit... 
- break; - } - dataStart+=strlen(DS_TAG_DEVICE); - dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END); - if (dataEnd==NULL) { - status = DS_PROFILE_FILE_ERROR; - goto cleanup; - } - - // parse the device type - deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE); - if (deviceTypeStart==NULL) { - status = DS_PROFILE_FILE_ERROR; - goto cleanup; - } - deviceTypeStart+=strlen(DS_TAG_DEVICE_TYPE); - deviceTypeEnd = findString(deviceTypeStart, contentEnd, - DS_TAG_DEVICE_TYPE_END); - if (deviceTypeEnd==NULL) { - status = DS_PROFILE_FILE_ERROR; - goto cleanup; - } - memcpy(&deviceType, deviceTypeStart, sizeof(ds_device_type)); - - - // parse the device name - if (deviceType == DS_DEVICE_OPENCL_DEVICE) { - - deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME); - if (deviceNameStart==NULL) { - status = DS_PROFILE_FILE_ERROR; - goto cleanup; - } - deviceNameStart+=strlen(DS_TAG_DEVICE_NAME); - deviceNameEnd = findString(deviceNameStart, contentEnd, - DS_TAG_DEVICE_NAME_END); - if (deviceNameEnd==NULL) { - status = DS_PROFILE_FILE_ERROR; - goto cleanup; - } - - - deviceDriverStart = findString(dataStart, contentEnd, - DS_TAG_DEVICE_DRIVER_VERSION); - if (deviceDriverStart==NULL) { - status = DS_PROFILE_FILE_ERROR; - goto cleanup; - } - deviceDriverStart+=strlen(DS_TAG_DEVICE_DRIVER_VERSION); - deviceDriverEnd = findString(deviceDriverStart, contentEnd, - DS_TAG_DEVICE_DRIVER_VERSION_END); - if (deviceDriverEnd ==NULL) { - status = DS_PROFILE_FILE_ERROR; - goto cleanup; - } - - - // check if this device is on the system - for (i = 0; i < profile->numDevices; i++) { - if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) { - size_t actualDeviceNameLength; - size_t driverVersionLength; - - actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName); - driverVersionLength = strlen(profile->devices[i].oclDriverVersion); - if (actualDeviceNameLength == (deviceNameEnd - deviceNameStart) - && driverVersionLength == 
(deviceDriverEnd - deviceDriverStart) - && strncmp(profile->devices[i].oclDeviceName, deviceNameStart, - actualDeviceNameLength)==0 - && strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart, - driverVersionLength)==0) { - deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE); - if (deviceNameStart==NULL) { - status = DS_PROFILE_FILE_ERROR; - goto cleanup; - } - deviceScoreStart+=strlen(DS_TAG_SCORE); - deviceScoreEnd = findString(deviceScoreStart, contentEnd, - DS_TAG_SCORE_END); - status = deserializer(profile->devices+i, - (const unsigned char*)deviceScoreStart, - deviceScoreEnd-deviceScoreStart); - if (status != DS_SUCCESS) { - goto cleanup; - } - } - } - } - - } - else if (deviceType == DS_DEVICE_NATIVE_CPU) { - for (i = 0; i < profile->numDevices; i++) { - if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) { - deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE); - if (deviceScoreStart==NULL) { - status = DS_PROFILE_FILE_ERROR; - goto cleanup; - } - deviceScoreStart+=strlen(DS_TAG_SCORE); - deviceScoreEnd = findString(deviceScoreStart, contentEnd, - DS_TAG_SCORE_END); - status = deserializer(profile->devices+i, - (const unsigned char*)deviceScoreStart, - deviceScoreEnd-deviceScoreStart); - if (status != DS_SUCCESS) { - goto cleanup; - } - } - } - } - - // skip over the current one to find the next device - currentPosition = dataEnd+strlen(DS_TAG_DEVICE_END); - } - } -cleanup: - if (contentStart!=NULL) free(contentStart); - return status; -} - -static ds_status getNumDeviceWithEmptyScore(ds_profile* profile, - unsigned int* num) { - unsigned int i; - if (profile == NULL || num==NULL) - return DS_MEMORY_ERROR; - *num=0; - for (i = 0; i < profile->numDevices; i++) { - if (profile->devices[i].score == NULL) { - *num++; - } - } - return DS_SUCCESS; -} - -#endif -#endif +#endif // USE_OPENCL +#endif // DEVICE_SELECTION_H diff --git a/opencl/openclwrapper.cpp b/opencl/openclwrapper.cpp index 7f81ae34..1c5a1b50 100644 --- 
a/opencl/openclwrapper.cpp +++ b/opencl/openclwrapper.cpp @@ -1,7 +1,14 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #ifdef _WIN32 -#include #include - #else #include #include @@ -16,37 +23,41 @@ #include "thresholder.h" #if ON_APPLE -#include #include +#include #endif -/* - Convenience macro to test the version of Leptonica. -*/ -#if defined(LIBLEPT_MAJOR_VERSION) && defined(LIBLEPT_MINOR_VERSION) -# define TESSERACT_LIBLEPT_PREREQ(maj, min) \ - ((LIBLEPT_MAJOR_VERSION) > (maj) || ((LIBLEPT_MAJOR_VERSION) == (maj) && (LIBLEPT_MINOR_VERSION) >= (min))) -#else -# define TESSERACT_LIBLEPT_PREREQ(maj, min) 0 -#endif - -#if TESSERACT_LIBLEPT_PREREQ(1,73) -# define CALLOC LEPT_CALLOC -# define FREE LEPT_FREE -#endif +#define CALLOC LEPT_CALLOC +#define FREE LEPT_FREE #ifdef USE_OPENCL #include "opencl_device_selection.h" GPUEnv OpenclDevice::gpuEnv; - bool OpenclDevice::deviceIsSelected = false; ds_device OpenclDevice::selectedDevice; - int OpenclDevice::isInited = 0; +static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC; + +static const l_uint32 lmask32[] = { + 0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, 0xf8000000, 0xfc000000, + 0xfe000000, 0xff000000, 0xff800000, 0xffc00000, 0xffe00000, 0xfff00000, + 0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000, 0xffff8000, 0xffffc000, + 0xffffe000, 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00, + 0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0, 0xfffffff8, 0xfffffffc, + 0xfffffffe, 0xffffffff}; + +static const l_uint32 
rmask32[] = { + 0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f, + 0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, + 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff, + 0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff, + 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff, + 0x7fffffff, 0xffffffff}; + struct tiff_transform { int vflip; /* if non-zero, image needs a vertical fip */ int hflip; /* if non-zero, image needs a horizontal flip */ @@ -66,31 +77,550 @@ static struct tiff_transform tiff_orientation_transforms[] = { {0, 0, -1} }; -static const l_int32 MAX_PAGES_IN_TIFF_FILE = 3000; +static const l_int32 MAX_PAGES_IN_TIFF_FILE = 3000; cl_mem pixsCLBuffer, pixdCLBuffer, pixdCLIntermediate; //Morph operations buffers cl_mem pixThBuffer; //output from thresholdtopix calculation cl_int clStatus; KernelEnv rEnv; +#define DS_TAG_VERSION "" +#define DS_TAG_VERSION_END "" +#define DS_TAG_DEVICE "" +#define DS_TAG_DEVICE_END "" +#define DS_TAG_SCORE "" +#define DS_TAG_SCORE_END "" +#define DS_TAG_DEVICE_TYPE "" +#define DS_TAG_DEVICE_TYPE_END "" +#define DS_TAG_DEVICE_NAME "" +#define DS_TAG_DEVICE_NAME_END "" +#define DS_TAG_DEVICE_DRIVER_VERSION "" +#define DS_TAG_DEVICE_DRIVER_VERSION_END "" + +#define DS_DEVICE_NATIVE_CPU_STRING "native_cpu" + +#define DS_DEVICE_NAME_LENGTH 256 + +typedef enum { DS_EVALUATE_ALL, DS_EVALUATE_NEW_ONLY } ds_evaluation_type; + +typedef struct { + unsigned int numDevices; + ds_device *devices; + const char *version; +} ds_profile; + +typedef enum { + DS_SUCCESS = 0, + DS_INVALID_PROFILE = 1000, + DS_MEMORY_ERROR, + DS_INVALID_PERF_EVALUATOR_TYPE, + DS_INVALID_PERF_EVALUATOR, + DS_PERF_EVALUATOR_ERROR, + DS_FILE_ERROR, + DS_UNKNOWN_DEVICE_TYPE, + DS_PROFILE_FILE_ERROR, + DS_SCORE_SERIALIZER_ERROR, + DS_SCORE_DESERIALIZER_ERROR +} ds_status; + +// Pointer to a function that calculates the score of a device (ex: +// device->score) 
update the data size of score. The encoding and the format +// of the score data is implementation defined. The function should return +// DS_SUCCESS if there's no error to be reported. +typedef ds_status (*ds_perf_evaluator)(ds_device *device, void *data); + +// deallocate memory used by score +typedef ds_status (*ds_score_release)(void *score); +static ds_status releaseDSProfile(ds_profile *profile, ds_score_release sr) { + ds_status status = DS_SUCCESS; + if (profile != nullptr) { + if (profile->devices != nullptr && sr != nullptr) { + unsigned int i; + for (i = 0; i < profile->numDevices; i++) { + free(profile->devices[i].oclDeviceName); + free(profile->devices[i].oclDriverVersion); + status = sr(profile->devices[i].score); + if (status != DS_SUCCESS) break; + } + free(profile->devices); + } + free(profile); + } + return status; +} + +static ds_status initDSProfile(ds_profile **p, const char *version) { + int numDevices; + cl_uint numPlatforms; + cl_platform_id *platforms = nullptr; + cl_device_id *devices = nullptr; + ds_status status = DS_SUCCESS; + unsigned int next; + unsigned int i; + + if (p == nullptr) return DS_INVALID_PROFILE; + + ds_profile *profile = (ds_profile *)malloc(sizeof(ds_profile)); + if (profile == nullptr) return DS_MEMORY_ERROR; + + memset(profile, 0, sizeof(ds_profile)); + + clGetPlatformIDs(0, nullptr, &numPlatforms); + + if (numPlatforms > 0) { + platforms = (cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id)); + if (platforms == nullptr) { + status = DS_MEMORY_ERROR; + goto cleanup; + } + clGetPlatformIDs(numPlatforms, platforms, nullptr); + } + + numDevices = 0; + for (i = 0; i < (unsigned int)numPlatforms; i++) { + cl_uint num; + clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, nullptr, &num); + numDevices += num; + } + + if (numDevices > 0) { + devices = (cl_device_id *)malloc(numDevices * sizeof(cl_device_id)); + if (devices == nullptr) { + status = DS_MEMORY_ERROR; + goto cleanup; + } + } + + profile->numDevices = + 
numDevices + 1; // +1 to numDevices to include the native CPU + profile->devices = + (ds_device *)malloc(profile->numDevices * sizeof(ds_device)); + if (profile->devices == nullptr) { + profile->numDevices = 0; + status = DS_MEMORY_ERROR; + goto cleanup; + } + memset(profile->devices, 0, profile->numDevices * sizeof(ds_device)); + + next = 0; + for (i = 0; i < (unsigned int)numPlatforms; i++) { + cl_uint num; + unsigned j; + clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, devices, &num); + for (j = 0; j < num; j++, next++) { + char buffer[DS_DEVICE_NAME_LENGTH]; + size_t length; + + profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE; + profile->devices[next].oclDeviceID = devices[j]; + + clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME, + DS_DEVICE_NAME_LENGTH, &buffer, nullptr); + length = strlen(buffer); + profile->devices[next].oclDeviceName = (char *)malloc(length + 1); + memcpy(profile->devices[next].oclDeviceName, buffer, length + 1); + + clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION, + DS_DEVICE_NAME_LENGTH, &buffer, nullptr); + length = strlen(buffer); + profile->devices[next].oclDriverVersion = (char *)malloc(length + 1); + memcpy(profile->devices[next].oclDriverVersion, buffer, length + 1); + } + } + profile->devices[next].type = DS_DEVICE_NATIVE_CPU; + profile->version = version; + +cleanup: + free(platforms); + free(devices); + if (status == DS_SUCCESS) { + *p = profile; + } else { + if (profile) { + free(profile->devices); + free(profile); + } + } + return status; +} + +static ds_status profileDevices(ds_profile *profile, + const ds_evaluation_type type, + ds_perf_evaluator evaluator, + void *evaluatorData, unsigned int *numUpdates) { + ds_status status = DS_SUCCESS; + unsigned int i; + unsigned int updates = 0; + + if (profile == nullptr) { + return DS_INVALID_PROFILE; + } + if (evaluator == nullptr) { + return DS_INVALID_PERF_EVALUATOR; + } + + for (i = 0; i < profile->numDevices; i++) { + 
ds_status evaluatorStatus; + + switch (type) { + case DS_EVALUATE_NEW_ONLY: + if (profile->devices[i].score != nullptr) break; + // else fall through + case DS_EVALUATE_ALL: + evaluatorStatus = evaluator(profile->devices + i, evaluatorData); + if (evaluatorStatus != DS_SUCCESS) { + status = evaluatorStatus; + return status; + } + updates++; + break; + default: + return DS_INVALID_PERF_EVALUATOR_TYPE; + break; + }; + } + if (numUpdates) *numUpdates = updates; + return status; +} + +static const char *findString(const char *contentStart, const char *contentEnd, + const char *string) { + size_t stringLength; + const char *currentPosition; + const char *found = nullptr; + stringLength = strlen(string); + currentPosition = contentStart; + for (currentPosition = contentStart; currentPosition < contentEnd; + currentPosition++) { + if (*currentPosition == string[0]) { + if (currentPosition + stringLength < contentEnd) { + if (strncmp(currentPosition, string, stringLength) == 0) { + found = currentPosition; + break; + } + } + } + } + return found; +} + +static ds_status readProFile(const char *fileName, char **content, + size_t *contentSize) { + size_t size = 0; + + *contentSize = 0; + *content = nullptr; + + FILE *input = fopen(fileName, "rb"); + if (input == nullptr) { + return DS_FILE_ERROR; + } + + fseek(input, 0L, SEEK_END); + size = ftell(input); + rewind(input); + char *binary = (char *)malloc(size); + if (binary == nullptr) { + fclose(input); + return DS_FILE_ERROR; + } + fread(binary, sizeof(char), size, input); + fclose(input); + + *contentSize = size; + *content = binary; + return DS_SUCCESS; +} + +typedef ds_status (*ds_score_deserializer)(ds_device *device, + const unsigned char *serializedScore, + unsigned int serializedScoreSize); + +static ds_status readProfileFromFile(ds_profile *profile, + ds_score_deserializer deserializer, + const char *file) { + ds_status status = DS_SUCCESS; + char *contentStart = nullptr; + const char *contentEnd = nullptr; + size_t 
contentSize; + + if (profile == nullptr) return DS_INVALID_PROFILE; + + status = readProFile(file, &contentStart, &contentSize); + if (status == DS_SUCCESS) { + const char *currentPosition; + const char *dataStart; + const char *dataEnd; + size_t versionStringLength; + + contentEnd = contentStart + contentSize; + currentPosition = contentStart; + + // parse the version string + dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION); + if (dataStart == nullptr) { + status = DS_PROFILE_FILE_ERROR; + goto cleanup; + } + dataStart += strlen(DS_TAG_VERSION); + + dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END); + if (dataEnd == nullptr) { + status = DS_PROFILE_FILE_ERROR; + goto cleanup; + } + + versionStringLength = strlen(profile->version); + if (versionStringLength != (dataEnd - dataStart) || + strncmp(profile->version, dataStart, versionStringLength) != 0) { + // version mismatch + status = DS_PROFILE_FILE_ERROR; + goto cleanup; + } + currentPosition = dataEnd + strlen(DS_TAG_VERSION_END); + + // parse the device information + while (1) { + unsigned int i; + + const char *deviceTypeStart; + const char *deviceTypeEnd; + ds_device_type deviceType; + + const char *deviceNameStart; + const char *deviceNameEnd; + + const char *deviceScoreStart; + const char *deviceScoreEnd; + + const char *deviceDriverStart; + const char *deviceDriverEnd; + + dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE); + if (dataStart == nullptr) { + // nothing useful remain, quit... 
+ break; + } + dataStart += strlen(DS_TAG_DEVICE); + dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END); + if (dataEnd == nullptr) { + status = DS_PROFILE_FILE_ERROR; + goto cleanup; + } + + // parse the device type + deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE); + if (deviceTypeStart == nullptr) { + status = DS_PROFILE_FILE_ERROR; + goto cleanup; + } + deviceTypeStart += strlen(DS_TAG_DEVICE_TYPE); + deviceTypeEnd = + findString(deviceTypeStart, contentEnd, DS_TAG_DEVICE_TYPE_END); + if (deviceTypeEnd == nullptr) { + status = DS_PROFILE_FILE_ERROR; + goto cleanup; + } + memcpy(&deviceType, deviceTypeStart, sizeof(ds_device_type)); + + // parse the device name + if (deviceType == DS_DEVICE_OPENCL_DEVICE) { + deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME); + if (deviceNameStart == nullptr) { + status = DS_PROFILE_FILE_ERROR; + goto cleanup; + } + deviceNameStart += strlen(DS_TAG_DEVICE_NAME); + deviceNameEnd = + findString(deviceNameStart, contentEnd, DS_TAG_DEVICE_NAME_END); + if (deviceNameEnd == nullptr) { + status = DS_PROFILE_FILE_ERROR; + goto cleanup; + } + + deviceDriverStart = + findString(dataStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION); + if (deviceDriverStart == nullptr) { + status = DS_PROFILE_FILE_ERROR; + goto cleanup; + } + deviceDriverStart += strlen(DS_TAG_DEVICE_DRIVER_VERSION); + deviceDriverEnd = findString(deviceDriverStart, contentEnd, + DS_TAG_DEVICE_DRIVER_VERSION_END); + if (deviceDriverEnd == nullptr) { + status = DS_PROFILE_FILE_ERROR; + goto cleanup; + } + + // check if this device is on the system + for (i = 0; i < profile->numDevices; i++) { + if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) { + size_t actualDeviceNameLength; + size_t driverVersionLength; + + actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName); + driverVersionLength = strlen(profile->devices[i].oclDriverVersion); + if (actualDeviceNameLength == (deviceNameEnd - deviceNameStart) 
&& + driverVersionLength == (deviceDriverEnd - deviceDriverStart) && + strncmp(profile->devices[i].oclDeviceName, deviceNameStart, + actualDeviceNameLength) == 0 && + strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart, + driverVersionLength) == 0) { + deviceScoreStart = + findString(dataStart, contentEnd, DS_TAG_SCORE); + if (deviceNameStart == nullptr) { + status = DS_PROFILE_FILE_ERROR; + goto cleanup; + } + deviceScoreStart += strlen(DS_TAG_SCORE); + deviceScoreEnd = + findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END); + status = deserializer(profile->devices + i, + (const unsigned char *)deviceScoreStart, + deviceScoreEnd - deviceScoreStart); + if (status != DS_SUCCESS) { + goto cleanup; + } + } + } + } + } else if (deviceType == DS_DEVICE_NATIVE_CPU) { + for (i = 0; i < profile->numDevices; i++) { + if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) { + deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE); + if (deviceScoreStart == nullptr) { + status = DS_PROFILE_FILE_ERROR; + goto cleanup; + } + deviceScoreStart += strlen(DS_TAG_SCORE); + deviceScoreEnd = + findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END); + status = deserializer(profile->devices + i, + (const unsigned char *)deviceScoreStart, + deviceScoreEnd - deviceScoreStart); + if (status != DS_SUCCESS) { + goto cleanup; + } + } + } + } + + // skip over the current one to find the next device + currentPosition = dataEnd + strlen(DS_TAG_DEVICE_END); + } + } +cleanup: + free(contentStart); + return status; +} + +typedef ds_status (*ds_score_serializer)(ds_device *device, + void **serializedScore, + unsigned int *serializedScoreSize); +static ds_status writeProfileToFile(ds_profile *profile, + ds_score_serializer serializer, + const char *file) { + ds_status status = DS_SUCCESS; + + if (profile == nullptr) return DS_INVALID_PROFILE; + + FILE *profileFile = fopen(file, "wb"); + if (profileFile == nullptr) { + status = DS_FILE_ERROR; + } else { + unsigned int 
i; + + // write version string + fwrite(DS_TAG_VERSION, sizeof(char), strlen(DS_TAG_VERSION), profileFile); + fwrite(profile->version, sizeof(char), strlen(profile->version), + profileFile); + fwrite(DS_TAG_VERSION_END, sizeof(char), strlen(DS_TAG_VERSION_END), + profileFile); + fwrite("\n", sizeof(char), 1, profileFile); + + for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) { + void *serializedScore; + unsigned int serializedScoreSize; + + fwrite(DS_TAG_DEVICE, sizeof(char), strlen(DS_TAG_DEVICE), profileFile); + + fwrite(DS_TAG_DEVICE_TYPE, sizeof(char), strlen(DS_TAG_DEVICE_TYPE), + profileFile); + fwrite(&profile->devices[i].type, sizeof(ds_device_type), 1, profileFile); + fwrite(DS_TAG_DEVICE_TYPE_END, sizeof(char), + strlen(DS_TAG_DEVICE_TYPE_END), profileFile); + + switch (profile->devices[i].type) { + case DS_DEVICE_NATIVE_CPU: { + // There's no need to emit a device name for the native CPU device. + /* + fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME), + profileFile); + fwrite(DS_DEVICE_NATIVE_CPU_STRING,sizeof(char), + strlen(DS_DEVICE_NATIVE_CPU_STRING), profileFile); + fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char), + strlen(DS_TAG_DEVICE_NAME_END), profileFile); + */ + } break; + case DS_DEVICE_OPENCL_DEVICE: { + fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME), + profileFile); + fwrite(profile->devices[i].oclDeviceName, sizeof(char), + strlen(profile->devices[i].oclDeviceName), profileFile); + fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char), + strlen(DS_TAG_DEVICE_NAME_END), profileFile); + + fwrite(DS_TAG_DEVICE_DRIVER_VERSION, sizeof(char), + strlen(DS_TAG_DEVICE_DRIVER_VERSION), profileFile); + fwrite(profile->devices[i].oclDriverVersion, sizeof(char), + strlen(profile->devices[i].oclDriverVersion), profileFile); + fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END, sizeof(char), + strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile); + } break; + default: + status = DS_UNKNOWN_DEVICE_TYPE; + break; + }; + + 
fwrite(DS_TAG_SCORE, sizeof(char), strlen(DS_TAG_SCORE), profileFile); + status = serializer(profile->devices + i, &serializedScore, + &serializedScoreSize); + if (status == DS_SUCCESS && serializedScore != nullptr && + serializedScoreSize > 0) { + fwrite(serializedScore, sizeof(char), serializedScoreSize, profileFile); + free(serializedScore); + } + fwrite(DS_TAG_SCORE_END, sizeof(char), strlen(DS_TAG_SCORE_END), + profileFile); + fwrite(DS_TAG_DEVICE_END, sizeof(char), strlen(DS_TAG_DEVICE_END), + profileFile); + fwrite("\n", sizeof(char), 1, profileFile); + } + fclose(profileFile); + } + return status; +} + // substitute invalid characters in device name with _ void legalizeFileName( char *fileName) { //printf("fileName: %s\n", fileName); - const char* invalidChars = "/\?:*\"><| "; // space is valid but can cause headaches + const char *invalidChars = + "/\?:*\"><| "; // space is valid but can cause headaches // for each invalid char for (int i = 0; i < strlen(invalidChars); i++) { char invalidStr[4]; invalidStr[0] = invalidChars[i]; - invalidStr[1] = NULL; + invalidStr[1] = '\0'; //printf("eliminating %s\n", invalidStr); //char *pos = strstr(fileName, invalidStr); // initial ./ is valid for present directory //if (*pos == '.') pos++; //if (*pos == '/') pos++; - for ( char *pos = strstr(fileName, invalidStr); pos != NULL; pos = strstr(pos+1, invalidStr)) { - //printf("\tfound: %s, ", pos); - pos[0] = '_'; - //printf("fileName: %s\n", fileName); + for (char *pos = strstr(fileName, invalidStr); pos != nullptr; + pos = strstr(pos + 1, invalidStr)) { + // printf("\tfound: %s, ", pos); + pos[0] = '_'; + // printf("fileName: %s\n", fileName); } } } @@ -103,39 +633,41 @@ void populateGPUEnvFromDevice( GPUEnv *gpuInfo, cl_device_id device ) { gpuInfo->mpDevID = device; gpuInfo->mpArryDevsID = new cl_device_id[1]; gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID; - clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE , sizeof(cl_device_type), (void *) 
&gpuInfo->mDevType , &size); + clStatus = + clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE, + sizeof(cl_device_type), &gpuInfo->mDevType, &size); CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(TYPE)"); // platform - clStatus = clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM , sizeof(cl_platform_id), (void *) &gpuInfo->mpPlatformID , &size); + clStatus = + clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM, + sizeof(cl_platform_id), &gpuInfo->mpPlatformID, &size); CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)"); // context cl_context_properties props[3]; props[0] = CL_CONTEXT_PLATFORM; props[1] = (cl_context_properties) gpuInfo->mpPlatformID; props[2] = 0; - gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, NULL, NULL, &clStatus); + gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, nullptr, + nullptr, &clStatus); CHECK_OPENCL( clStatus, "populateGPUEnv::createContext"); // queue cl_command_queue_properties queueProperties = 0; gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus ); CHECK_OPENCL( clStatus, "populateGPUEnv::createCommandQueue"); - } int OpenclDevice::LoadOpencl() { #ifdef WIN32 - HINSTANCE HOpenclDll = NULL; - void * OpenclDll = NULL; - //fprintf(stderr, " LoadOpenclDllxx... \n"); - OpenclDll = static_cast( HOpenclDll ); - OpenclDll = LoadLibrary( "openCL.dll" ); - if ( !static_cast( OpenclDll ) ) - { - fprintf(stderr, "[OD] Load opencl.dll failed!\n"); - FreeLibrary( static_cast( OpenclDll ) ); - return 0; - + HINSTANCE HOpenclDll = nullptr; + void *OpenclDll = nullptr; + // fprintf(stderr, " LoadOpenclDllxx... 
\n"); + OpenclDll = static_cast(HOpenclDll); + OpenclDll = LoadLibrary("openCL.dll"); + if (!static_cast(OpenclDll)) { + fprintf(stderr, "[OD] Load opencl.dll failed!\n"); + FreeLibrary(static_cast(OpenclDll)); + return 0; } fprintf(stderr, "[OD] Load opencl.dll successful!\n"); #endif @@ -158,60 +690,57 @@ cl_mem allocateZeroCopyBuffer(KernelEnv rEnv, l_uint32 *hostbuffer, size_t nElem return membuffer; } -PIX* mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, PIX* pixd, PIX* pixs, int elements, cl_mem_flags flags, bool memcopy = false, bool sync = true) -{ - PROCNAME("mapOutputCLBuffer"); - if (!pixd) - { - if (memcopy) - { - if ((pixd = pixCreateTemplate(pixs)) == NULL) - (PIX *)ERROR_PTR("pixd not made", procName, NULL); - } - else - { - if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs), pixGetDepth(pixs))) == NULL) - (PIX *)ERROR_PTR("pixd not made", procName, NULL); - } +PIX *mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, PIX *pixd, PIX *pixs, + int elements, cl_mem_flags flags, bool memcopy = false, + bool sync = true) { + PROCNAME("mapOutputCLBuffer"); + if (!pixd) { + if (memcopy) { + if ((pixd = pixCreateTemplate(pixs)) == nullptr) + (PIX *)ERROR_PTR("pixd not made", procName, nullptr); + } else { + if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs), + pixGetDepth(pixs))) == nullptr) + (PIX *)ERROR_PTR("pixd not made", procName, nullptr); } - l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer(rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0, - elements * sizeof(l_uint32), 0, NULL, NULL, NULL ); + } + l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer( + rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0, + elements * sizeof(l_uint32), 0, nullptr, nullptr, nullptr); - if (memcopy) - { - memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32)); - } - else - { - pixSetData(pixd, pValues); - } + if (memcopy) { + memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32)); + } else { + pixSetData(pixd, pValues); + } - 
clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,clbuffer,pValues,0,NULL,NULL); + clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, clbuffer, pValues, 0, nullptr, + nullptr); - if (sync) - { - clFinish( rEnv.mpkCmdQueue ); - } + if (sync) { + clFinish(rEnv.mpkCmdQueue); + } - return pixd; + return pixd; } cl_mem allocateIntBuffer( KernelEnv rEnv, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus , bool sync = false) { - cl_mem xValues = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (CL_MEM_READ_WRITE), - nElements * sizeof(l_int32), NULL, pStatus); + cl_mem xValues = + clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(CL_MEM_READ_WRITE), + nElements * sizeof(l_int32), nullptr, pStatus); - if (_pValues != NULL) - { - l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0, - nElements * sizeof(l_int32), 0, NULL, NULL, NULL ); + if (_pValues != nullptr) { + l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( + rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0, + nElements * sizeof(l_int32), 0, nullptr, nullptr, nullptr); - memcpy(pValues, _pValues, nElements * sizeof(l_int32)); + memcpy(pValues, _pValues, nElements * sizeof(l_int32)); - clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL); + clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, xValues, pValues, 0, nullptr, + nullptr); - if (sync) - clFinish( rEnv.mpkCmdQueue ); + if (sync) clFinish(rEnv.mpkCmdQueue); } return xValues; @@ -220,26 +749,25 @@ PIX* mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, PIX* pixd, PIX* pixs, in void OpenclDevice::releaseMorphCLBuffers() { - if (pixdCLIntermediate != NULL) - clReleaseMemObject(pixdCLIntermediate); - if (pixsCLBuffer != NULL) - clReleaseMemObject(pixsCLBuffer); - if (pixdCLBuffer != NULL) - clReleaseMemObject(pixdCLBuffer); - if (pixThBuffer != NULL) - clReleaseMemObject(pixThBuffer); + if (pixdCLIntermediate != nullptr) clReleaseMemObject(pixdCLIntermediate); + if (pixsCLBuffer != nullptr) 
clReleaseMemObject(pixsCLBuffer); + if (pixdCLBuffer != nullptr) clReleaseMemObject(pixdCLBuffer); + if (pixThBuffer != nullptr) clReleaseMemObject(pixThBuffer); + pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer = nullptr; } int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, PIX* pixs) { SetKernelEnv( &rEnv ); - if (pixThBuffer != NULL) - { - pixsCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus); + if (pixThBuffer != nullptr) { + pixsCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, + CL_MEM_ALLOC_HOST_PTR, &clStatus); - //Get the output from ThresholdToPix operation - clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0, sizeof(l_uint32) * wpl*h, 0, NULL, NULL); + // Get the output from ThresholdToPix operation + clStatus = + clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0, + sizeof(l_uint32) * wpl * h, 0, nullptr, nullptr); } else { @@ -250,9 +778,11 @@ int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, PIX* pixs) pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl*h, CL_MEM_USE_HOST_PTR, &clStatus); } - pixdCLBuffer = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus); + pixdCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h, + CL_MEM_ALLOC_HOST_PTR, &clStatus); - pixdCLIntermediate = allocateZeroCopyBuffer(rEnv, NULL, wpl*h, CL_MEM_ALLOC_HOST_PTR, &clStatus); + pixdCLIntermediate = allocateZeroCopyBuffer( + rEnv, nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus); return (int)clStatus; } @@ -271,7 +801,6 @@ PERF_COUNT_SUB("LoadOpencl") #endif // sets up environment, compiles programs - InitOpenclRunEnv_DeviceSelection( 0 ); //PERF_COUNT_SUB("called InitOpenclRunEnv_DS") //PERF_COUNT_END @@ -310,7 +839,6 @@ int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) { //PERF_COUNT_START("InitOpenclRunEnv_DS") if (!isInited) { // after programs compiled, selects best device - 
//printf("[DS] InitOpenclRunEnv_DS::Calling performDeviceSelection()\n"); ds_device bestDevice_DS = getDeviceSelection( ); //PERF_COUNT_SUB("called getDeviceSelection()") cl_device_id bestDevice = bestDevice_DS.oclDeviceID; @@ -359,22 +887,22 @@ int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo ) { clStatus = clReleaseProgram( gpuEnv.mpArryPrograms[i] ); CHECK_OPENCL( clStatus, "clReleaseProgram" ); - gpuEnv.mpArryPrograms[i] = NULL; + gpuEnv.mpArryPrograms[i] = nullptr; } } if ( gpuEnv.mpCmdQueue ) { clReleaseCommandQueue( gpuEnv.mpCmdQueue ); - gpuEnv.mpCmdQueue = NULL; + gpuEnv.mpCmdQueue = nullptr; } if ( gpuEnv.mpContext ) { clReleaseContext( gpuEnv.mpContext ); - gpuEnv.mpContext = NULL; + gpuEnv.mpContext = nullptr; } isInited = 0; gpuInfo->mnIsUserCreated = 0; - free( gpuInfo->mpArryDevsID ); + delete[] gpuInfo->mpArryDevsID; return 1; } int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle ) @@ -382,22 +910,22 @@ int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle ) unsigned int i = 0; cl_int clStatus; int status = 0; - char *str = NULL; - FILE *fd = NULL; - char fileName[256] = { 0 }, cl_name[128] = { 0 }; + char *str = nullptr; + FILE *fd = nullptr; + char fileName[256] = {0}, cl_name[128] = {0}; char deviceName[1024]; - clStatus = clGetDeviceInfo( gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, sizeof(deviceName), deviceName, NULL ); - CHECK_OPENCL( clStatus, "clGetDeviceInfo" ); - str = (char*) strstr( clFileName, (char*) ".cl" ); - memcpy( cl_name, clFileName, str - clFileName ); + clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME, + sizeof(deviceName), deviceName, nullptr); + CHECK_OPENCL(clStatus, "clGetDeviceInfo"); + str = (char *)strstr(clFileName, (char *)".cl"); + memcpy(cl_name, clFileName, str - clFileName); cl_name[str - clFileName] = '\0'; - sprintf( fileName, "%s-%s.bin", cl_name, deviceName ); + sprintf(fileName, "%s-%s.bin", cl_name, deviceName); legalizeFileName(fileName); - fd = 
fopen( fileName, "rb" ); - status = ( fd != NULL ) ? 1 : 0; - if ( fd != NULL ) - { - *fhandle = fd; + fd = fopen(fileName, "rb"); + status = (fd != nullptr) ? 1 : 0; + if (fd != nullptr) { + *fhandle = fd; } return status; @@ -409,9 +937,8 @@ int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * cl { if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 ) { - if ( gpuEnvCached->mpArryPrograms[i] != NULL ) - { - return 1; + if (gpuEnvCached->mpArryPrograms[i] != nullptr) { + return 1; } } } @@ -420,11 +947,10 @@ int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * cl } int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes ) { - FILE *output = NULL; - output = fopen( fileName, "wb" ); - if ( output == NULL ) - { - return 0; + FILE *output = nullptr; + output = fopen(fileName, "wb"); + if (output == nullptr) { + return 0; } fwrite( birary, sizeof(char), numBytes, output ); @@ -437,36 +963,37 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c { unsigned int i = 0; cl_int clStatus; - size_t *binarySizes, numDevices=0; + size_t *binarySizes; + cl_uint numDevices; cl_device_id *mpArryDevsID; - char **binaries, *str = NULL; + char **binaries, *str = nullptr; - clStatus = clGetProgramInfo( program, CL_PROGRAM_NUM_DEVICES, - sizeof(numDevices), &numDevices, NULL ); + clStatus = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES, + sizeof(numDevices), &numDevices, nullptr); CHECK_OPENCL( clStatus, "clGetProgramInfo" ); mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices ); - if ( mpArryDevsID == NULL ) - { - return 0; + if (mpArryDevsID == nullptr) { + return 0; } /* grab the handles to all of the devices in the program. 
*/ - clStatus = clGetProgramInfo( program, CL_PROGRAM_DEVICES, - sizeof(cl_device_id) * numDevices, mpArryDevsID, NULL ); + clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES, + sizeof(cl_device_id) * numDevices, mpArryDevsID, + nullptr); CHECK_OPENCL( clStatus, "clGetProgramInfo" ); /* figure out the sizes of each of the binaries. */ binarySizes = (size_t*) malloc( sizeof(size_t) * numDevices ); - clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARY_SIZES, - sizeof(size_t) * numDevices, binarySizes, NULL ); + clStatus = + clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES, + sizeof(size_t) * numDevices, binarySizes, nullptr); CHECK_OPENCL( clStatus, "clGetProgramInfo" ); /* copy over all of the generated binaries. */ binaries = (char**) malloc( sizeof(char *) * numDevices ); - if ( binaries == NULL ) - { - return 0; + if (binaries == nullptr) { + return 0; } for ( i = 0; i < numDevices; i++ ) @@ -474,19 +1001,18 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c if ( binarySizes[i] != 0 ) { binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] ); - if ( binaries[i] == NULL ) - { - return 0; + if (binaries[i] == nullptr) { + return 0; } } else { - binaries[i] = NULL; + binaries[i] = nullptr; } } - clStatus = clGetProgramInfo( program, CL_PROGRAM_BINARIES, - sizeof(char *) * numDevices, binaries, NULL ); + clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARIES, + sizeof(char *) * numDevices, binaries, nullptr); CHECK_OPENCL(clStatus,"clGetProgramInfo"); /* dump out each binary into its own separate file. 
*/ @@ -498,7 +1024,7 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c { char deviceName[1024]; clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME, - sizeof(deviceName), deviceName, NULL); + sizeof(deviceName), deviceName, nullptr); CHECK_OPENCL( clStatus, "clGetDeviceInfo" ); str = (char*) strstr( clFileName, (char*) ".cl" ); @@ -518,59 +1044,32 @@ int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * c // Release all resouces and memory for ( i = 0; i < numDevices; i++ ) { - if ( binaries[i] != NULL ) - { - free( binaries[i] ); - binaries[i] = NULL; - } + free(binaries[i]); + binaries[i] = nullptr; } - if ( binaries != NULL ) - { - free( binaries ); - binaries = NULL; - } + free(binaries); + binaries = nullptr; - if ( binarySizes != NULL ) - { - free( binarySizes ); - binarySizes = NULL; - } + free(binarySizes); + binarySizes = nullptr; + + free(mpArryDevsID); + mpArryDevsID = nullptr; - if ( mpArryDevsID != NULL ) - { - free( mpArryDevsID ); - mpArryDevsID = NULL; - } return 1; } -void copyIntBuffer( KernelEnv rEnv, cl_mem xValues, const l_uint32 *_pValues, size_t nElements, cl_int *pStatus ) -{ - l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer( rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0, - nElements * sizeof(l_int32), 0, NULL, NULL, NULL ); - clFinish( rEnv.mpkCmdQueue ); - if (_pValues != NULL) - { - for ( int i = 0; i < (int)nElements; i++ ) - pValues[i] = (l_int32)_pValues[i]; - } - - clEnqueueUnmapMemObject(rEnv.mpkCmdQueue,xValues,pValues,0,NULL,NULL); - //clFinish( rEnv.mpkCmdQueue ); - return; -} - int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ) { //PERF_COUNT_START("CompileKernelFile") cl_int clStatus = 0; size_t length; - char *buildLog = NULL, *binary; + char *buildLog = nullptr, *binary; const char *source; size_t source_size[1]; int b_error, binary_status, binaryExisted, idx; - size_t numDevices; + cl_uint numDevices; cl_device_id 
*mpArryDevsID; FILE *fd, *fd1; const char* filename = "kernel.cl"; @@ -590,14 +1089,13 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ) //PERF_COUNT_SUB("BinaryGenerated") if ( binaryExisted == 1 ) { - clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES, - sizeof(numDevices), &numDevices, NULL ); - CHECK_OPENCL( clStatus, "clGetContextInfo" ); + clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES, + sizeof(numDevices), &numDevices, nullptr); + CHECK_OPENCL(clStatus, "clGetContextInfo"); - mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices ); - if ( mpArryDevsID == NULL ) - { - return 0; + mpArryDevsID = (cl_device_id *)malloc(sizeof(cl_device_id) * numDevices); + if (mpArryDevsID == nullptr) { + return 0; } //PERF_COUNT_SUB("get numDevices") b_error = 0; @@ -622,10 +1120,11 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ) fclose( fd ); //PERF_COUNT_SUB("read file") - fd = NULL; + fd = nullptr; // grab the handles to all of the devices in the context. 
- clStatus = clGetContextInfo( gpuInfo->mpContext, CL_CONTEXT_DEVICES, - sizeof( cl_device_id ) * numDevices, mpArryDevsID, NULL ); + clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES, + sizeof(cl_device_id) * numDevices, + mpArryDevsID, nullptr); CHECK_OPENCL( clStatus, "clGetContextInfo" ); //PERF_COUNT_SUB("get devices") //fprintf(stderr, "[OD] Create kernel from binary\n"); @@ -636,8 +1135,8 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ) //PERF_COUNT_SUB("clCreateProgramWithBinary") free( binary ); free( mpArryDevsID ); - mpArryDevsID = NULL; -//PERF_COUNT_SUB("binaryExisted") + mpArryDevsID = nullptr; + // PERF_COUNT_SUB("binaryExisted") } else { @@ -649,9 +1148,8 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ) //PERF_COUNT_SUB("!binaryExisted") } - if ( gpuInfo->mpArryPrograms[idx] == (cl_program) NULL ) - { - return 0; + if (gpuInfo->mpArryPrograms[idx] == (cl_program) nullptr) { + return 0; } //char options[512]; @@ -660,15 +1158,17 @@ int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption ) PERF_COUNT_START("OD::CompileKernel::clBuildProgram") if (!gpuInfo->mnIsUserCreated) { - clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID, - buildOption, NULL, NULL); -//PERF_COUNT_SUB("clBuildProgram notUserCreated") + clStatus = + clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID, + buildOption, nullptr, nullptr); + // PERF_COUNT_SUB("clBuildProgram notUserCreated") } else { - clStatus = clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID), - buildOption, NULL, NULL); -//PERF_COUNT_SUB("clBuildProgram isUserCreated") + clStatus = + clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID), + buildOption, nullptr, nullptr); + // PERF_COUNT_SUB("clBuildProgram isUserCreated") } PERF_COUNT_END if ( clStatus != CL_SUCCESS ) @@ -676,13 +1176,15 @@ PERF_COUNT_END printf ("BuildProgram 
error!\n"); if ( !gpuInfo->mnIsUserCreated ) { - clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0], - CL_PROGRAM_BUILD_LOG, 0, NULL, &length ); + clStatus = clGetProgramBuildInfo( + gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0], + CL_PROGRAM_BUILD_LOG, 0, nullptr, &length); } else { - clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID, - CL_PROGRAM_BUILD_LOG, 0, NULL, &length); + clStatus = clGetProgramBuildInfo( + gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID, + CL_PROGRAM_BUILD_LOG, 0, nullptr, &length); } if ( clStatus != CL_SUCCESS ) { @@ -690,9 +1192,8 @@ PERF_COUNT_END return 0; } buildLog = (char*) malloc( length ); - if ( buildLog == (char*) NULL ) - { - return 0; + if (buildLog == (char *)nullptr) { + return 0; } if ( !gpuInfo->mnIsUserCreated ) { @@ -711,10 +1212,9 @@ PERF_COUNT_END } fd1 = fopen( "kernel-build.log", "w+" ); - if ( fd1 != NULL ) - { - fwrite( buildLog, sizeof(char), length, fd1 ); - fclose( fd1 ); + if (fd1 != nullptr) { + fwrite(buildLog, sizeof(char), length, fd1); + fclose(fd1); } free( buildLog ); @@ -757,621 +1257,45 @@ PERF_COUNT_START("pixReadFromTiffKernel") l_uint32 *pResult = (l_uint32 *)malloc(w*h * sizeof(l_uint32)); rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "composeRGBPixel", &clStatus ); - CHECK_OPENCL( clStatus, "clCreateKernel"); + CHECK_OPENCL(clStatus, "clCreateKernel composeRGBPixel"); //Allocate input and output OCL buffers valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w*h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus); outputCl = allocateZeroCopyBuffer(rEnv, pResult, w*h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus); //Kernel arguments - clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&valuesCl ); + clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &valuesCl); CHECK_OPENCL( clStatus, "clSetKernelArg"); - clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(w), (void *)&w ); + clStatus 
= clSetKernelArg(rEnv.mpkKernel, 1, sizeof(w), &w); CHECK_OPENCL( clStatus, "clSetKernelArg" ); - clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(h), (void *)&h ); + clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(h), &h); CHECK_OPENCL( clStatus, "clSetKernelArg" ); - clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(wpl), (void *)&wpl ); + clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl); CHECK_OPENCL( clStatus, "clSetKernelArg" ); - clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&outputCl ); + clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outputCl); CHECK_OPENCL( clStatus, "clSetKernelArg"); //Kernel enqueue PERF_COUNT_SUB("before") - clStatus = clEnqueueNDRangeKernel( rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, NULL, globalThreads, localThreads, 0, NULL, NULL ); - CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel" ); +clStatus = + clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, + globalThreads, localThreads, 0, nullptr, nullptr); +CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel"); - /* map results back from gpu */ - void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, 0, w*h * sizeof(l_uint32), 0, NULL, NULL, &clStatus); - CHECK_OPENCL( clStatus, "clEnqueueMapBuffer outputCl"); - clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, NULL, NULL); +/* map results back from gpu */ +void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ, + 0, w * h * sizeof(l_uint32), 0, nullptr, nullptr, + &clStatus); +CHECK_OPENCL(clStatus, "clEnqueueMapBuffer outputCl"); +clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, nullptr, nullptr); - //Sync - clFinish( rEnv.mpkCmdQueue ); +// Sync +clFinish(rEnv.mpkCmdQueue); PERF_COUNT_SUB("kernel & map") PERF_COUNT_END return pResult; } -PIX * OpenclDevice::pixReadTiffCl ( const char *filename, l_int32 n ) -{ -PERF_COUNT_START("pixReadTiffCL") - FILE *fp; -PIX *pix; - - //printf("pixReadTiffCl 
file"); - PROCNAME("pixReadTiff"); - - if (!filename) - return (PIX *)ERROR_PTR("filename not defined", procName, NULL); - - if ((fp = fopenReadStream(filename)) == NULL) - return (PIX *)ERROR_PTR("image file not found", procName, NULL); - if ((pix = pixReadStreamTiffCl(fp, n)) == NULL) { - fclose(fp); - return (PIX *)ERROR_PTR("pix not read", procName, NULL); - } - fclose(fp); -PERF_COUNT_END - return pix; - -} -TIFF * -OpenclDevice::fopenTiffCl(FILE *fp, - const char *modestring) -{ -l_int32 fd; - - PROCNAME("fopenTiff"); - - if (!fp) - return (TIFF *)ERROR_PTR("stream not opened", procName, NULL); - if (!modestring) - return (TIFF *)ERROR_PTR("modestring not defined", procName, NULL); - - if ((fd = fileno(fp)) < 0) - return (TIFF *)ERROR_PTR("invalid file descriptor", procName, NULL); - lseek(fd, 0, SEEK_SET); - - return TIFFFdOpen(fd, "TIFFstream", modestring); -} -l_int32 OpenclDevice::getTiffStreamResolutionCl(TIFF *tif, - l_int32 *pxres, - l_int32 *pyres) -{ -l_uint16 resunit; -l_int32 foundxres, foundyres; -l_float32 fxres, fyres; - - PROCNAME("getTiffStreamResolution"); - - if (!tif) - return ERROR_INT("tif not opened", procName, 1); - if (!pxres || !pyres) - return ERROR_INT("&xres and &yres not both defined", procName, 1); - *pxres = *pyres = 0; - - TIFFGetFieldDefaulted(tif, TIFFTAG_RESOLUTIONUNIT, &resunit); - foundxres = TIFFGetField(tif, TIFFTAG_XRESOLUTION, &fxres); - foundyres = TIFFGetField(tif, TIFFTAG_YRESOLUTION, &fyres); - if (!foundxres && !foundyres) return 1; - if (!foundxres && foundyres) - fxres = fyres; - else if (foundxres && !foundyres) - fyres = fxres; - - if (resunit == RESUNIT_CENTIMETER) { /* convert to ppi */ - *pxres = (l_int32)(2.54 * fxres + 0.5); - *pyres = (l_int32)(2.54 * fyres + 0.5); - } - else { - *pxres = (l_int32)fxres; - *pyres = (l_int32)fyres; - } - - return 0; -} - -struct L_Memstream -{ -l_uint8 *buffer; /* expands to hold data when written to; */ - /* fixed size when read from. 
*/ -size_t bufsize; /* current size allocated when written to; */ - /* fixed size of input data when read from. */ -size_t offset; /* byte offset from beginning of buffer. */ -size_t hw; /* high-water mark; max bytes in buffer. */ -l_uint8 **poutdata; /* input param for writing; data goes here. */ -size_t *poutsize; /* input param for writing; data size goes here. */ -}; -typedef struct L_Memstream L_MEMSTREAM; - -/* These are static functions for memory I/O */ -static L_MEMSTREAM *memstreamCreateForRead(l_uint8 *indata, size_t pinsize); -static L_MEMSTREAM *memstreamCreateForWrite(l_uint8 **poutdata, - size_t *poutsize); -static tsize_t tiffReadCallback(thandle_t handle, tdata_t data, tsize_t length); -static tsize_t tiffWriteCallback(thandle_t handle, tdata_t data, - tsize_t length); -static toff_t tiffSeekCallback(thandle_t handle, toff_t offset, l_int32 whence); -static l_int32 tiffCloseCallback(thandle_t handle); -static toff_t tiffSizeCallback(thandle_t handle); -static l_int32 tiffMapCallback(thandle_t handle, tdata_t *data, toff_t *length); -static void tiffUnmapCallback(thandle_t handle, tdata_t data, toff_t length); - - -static L_MEMSTREAM * -memstreamCreateForRead(l_uint8 *indata, -size_t insize) -{ - L_MEMSTREAM *mstream; - - mstream = (L_MEMSTREAM *)CALLOC(1, sizeof(L_MEMSTREAM)); - mstream->buffer = indata; /* handle to input data array */ - mstream->bufsize = insize; /* amount of input data */ - mstream->hw = insize; /* high-water mark fixed at input data size */ - mstream->offset = 0; /* offset always starts at 0 */ - return mstream; -} - - -static L_MEMSTREAM * -memstreamCreateForWrite(l_uint8 **poutdata, -size_t *poutsize) -{ - L_MEMSTREAM *mstream; - - mstream = (L_MEMSTREAM *)CALLOC(1, sizeof(L_MEMSTREAM)); - mstream->buffer = (l_uint8 *)CALLOC(8 * 1024, 1); - mstream->bufsize = 8 * 1024; - mstream->poutdata = poutdata; /* used only at end of write */ - mstream->poutsize = poutsize; /* ditto */ - mstream->hw = mstream->offset = 0; - return 
mstream; -} - - -static tsize_t -tiffReadCallback(thandle_t handle, -tdata_t data, -tsize_t length) -{ - L_MEMSTREAM *mstream; - size_t amount; - - mstream = (L_MEMSTREAM *)handle; - amount = L_MIN((size_t)length, mstream->hw - mstream->offset); - memcpy(data, mstream->buffer + mstream->offset, amount); - mstream->offset += amount; - return amount; -} - - -static tsize_t -tiffWriteCallback(thandle_t handle, -tdata_t data, -tsize_t length) -{ - L_MEMSTREAM *mstream; - size_t newsize; - - /* reallocNew() uses calloc to initialize the array. - * If malloc is used instead, for some of the encoding methods, - * not all the data in 'bufsize' bytes in the buffer will - * have been initialized by the end of the compression. */ - mstream = (L_MEMSTREAM *)handle; - if (mstream->offset + length > mstream->bufsize) { - newsize = 2 * (mstream->offset + length); - mstream->buffer = (l_uint8 *)reallocNew((void **)&mstream->buffer, - mstream->offset, newsize); - mstream->bufsize = newsize; - } - - memcpy(mstream->buffer + mstream->offset, data, length); - mstream->offset += length; - mstream->hw = L_MAX(mstream->offset, mstream->hw); - return length; -} - - -static toff_t -tiffSeekCallback(thandle_t handle, -toff_t offset, -l_int32 whence) -{ - L_MEMSTREAM *mstream; - - PROCNAME("tiffSeekCallback"); - mstream = (L_MEMSTREAM *)handle; - switch (whence) { - case SEEK_SET: - /* fprintf(stderr, "seek_set: offset = %d\n", offset); */ - mstream->offset = offset; - break; - case SEEK_CUR: - /* fprintf(stderr, "seek_cur: offset = %d\n", offset); */ - mstream->offset += offset; - break; - case SEEK_END: - /* fprintf(stderr, "seek end: hw = %d, offset = %d\n", - mstream->hw, offset); */ - mstream->offset = mstream->hw - offset; /* offset >= 0 */ - break; - default: - return (toff_t)ERROR_INT("bad whence value", procName, - mstream->offset); - } - - return mstream->offset; -} - - -static l_int32 -tiffCloseCallback(thandle_t handle) -{ - L_MEMSTREAM *mstream; - - mstream = (L_MEMSTREAM 
*)handle; - if (mstream->poutdata) { /* writing: save the output data */ - *mstream->poutdata = mstream->buffer; - *mstream->poutsize = mstream->hw; - } - FREE(mstream); /* never free the buffer! */ - return 0; -} - - -static toff_t -tiffSizeCallback(thandle_t handle) -{ - L_MEMSTREAM *mstream; - - mstream = (L_MEMSTREAM *)handle; - return mstream->hw; -} - - -static l_int32 -tiffMapCallback(thandle_t handle, -tdata_t *data, -toff_t *length) -{ - L_MEMSTREAM *mstream; - - mstream = (L_MEMSTREAM *)handle; - *data = mstream->buffer; - *length = mstream->hw; - return 0; -} - - -static void -tiffUnmapCallback(thandle_t handle, -tdata_t data, -toff_t length) -{ - return; -} - - -/*! -* fopenTiffMemstream() -* -* Input: filename (for error output; can be "") -* operation ("w" for write, "r" for read) -* &data ( written data) -* &datasize ( size of written data) -* Return: tiff (data structure, opened for write to memory) -* -* Notes: -* (1) This wraps up a number of callbacks for either: -* * reading from tiff in memory buffer --> pix -* * writing from pix --> tiff in memory buffer -* (2) After use, the memstream is automatically destroyed when -* TIFFClose() is called. TIFFCleanup() doesn't free the memstream. 
-*/ -static TIFF * -fopenTiffMemstream(const char *filename, -const char *operation, -l_uint8 **pdata, -size_t *pdatasize) -{ - L_MEMSTREAM *mstream; - - PROCNAME("fopenTiffMemstream"); - - if (!filename) - return (TIFF *)ERROR_PTR("filename not defined", procName, NULL); - if (!operation) - return (TIFF *)ERROR_PTR("operation not defined", procName, NULL); - if (!pdata) - return (TIFF *)ERROR_PTR("&data not defined", procName, NULL); - if (!pdatasize) - return (TIFF *)ERROR_PTR("&datasize not defined", procName, NULL); - if (!strcmp(operation, "r") && !strcmp(operation, "w")) - return (TIFF *)ERROR_PTR("operation not 'r' or 'w'}", procName, NULL); - - if (!strcmp(operation, "r")) - mstream = memstreamCreateForRead(*pdata, *pdatasize); - else - mstream = memstreamCreateForWrite(pdata, pdatasize); - - return TIFFClientOpen(filename, operation, mstream, - tiffReadCallback, tiffWriteCallback, - tiffSeekCallback, tiffCloseCallback, - tiffSizeCallback, tiffMapCallback, - tiffUnmapCallback); -} - - - -PIX * -OpenclDevice::pixReadMemTiffCl(const l_uint8 *data,size_t size,l_int32 n) -{ - l_int32 i, pagefound; - PIX *pix; - TIFF *tif; - //L_MEMSTREAM *memStream; - PROCNAME("pixReadMemTiffCl"); - - if (!data) - return (PIX *)ERROR_PTR("data pointer is NULL", procName, NULL); - - if ((tif = fopenTiffMemstream("", "r", (l_uint8 **)&data, &size)) == NULL) - return (PIX *)ERROR_PTR("tif not opened", procName, NULL); - - pagefound = FALSE; - pix = NULL; - for (i = 0; i < MAX_PAGES_IN_TIFF_FILE; i++) { - if (i == n) { - pagefound = TRUE; - if ((pix = pixReadFromTiffStreamCl(tif)) == NULL) { - TIFFCleanup(tif); - return (PIX *)ERROR_PTR("pix not read", procName, NULL); - } - break; - } - if (TIFFReadDirectory(tif) == 0) - break; - } - - if (pagefound == FALSE) { - L_WARNING("tiff page %d not found", procName); - TIFFCleanup(tif); - return NULL; - } - - TIFFCleanup(tif); - return pix; -} - -PIX * -OpenclDevice::pixReadStreamTiffCl(FILE *fp, - l_int32 n) -{ -l_int32 i, pagefound; 
-PIX *pix; -TIFF *tif; - - PROCNAME("pixReadStreamTiff"); - - if (!fp) - return (PIX *)ERROR_PTR("stream not defined", procName, NULL); - - if ((tif = fopenTiffCl(fp, "rb")) == NULL) - return (PIX *)ERROR_PTR("tif not opened", procName, NULL); - - pagefound = FALSE; - pix = NULL; - for (i = 0; i < MAX_PAGES_IN_TIFF_FILE; i++) { - if (i == n) { - pagefound = TRUE; - if ((pix = pixReadFromTiffStreamCl(tif)) == NULL) { - TIFFCleanup(tif); - return (PIX *)ERROR_PTR("pix not read", procName, NULL); - } - break; - } - if (TIFFReadDirectory(tif) == 0) - break; - } - - if (pagefound == FALSE) { - L_WARNING("tiff page %d not found", procName, n); - TIFFCleanup(tif); - return NULL; - } - - TIFFCleanup(tif); - return pix; -} - -static l_int32 -getTiffCompressedFormat(l_uint16 tiffcomp) -{ -l_int32 comptype; - - switch (tiffcomp) - { - case COMPRESSION_CCITTFAX4: - comptype = IFF_TIFF_G4; - break; - case COMPRESSION_CCITTFAX3: - comptype = IFF_TIFF_G3; - break; - case COMPRESSION_CCITTRLE: - comptype = IFF_TIFF_RLE; - break; - case COMPRESSION_PACKBITS: - comptype = IFF_TIFF_PACKBITS; - break; - case COMPRESSION_LZW: - comptype = IFF_TIFF_LZW; - break; - case COMPRESSION_ADOBE_DEFLATE: - comptype = IFF_TIFF_ZIP; - break; - default: - comptype = IFF_TIFF; - break; - } - return comptype; -} - -void compare(l_uint32 *cpu, l_uint32 *gpu,int size) -{ - for(int i=0;i 32) - return (PIX *)ERROR_PTR("can't handle bpp > 32", procName, NULL); - if (spp == 1) - d = bps; - else if (spp == 3 || spp == 4) - d = 32; - else - return (PIX *)ERROR_PTR("spp not in set {1,3,4}", procName, NULL); - - TIFFGetField(tif, TIFFTAG_IMAGEWIDTH, &w); - TIFFGetField(tif, TIFFTAG_IMAGELENGTH, &h); - tiffbpl = TIFFScanlineSize(tif); - - if ((pix = pixCreate(w, h, d)) == NULL) - return (PIX *)ERROR_PTR("pix not made", procName, NULL); - data = (l_uint8 *)pixGetData(pix); - wpl = pixGetWpl(pix); - bpl = 4 * wpl; - - - if (spp == 1) { - if ((linebuf = (l_uint8 *)CALLOC(tiffbpl + 1, sizeof(l_uint8))) == NULL) - 
return (PIX *)ERROR_PTR("calloc fail for linebuf", procName, NULL); - - for (i = 0 ; i < h ; i++) { - if (TIFFReadScanline(tif, linebuf, i, 0) < 0) { - FREE(linebuf); - pixDestroy(&pix); - return (PIX *)ERROR_PTR("line read fail", procName, NULL); - } - memcpy((char *)data, (char *)linebuf, tiffbpl); - data += bpl; - } - if (bps <= 8) - pixEndianByteSwap(pix); - else - pixEndianTwoByteSwap(pix); - FREE(linebuf); - } - else { - if ((tiffdata = (l_uint32 *)CALLOC(w * h, sizeof(l_uint32))) == NULL) { - pixDestroy(&pix); - return (PIX *)ERROR_PTR("calloc fail for tiffdata", procName, NULL); - } - if (!TIFFReadRGBAImageOriented(tif, w, h, (uint32 *)tiffdata, - ORIENTATION_TOPLEFT, 0)) { - FREE(tiffdata); - pixDestroy(&pix); - return (PIX *)ERROR_PTR("failed to read tiffdata", procName, NULL); - } - line = pixGetData(pix); - - //Invoke the OpenCL kernel for pixReadFromTiff - l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line); - - pixSetData(pix, output_gpu); - // pix already has data allocated, it now points to output_gpu? 
- FREE(tiffdata); - FREE(line); - //FREE(output_gpu); - } - - if (getTiffStreamResolutionCl(tif, &xres, &yres) == 0) { - pixSetXRes(pix, xres); - pixSetYRes(pix, yres); - } - - - TIFFGetFieldDefaulted(tif, TIFFTAG_COMPRESSION, &tiffcomp); - comptype = getTiffCompressedFormat(tiffcomp); - pixSetInputFormat(pix, comptype); - - if (TIFFGetField(tif, TIFFTAG_COLORMAP, &redmap, &greenmap, &bluemap)) { - - if ((cmap = pixcmapCreate(bps)) == NULL) { - pixDestroy(&pix); - return (PIX *)ERROR_PTR("cmap not made", procName, NULL); - } - ncolors = 1 << bps; - for (i = 0; i < ncolors; i++) - pixcmapAddColor(cmap, redmap[i] >> 8, greenmap[i] >> 8, - bluemap[i] >> 8); - pixSetColormap(pix, cmap); - } - else { - if (!TIFFGetField(tif, TIFFTAG_PHOTOMETRIC, &photometry)) { - - if (tiffcomp == COMPRESSION_CCITTFAX3 || - tiffcomp == COMPRESSION_CCITTFAX4 || - tiffcomp == COMPRESSION_CCITTRLE || - tiffcomp == COMPRESSION_CCITTRLEW) { - photometry = PHOTOMETRIC_MINISWHITE; - } - else - photometry = PHOTOMETRIC_MINISBLACK; - } - if ((d == 1 && photometry == PHOTOMETRIC_MINISBLACK) || - (d == 8 && photometry == PHOTOMETRIC_MINISWHITE)) - pixInvert(pix, pix); - } - - if (TIFFGetField(tif, TIFFTAG_ORIENTATION, &orientation)) { - if (orientation >= 1 && orientation <= 8) { - struct tiff_transform *transform = - &tiff_orientation_transforms[orientation - 1]; - if (transform->vflip) pixFlipTB(pix, pix); - if (transform->hflip) pixFlipLR(pix, pix); - if (transform->rotate) { - PIX *oldpix = pix; - pix = pixRotate90(oldpix, transform->rotate); - pixDestroy(&oldpix); - } - } - } - - return pix; -} - //Morphology Dilate operation for 5x5 structuring element. 
Invokes the relevant OpenCL kernels cl_int pixDilateCL_55(l_int32 wpl, l_int32 h) @@ -1390,6 +1314,7 @@ pixDilateCL_55(l_int32 wpl, l_int32 h) localThreads[1] = GROUPSIZE_HMORY; rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_5x5", &status ); + CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_5x5"); status = clSetKernelArg(rEnv.mpkKernel, 0, @@ -1399,24 +1324,12 @@ pixDilateCL_55(l_int32 wpl, l_int32 h) 1, sizeof(cl_mem), &pixdCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 2, - sizeof(wpl), - (const void *)&wpl); - status = clSetKernelArg(rEnv.mpkKernel, - 3, - sizeof(h), - (const void *)&h); + status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h); - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, - rEnv.mpkKernel, - 2, - NULL, - globalThreads, - localThreads, - 0, - NULL, - NULL); + status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, + nullptr, globalThreads, localThreads, 0, + nullptr, nullptr); //Swap source and dest buffers pixtemp = pixsCLBuffer; @@ -1432,6 +1345,7 @@ pixDilateCL_55(l_int32 wpl, l_int32 h) localThreads[1] = GROUPSIZE_Y; rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer_5x5", &status ); + CHECK_OPENCL(status, "clCreateKernel morphoDilateVer_5x5"); status = clSetKernelArg(rEnv.mpkKernel, 0, @@ -1441,23 +1355,11 @@ pixDilateCL_55(l_int32 wpl, l_int32 h) 1, sizeof(cl_mem), &pixdCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 2, - sizeof(wpl), - (const void *)&wpl); - status = clSetKernelArg(rEnv.mpkKernel, - 3, - sizeof(h), - (const void *)&h); - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, - rEnv.mpkKernel, - 2, - NULL, - globalThreads, - localThreads, - 0, - NULL, - NULL); + status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h); + status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, + nullptr, globalThreads, 
localThreads, 0, + nullptr, nullptr); return status; } @@ -1473,8 +1375,8 @@ pixErodeCL_55(l_int32 wpl, l_int32 h) l_uint32 fwmask, lwmask; size_t localThreads[2]; - lwmask = lmask32[32 - 2]; - fwmask = rmask32[32 - 2]; + lwmask = lmask32[31 - 2]; + fwmask = rmask32[31 - 2]; //Horizontal pass gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX; @@ -1484,6 +1386,7 @@ pixErodeCL_55(l_int32 wpl, l_int32 h) localThreads[1] = GROUPSIZE_HMORY; rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_5x5", &status ); + CHECK_OPENCL(status, "clCreateKernel morphoErodeHor_5x5"); status = clSetKernelArg(rEnv.mpkKernel, 0, @@ -1493,24 +1396,12 @@ pixErodeCL_55(l_int32 wpl, l_int32 h) 1, sizeof(cl_mem), &pixdCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 2, - sizeof(wpl), - (const void *)&wpl); - status = clSetKernelArg(rEnv.mpkKernel, - 3, - sizeof(h), - (const void *)&h); + status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h); - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, - rEnv.mpkKernel, - 2, - NULL, - globalThreads, - localThreads, - 0, - NULL, - NULL); + status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, + nullptr, globalThreads, localThreads, 0, + nullptr, nullptr); //Swap source and dest buffers pixtemp = pixsCLBuffer; @@ -1526,6 +1417,7 @@ pixErodeCL_55(l_int32 wpl, l_int32 h) localThreads[1] = GROUPSIZE_Y; rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer_5x5", &status ); + CHECK_OPENCL(status, "clCreateKernel morphoErodeVer_5x5"); status = clSetKernelArg(rEnv.mpkKernel, 0, @@ -1535,31 +1427,13 @@ pixErodeCL_55(l_int32 wpl, l_int32 h) 1, sizeof(cl_mem), &pixdCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 2, - sizeof(wpl), - (const void *)&wpl); - status = clSetKernelArg(rEnv.mpkKernel, - 3, - sizeof(h), - (const void *)&h); - status = clSetKernelArg(rEnv.mpkKernel, - 4, - sizeof(fwmask), - (const void *)&fwmask); 
- status = clSetKernelArg(rEnv.mpkKernel, - 5, - sizeof(lwmask), - (const void *)&lwmask); - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, - rEnv.mpkKernel, - 2, - NULL, - globalThreads, - localThreads, - 0, - NULL, - NULL); + status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h); + status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(fwmask), &fwmask); + status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(lwmask), &lwmask); + status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, + nullptr, globalThreads, localThreads, 0, + nullptr, nullptr); return status; } @@ -1600,101 +1474,56 @@ pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) if (xp > 31 || xn > 31) { - //Generic case. - rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor", &status ); + // Generic case. + rEnv.mpkKernel = + clCreateKernel(rEnv.mpkProgram, "morphoDilateHor", &status); + CHECK_OPENCL(status, "clCreateKernel morphoDilateHor"); - status = clSetKernelArg(rEnv.mpkKernel, - 0, - sizeof(cl_mem), - &pixsCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 1, - sizeof(cl_mem), - &pixdCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 2, - sizeof(xp), - (const void *)&xp); - status = clSetKernelArg(rEnv.mpkKernel, - 3, - sizeof(xn), - (const void *)&xn); - status = clSetKernelArg(rEnv.mpkKernel, - 4, - sizeof(wpl), - (const void *)&wpl); - status = clSetKernelArg(rEnv.mpkKernel, - 5, - sizeof(h), - (const void *)&h); - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, - rEnv.mpkKernel, - 2, - NULL, - globalThreads, - localThreads, - 0, - NULL, - NULL); + status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn); + status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), 
&wpl); + status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h); + status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, + nullptr, globalThreads, localThreads, 0, + nullptr, nullptr); - if (yp > 0 || yn > 0) - { - pixtemp = pixsCLBuffer; - pixsCLBuffer = pixdCLBuffer; - pixdCLBuffer = pixtemp; + if (yp > 0 || yn > 0) { + pixtemp = pixsCLBuffer; + pixsCLBuffer = pixdCLBuffer; + pixdCLBuffer = pixtemp; } } else if (xp > 0 || xn > 0 ) { - //Specific Horizontal pass kernel for half width < 32 - rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_32word", &status ); - isEven = (xp != xn); + // Specific Horizontal pass kernel for half width < 32 + rEnv.mpkKernel = + clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_32word", &status); + CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_32word"); + isEven = (xp != xn); - status = clSetKernelArg(rEnv.mpkKernel, - 0, - sizeof(cl_mem), - &pixsCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 1, - sizeof(cl_mem), - &pixdCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 2, - sizeof(xp), - (const void *)&xp); - status = clSetKernelArg(rEnv.mpkKernel, - 3, - sizeof(wpl), - (const void *)&wpl); - status = clSetKernelArg(rEnv.mpkKernel, - 4, - sizeof(h), - (const void *)&h); - status = clSetKernelArg(rEnv.mpkKernel, - 5, - sizeof(isEven), - (const void *)&isEven); - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, - rEnv.mpkKernel, - 2, - NULL, - globalThreads, - localThreads, - 0, - NULL, - NULL); + status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl); + status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h); + status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isEven), &isEven); + status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, + nullptr, 
globalThreads, localThreads, 0, + nullptr, nullptr); - if (yp > 0 || yn > 0) - { - pixtemp = pixsCLBuffer; - pixsCLBuffer = pixdCLBuffer; - pixdCLBuffer = pixtemp; - } + if (yp > 0 || yn > 0) { + pixtemp = pixsCLBuffer; + pixsCLBuffer = pixdCLBuffer; + pixdCLBuffer = pixtemp; + } } if (yp > 0 || yn > 0) { rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer", &status ); + CHECK_OPENCL(status, "clCreateKernel morphoDilateVer"); status = clSetKernelArg(rEnv.mpkKernel, 0, @@ -1704,280 +1533,123 @@ pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) 1, sizeof(cl_mem), &pixdCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 2, - sizeof(yp), - (const void *)&yp); - status = clSetKernelArg(rEnv.mpkKernel, - 3, - sizeof(wpl), - (const void *)&wpl); - status = clSetKernelArg(rEnv.mpkKernel, - 4, - sizeof(h), - (const void *)&h); - status = clSetKernelArg(rEnv.mpkKernel, - 5, - sizeof(yn), - (const void *)&yn); - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, - rEnv.mpkKernel, - 2, - NULL, - globalThreads, - localThreads, - 0, - NULL, - NULL); + status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl); + status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h); + status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(yn), &yn); + status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, + nullptr, globalThreads, localThreads, 0, + nullptr, nullptr); } - return status; } //Morphology Erode operation. 
Invokes the relevant OpenCL kernels -cl_int -pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h) -{ +cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h) { + l_int32 xp, yp, xn, yn; + SEL *sel; + size_t globalThreads[2]; + size_t localThreads[2]; + cl_mem pixtemp; + cl_int status; + int gsize; + char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC); + l_uint32 rwmask, lwmask; + char isEven; - l_int32 xp, yp, xn, yn; - SEL* sel; - size_t globalThreads[2]; - size_t localThreads[2]; - cl_mem pixtemp; - cl_int status; - int gsize; - char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC); - l_uint32 rwmask, lwmask; - char isEven; + sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT); - sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT); - - selFindMaxTranslations(sel, &xp, &yp, &xn, &yn); - selDestroy(&sel); - OpenclDevice::SetKernelEnv( &rEnv ); - - if (hsize == 5 && vsize == 5 && isAsymmetric) - { - //Specific kernel for 5x5 - status = pixErodeCL_55(wpl, h); - return status; - } - - rwmask = rmask32[32 - (xp & 31)]; - lwmask = lmask32[32 - (xn & 31)]; - - //global and local work dimensions for Horizontal pass - gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; - globalThreads[0] = gsize; - gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; - globalThreads[1] = gsize; - localThreads[0] = GROUPSIZE_X; - localThreads[1] = GROUPSIZE_Y; - - //Horizontal Pass - if (xp > 31 || xn > 31 ) - { - //Generic case. 
- rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor", &status ); - - status = clSetKernelArg(rEnv.mpkKernel, - 0, - sizeof(cl_mem), - &pixsCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 1, - sizeof(cl_mem), - &pixdCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 2, - sizeof(xp), - (const void *)&xp); - status = clSetKernelArg(rEnv.mpkKernel, - 3, - sizeof(xn), - (const void *)&xn); - status = clSetKernelArg(rEnv.mpkKernel, - 4, - sizeof(wpl), - (const void *)&wpl); - status = clSetKernelArg(rEnv.mpkKernel, - 5, - sizeof(h), - (const void *)&h); - status = clSetKernelArg(rEnv.mpkKernel, - 6, - sizeof(isAsymmetric), - (const void *)&isAsymmetric); - status = clSetKernelArg(rEnv.mpkKernel, - 7, - sizeof(rwmask), - (const void *)&rwmask); - status = clSetKernelArg(rEnv.mpkKernel, - 8, - sizeof(lwmask), - (const void *)&lwmask); - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, - rEnv.mpkKernel, - 2, - NULL, - globalThreads, - localThreads, - 0, - NULL, - NULL); - - if (yp > 0 || yn > 0) - { - pixtemp = pixsCLBuffer; - pixsCLBuffer = pixdCLBuffer; - pixdCLBuffer = pixtemp; - } - } - else if (xp > 0 || xn > 0) - { - rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_32word", &status ); - isEven = (xp != xn); - - status = clSetKernelArg(rEnv.mpkKernel, - 0, - sizeof(cl_mem), - &pixsCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 1, - sizeof(cl_mem), - &pixdCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 2, - sizeof(xp), - (const void *)&xp); - status = clSetKernelArg(rEnv.mpkKernel, - 3, - sizeof(wpl), - (const void *)&wpl); - status = clSetKernelArg(rEnv.mpkKernel, - 4, - sizeof(h), - (const void *)&h); - status = clSetKernelArg(rEnv.mpkKernel, - 5, - sizeof(isAsymmetric), - (const void *)&isAsymmetric); - status = clSetKernelArg(rEnv.mpkKernel, - 6, - sizeof(rwmask), - (const void *)&rwmask); - status = clSetKernelArg(rEnv.mpkKernel, - 7, - sizeof(lwmask), - (const void *)&lwmask); - status = 
clSetKernelArg(rEnv.mpkKernel, - 8, - sizeof(isEven), - (const void *)&isEven); - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, - rEnv.mpkKernel, - 2, - NULL, - globalThreads, - localThreads, - 0, - NULL, - NULL); - - if (yp > 0 || yn > 0) - { - pixtemp = pixsCLBuffer; - pixsCLBuffer = pixdCLBuffer; - pixdCLBuffer = pixtemp; - } - } - - //Vertical Pass - if (yp > 0 || yn > 0) - { - rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer", &status ); - - status = clSetKernelArg(rEnv.mpkKernel, - 0, - sizeof(cl_mem), - &pixsCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 1, - sizeof(cl_mem), - &pixdCLBuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 2, - sizeof(yp), - (const void *)&yp); - status = clSetKernelArg(rEnv.mpkKernel, - 3, - sizeof(wpl), - (const void *)&wpl); - status = clSetKernelArg(rEnv.mpkKernel, - 4, - sizeof(h), - (const void *)&h); - status = clSetKernelArg(rEnv.mpkKernel, - 5, - sizeof(isAsymmetric), - (const void *)&isAsymmetric); - status = clSetKernelArg(rEnv.mpkKernel, - 6, - sizeof(yn), - (const void *)&yn); - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, - rEnv.mpkKernel, - 2, - NULL, - globalThreads, - localThreads, - 0, - NULL, - NULL); - } + selFindMaxTranslations(sel, &xp, &yp, &xn, &yn); + selDestroy(&sel); + OpenclDevice::SetKernelEnv(&rEnv); + if (hsize == 5 && vsize == 5 && isAsymmetric) { + // Specific kernel for 5x5 + status = pixErodeCL_55(wpl, h); return status; -} + } -// OpenCL implementation of Morphology Dilate -//Note: Assumes the source and dest opencl buffer are initialized. 
No check done -PIX* -OpenclDevice::pixDilateBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy = false) -{ - l_uint32 wpl, h; + lwmask = lmask32[31 - (xn & 31)]; + rwmask = rmask32[31 - (xp & 31)]; - wpl = pixGetWpl(pixs); - h = pixGetHeight(pixs); + // global and local work dimensions for Horizontal pass + gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X; + globalThreads[0] = gsize; + gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y; + globalThreads[1] = gsize; + localThreads[0] = GROUPSIZE_X; + localThreads[1] = GROUPSIZE_Y; - clStatus = pixDilateCL(hsize, vsize, wpl, h); + // Horizontal Pass + if (xp > 31 || xn > 31) { + // Generic case. + rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor", &status); - if (reqDataCopy) - { - pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ, false); + status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn); + status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl); + status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h); + status = + clSetKernelArg(rEnv.mpkKernel, 6, sizeof(isAsymmetric), &isAsymmetric); + status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(rwmask), &rwmask); + status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(lwmask), &lwmask); + status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, + nullptr, globalThreads, localThreads, 0, + nullptr, nullptr); + + if (yp > 0 || yn > 0) { + pixtemp = pixsCLBuffer; + pixsCLBuffer = pixdCLBuffer; + pixdCLBuffer = pixtemp; } + } else if (xp > 0 || xn > 0) { + rEnv.mpkKernel = + clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_32word", &status); + isEven = (xp != xn); - return pixd; -} + status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), 
&pixsCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl); + status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h); + status = + clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric); + status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(rwmask), &rwmask); + status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(lwmask), &lwmask); + status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(isEven), &isEven); + status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, + nullptr, globalThreads, localThreads, 0, + nullptr, nullptr); -// OpenCL implementation of Morphology Erode -//Note: Assumes the source and dest opencl buffer are initialized. No check done -PIX* -OpenclDevice::pixErodeBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy = false) -{ - l_uint32 wpl, h; - - wpl = pixGetWpl(pixs); - h = pixGetHeight(pixs); - - clStatus = pixErodeCL(hsize, vsize, wpl, h); - - if (reqDataCopy) - { - pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ); + if (yp > 0 || yn > 0) { + pixtemp = pixsCLBuffer; + pixsCLBuffer = pixdCLBuffer; + pixdCLBuffer = pixtemp; } + } - return pixd; + // Vertical Pass + if (yp > 0 || yn > 0) { + rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeVer", &status); + CHECK_OPENCL(status, "clCreateKernel morphoErodeVer"); + + status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer); + status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl); + status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h); + status = + clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric); + status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(yn), &yn); + status 
= clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, + nullptr, globalThreads, localThreads, 0, + nullptr, nullptr); + } + + return status; } //Morphology Open operation. Invokes the relevant OpenCL kernels @@ -2018,54 +1690,6 @@ pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h) return status; } -// OpenCL implementation of Morphology Close -//Note: Assumes the source and dest opencl buffer are initialized. No check done -PIX* -OpenclDevice::pixCloseBrickCL(PIX *pixd, - PIX *pixs, - l_int32 hsize, - l_int32 vsize, - bool reqDataCopy = false) -{ - l_uint32 wpl, h; - - wpl = pixGetWpl(pixs); - h = pixGetHeight(pixs); - - clStatus = pixCloseCL(hsize, vsize, wpl, h); - - if (reqDataCopy) - { - pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ); - } - - return pixd; -} - -// OpenCL implementation of Morphology Open -//Note: Assumes the source and dest opencl buffer are initialized. No check done -PIX* -OpenclDevice::pixOpenBrickCL(PIX *pixd, - PIX *pixs, - l_int32 hsize, - l_int32 vsize, - bool reqDataCopy = false) -{ - l_uint32 wpl, h; - - wpl = pixGetWpl(pixs); - h = pixGetHeight(pixs); - - clStatus = pixOpenCL(hsize, vsize, wpl, h); - - if (reqDataCopy) - { - pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ); - } - - return pixd; -} - //pix OR operation: outbuffer = buffer1 | buffer2 cl_int pixORCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer) @@ -2081,6 +1705,7 @@ pixORCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem ou globalThreads[1] = gsize; rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixOR", &status ); + CHECK_OPENCL(status, "clCreateKernel pixOR"); status = clSetKernelArg(rEnv.mpkKernel, 0, @@ -2094,368 +1719,211 @@ pixORCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem ou 2, sizeof(cl_mem), &outbuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 3, - sizeof(wpl), - (const void *)&wpl); - status 
= clSetKernelArg(rEnv.mpkKernel, - 4, - sizeof(h), - (const void *)&h); - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, - rEnv.mpkKernel, - 2, - NULL, - globalThreads, - localThreads, - 0, - NULL, - NULL); - - return status; -} - -//pix AND operation: outbuffer = buffer1 & buffer2 -cl_int -pixANDCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outbuffer) -{ - cl_int status; - size_t globalThreads[2]; - int gsize; - size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y}; - - gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; - globalThreads[0] = gsize; - gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; - globalThreads[1] = gsize; - - rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixAND", &status ); - - // Enqueue a kernel run call. - status = clSetKernelArg(rEnv.mpkKernel, - 0, - sizeof(cl_mem), - &buffer1); - status = clSetKernelArg(rEnv.mpkKernel, - 1, - sizeof(cl_mem), - &buffer2); - status = clSetKernelArg(rEnv.mpkKernel, - 2, - sizeof(cl_mem), - &outbuffer); - status = clSetKernelArg(rEnv.mpkKernel, - 3, - sizeof(wpl), - (const void *)&wpl); - status = clSetKernelArg(rEnv.mpkKernel, - 4, - sizeof(h), - (const void *)&h); - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, - rEnv.mpkKernel, - 2, - NULL, - globalThreads, - localThreads, - 0, - NULL, - NULL); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl); + status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h); + status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, + nullptr, globalThreads, localThreads, 0, + nullptr, nullptr); return status; } //output = buffer1 & ~(buffer2) -cl_int -pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, cl_mem buffer2, cl_mem outBuffer = NULL) -{ - cl_int status; - size_t globalThreads[2]; - int gsize; - size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y}; +cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1, + cl_mem buffer2, cl_mem outBuffer = nullptr) { + cl_int status; + 
size_t globalThreads[2]; + int gsize; + size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y}; - gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X; - globalThreads[0] = gsize; - gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y; - globalThreads[1] = gsize; + gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X; + globalThreads[0] = gsize; + gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y; + globalThreads[1] = gsize; - if (outBuffer != NULL) - { - rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract", &status ); - } - else - { - rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "pixSubtract_inplace", &status ); - } + if (outBuffer != nullptr) { + rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "pixSubtract", &status); + CHECK_OPENCL(status, "clCreateKernel pixSubtract"); + } else { + rEnv.mpkKernel = + clCreateKernel(rEnv.mpkProgram, "pixSubtract_inplace", &status); + CHECK_OPENCL(status, "clCreateKernel pixSubtract_inplace"); + } - // Enqueue a kernel run call. - status = clSetKernelArg(rEnv.mpkKernel, - 0, - sizeof(cl_mem), - &buffer1); - status = clSetKernelArg(rEnv.mpkKernel, - 1, - sizeof(cl_mem), - &buffer2); - status = clSetKernelArg(rEnv.mpkKernel, - 2, - sizeof(wpl), - (const void *)&wpl); - status = clSetKernelArg(rEnv.mpkKernel, - 3, - sizeof(h), - (const void *)&h); - if (outBuffer != NULL) - { - status = clSetKernelArg(rEnv.mpkKernel, - 4, - sizeof(cl_mem), - (const void *)&outBuffer); - } - status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, - rEnv.mpkKernel, - 2, - NULL, - globalThreads, - localThreads, - 0, - NULL, - NULL); + // Enqueue a kernel run call. 
+ status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &buffer1); + status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &buffer2); + status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl); + status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h); + if (outBuffer != nullptr) { + status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outBuffer); + } + status = + clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr, + globalThreads, localThreads, 0, nullptr, nullptr); - return status; -} - -// OpenCL implementation of Subtract pix -//Note: Assumes the source and dest opencl buffer are initialized. No check done -PIX* -OpenclDevice::pixSubtractCL(PIX *pixd, PIX *pixs1, PIX *pixs2, bool reqDataCopy = false) -{ - l_uint32 wpl, h; - - PROCNAME("pixSubtractCL"); - - if (!pixs1) - return (PIX *)ERROR_PTR("pixs1 not defined", procName, pixd); - if (!pixs2) - return (PIX *)ERROR_PTR("pixs2 not defined", procName, pixd); - if (pixGetDepth(pixs1) != pixGetDepth(pixs2)) - return (PIX *)ERROR_PTR("depths of pixs* unequal", procName, pixd); - -#if EQUAL_SIZE_WARNING - if (!pixSizesEqual(pixs1, pixs2)) - L_WARNING("pixs1 and pixs2 not equal sizes", procName); -#endif /* EQUAL_SIZE_WARNING */ - - wpl = pixGetWpl(pixs1); - h = pixGetHeight(pixs1); - - clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer); - - if (reqDataCopy) - { - //Read back output data from OCL buffer to cpu - pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs1, wpl*h, CL_MAP_READ); - } - - return pixd; -} - -// OpenCL implementation of Hollow pix -//Note: Assumes the source and dest opencl buffer are initialized. 
No check done -PIX* -OpenclDevice::pixHollowCL(PIX *pixd, - PIX *pixs, - l_int32 close_hsize, - l_int32 close_vsize, - l_int32 open_hsize, - l_int32 open_vsize, - bool reqDataCopy = false) -{ - l_uint32 wpl, h; - cl_mem pixtemp; - - wpl = pixGetWpl(pixs); - h = pixGetHeight(pixs); - - //First step : Close Morph operation: Dilate followed by Erode - clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h); - - //Store the output of close operation in an intermediate buffer - //this will be later used for pixsubtract - clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL); - - //Second step: Open Operation - Erode followed by Dilate - pixtemp = pixsCLBuffer; - pixsCLBuffer = pixdCLBuffer; - pixdCLBuffer = pixtemp; - - clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h); - - //Third step: Subtract : (Close - Open) - pixtemp = pixsCLBuffer; - pixsCLBuffer = pixdCLBuffer; - pixdCLBuffer = pixdCLIntermediate; - pixdCLIntermediate = pixtemp; - - clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer); - - if (reqDataCopy) - { - //Read back output data from OCL buffer to cpu - pixd = mapOutputCLBuffer(rEnv, pixdCLBuffer, pixd, pixs, wpl*h, CL_MAP_READ); - } - return pixd; + return status; } // OpenCL implementation of Get Lines from pix function //Note: Assumes the source and dest opencl buffer are initialized. 
No check done -void -OpenclDevice::pixGetLinesCL(PIX *pixd, - PIX *pixs, - PIX** pix_vline, - PIX** pix_hline, - PIX** pixClosed, - bool getpixClosed, - l_int32 close_hsize, l_int32 close_vsize, - l_int32 open_hsize, l_int32 open_vsize, - l_int32 line_hsize, l_int32 line_vsize) -{ - l_uint32 wpl, h; - cl_mem pixtemp; +void OpenclDevice::pixGetLinesCL(PIX *pixd, PIX *pixs, PIX **pix_vline, + PIX **pix_hline, PIX **pixClosed, + bool getpixClosed, l_int32 close_hsize, + l_int32 close_vsize, l_int32 open_hsize, + l_int32 open_vsize, l_int32 line_hsize, + l_int32 line_vsize) { + l_uint32 wpl, h; + cl_mem pixtemp; - wpl = pixGetWpl(pixs); - h = pixGetHeight(pixs); + wpl = pixGetWpl(pixs); + h = pixGetHeight(pixs); - //First step : Close Morph operation: Dilate followed by Erode - clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h); + // First step : Close Morph operation: Dilate followed by Erode + clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h); - //Copy the Close output to CPU buffer - if (getpixClosed) - { - *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs, wpl*h, CL_MAP_READ, true, false); - } + // Copy the Close output to CPU buffer + if (getpixClosed) { + *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs, + wpl * h, CL_MAP_READ, true, false); + } - //Store the output of close operation in an intermediate buffer - //this will be later used for pixsubtract - clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL); + // Store the output of close operation in an intermediate buffer + // this will be later used for pixsubtract + clStatus = + clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, + 0, sizeof(int) * wpl * h, 0, nullptr, nullptr); - //Second step: Open Operation - Erode followed by Dilate - pixtemp = pixsCLBuffer; - pixsCLBuffer = pixdCLBuffer; - pixdCLBuffer = pixtemp; + // Second step: Open Operation - Erode followed 
by Dilate + pixtemp = pixsCLBuffer; + pixsCLBuffer = pixdCLBuffer; + pixdCLBuffer = pixtemp; - clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h); + clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h); - //Third step: Subtract : (Close - Open) - pixtemp = pixsCLBuffer; - pixsCLBuffer = pixdCLBuffer; - pixdCLBuffer = pixdCLIntermediate; - pixdCLIntermediate = pixtemp; + // Third step: Subtract : (Close - Open) + pixtemp = pixsCLBuffer; + pixsCLBuffer = pixdCLBuffer; + pixdCLBuffer = pixdCLIntermediate; + pixdCLIntermediate = pixtemp; - clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer); + clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer); - //Store the output of Hollow operation in an intermediate buffer - //this will be later used - clStatus = clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, 0, sizeof(int) * wpl*h, 0, NULL, NULL); + // Store the output of Hollow operation in an intermediate buffer + // this will be later used + clStatus = + clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0, + 0, sizeof(int) * wpl * h, 0, nullptr, nullptr); - pixtemp = pixsCLBuffer; - pixsCLBuffer = pixdCLBuffer; - pixdCLBuffer = pixtemp; + pixtemp = pixsCLBuffer; + pixsCLBuffer = pixdCLBuffer; + pixdCLBuffer = pixtemp; - //Fourth step: Get vertical line - //pixOpenBrick(NULL, pix_hollow, 1, min_line_length); - clStatus = pixOpenCL(1, line_vsize, wpl, h); + // Fourth step: Get vertical line + // pixOpenBrick(nullptr, pix_hollow, 1, min_line_length); + clStatus = pixOpenCL(1, line_vsize, wpl, h); - //Copy the vertical line output to CPU buffer - *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl*h, CL_MAP_READ, true, false); + // Copy the vertical line output to CPU buffer + *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl * h, + CL_MAP_READ, true, false); - pixtemp = pixsCLBuffer; - pixsCLBuffer = pixdCLIntermediate; - pixdCLIntermediate = pixtemp; + 
pixtemp = pixsCLBuffer; + pixsCLBuffer = pixdCLIntermediate; + pixdCLIntermediate = pixtemp; - //Fifth step: Get horizontal line - //pixOpenBrick(NULL, pix_hollow, min_line_length, 1); - clStatus = pixOpenCL(line_hsize, 1, wpl, h); + // Fifth step: Get horizontal line + // pixOpenBrick(nullptr, pix_hollow, min_line_length, 1); + clStatus = pixOpenCL(line_hsize, 1, wpl, h); - //Copy the horizontal line output to CPU buffer - *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl*h, CL_MAP_READ, true, true); + // Copy the horizontal line output to CPU buffer + *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl * h, + CL_MAP_READ, true, true); - return; + return; } - /************************************************************************* * HistogramRect * Otsu Thresholding Operations * histogramAllChannels is laid out as all channel 0, then all channel 1... * only supports 1 or 4 channels (bytes_per_pixel) ************************************************************************/ -int OpenclDevice::HistogramRectOCL( - const unsigned char* imageData, - int bytes_per_pixel, - int bytes_per_line, - int left, // always 0 - int top, // always 0 - int width, - int height, - int kHistogramSize, - int* histogramAllChannels) -{ -PERF_COUNT_START("HistogramRectOCL") - cl_int clStatus; - int retVal= 0; - KernelEnv histKern; - SetKernelEnv( &histKern ); - KernelEnv histRedKern; - SetKernelEnv( &histRedKern ); - /* map imagedata to device as read only */ - // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need. 
- // faster option would be to allocate initial image buffer - // using a garlic bus memory type - cl_mem imageBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer"); +int OpenclDevice::HistogramRectOCL(unsigned char *imageData, + int bytes_per_pixel, int bytes_per_line, + int left, // always 0 + int top, // always 0 + int width, int height, int kHistogramSize, + int *histogramAllChannels) { + PERF_COUNT_START("HistogramRectOCL") + cl_int clStatus; + int retVal = 0; + KernelEnv histKern; + SetKernelEnv(&histKern); + KernelEnv histRedKern; + SetKernelEnv(&histRedKern); + /* map imagedata to device as read only */ + // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be + // coherent which we don't need. + // faster option would be to allocate initial image buffer + // using a garlic bus memory type + cl_mem imageBuffer = clCreateBuffer( + histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, + width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus); + CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer"); - /* setup work group size parameters */ - int block_size = 256; - cl_uint numCUs; - clStatus = clGetDeviceInfo( gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs, NULL); - CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer"); + /* setup work group size parameters */ + int block_size = 256; + cl_uint numCUs; + clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(numCUs), &numCUs, nullptr); + CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer"); - int requestedOccupancy = 10; - int numWorkGroups = numCUs * requestedOccupancy; - int numThreads = block_size*numWorkGroups; - size_t local_work_size[] = {static_cast(block_size)}; - size_t global_work_size[] = {static_cast(numThreads)}; - size_t red_global_work_size[] = 
{static_cast(block_size*kHistogramSize*bytes_per_pixel)}; + int requestedOccupancy = 10; + int numWorkGroups = numCUs * requestedOccupancy; + int numThreads = block_size * numWorkGroups; + size_t local_work_size[] = {static_cast(block_size)}; + size_t global_work_size[] = {static_cast(numThreads)}; + size_t red_global_work_size[] = { + static_cast(block_size * kHistogramSize * bytes_per_pixel)}; - /* map histogramAllChannels as write only */ - int numBins = kHistogramSize*bytes_per_pixel*numWorkGroups; + /* map histogramAllChannels as write only */ + int numBins = kHistogramSize * bytes_per_pixel * numWorkGroups; - cl_mem histogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, kHistogramSize*bytes_per_pixel*sizeof(int), (void *)histogramAllChannels, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer histogramBuffer"); + cl_mem histogramBuffer = clCreateBuffer( + histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, + kHistogramSize * bytes_per_pixel * sizeof(int), histogramAllChannels, + &clStatus); + CHECK_OPENCL(clStatus, "clCreateBuffer histogramBuffer"); - /* intermediate histogram buffer */ - int histRed = 256; - int tmpHistogramBins = kHistogramSize*bytes_per_pixel*histRed; + /* intermediate histogram buffer */ + int histRed = 256; + int tmpHistogramBins = kHistogramSize * bytes_per_pixel * histRed; - cl_mem tmpHistogramBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE, tmpHistogramBins*sizeof(cl_uint), NULL, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer tmpHistogramBuffer"); + cl_mem tmpHistogramBuffer = + clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE, + tmpHistogramBins * sizeof(cl_uint), nullptr, &clStatus); + CHECK_OPENCL(clStatus, "clCreateBuffer tmpHistogramBuffer"); - /* atomic sync buffer */ - int *zeroBuffer = new int[1]; - zeroBuffer[0] = 0; - cl_mem atomicSyncBuffer = clCreateBuffer( histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, sizeof(cl_int), (void 
*)zeroBuffer, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer atomicSyncBuffer"); - delete[] zeroBuffer; - //Create kernel objects based on bytes_per_pixel - if (bytes_per_pixel == 1) - { - histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus ); - CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectOneChannel"); + /* atomic sync buffer */ + int *zeroBuffer = new int[1]; + zeroBuffer[0] = 0; + cl_mem atomicSyncBuffer = clCreateBuffer( + histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR, + sizeof(cl_int), zeroBuffer, &clStatus); + CHECK_OPENCL(clStatus, "clCreateBuffer atomicSyncBuffer"); + delete[] zeroBuffer; + // Create kernel objects based on bytes_per_pixel + if (bytes_per_pixel == 1) { + histKern.mpkKernel = clCreateKernel( + histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus); + CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectOneChannel"); - histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectOneChannelReduction", &clStatus ); - CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectOneChannelReduction"); - } else { + histRedKern.mpkKernel = + clCreateKernel(histRedKern.mpkProgram, + "kernel_HistogramRectOneChannelReduction", &clStatus); + CHECK_OPENCL(clStatus, + "clCreateKernel kernel_HistogramRectOneChannelReduction"); + } else { histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus ); CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannels"); @@ -2466,72 +1934,77 @@ PERF_COUNT_START("HistogramRectOCL") void *ptr; //Initialize tmpHistogramBuffer buffer - ptr = clEnqueueMapBuffer(histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0, tmpHistogramBins*sizeof(cl_uint), 0, NULL, NULL, &clStatus); + ptr = clEnqueueMapBuffer( + histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0, + tmpHistogramBins * sizeof(cl_uint), 0, nullptr, 
nullptr, &clStatus); CHECK_OPENCL( clStatus, "clEnqueueMapBuffer tmpHistogramBuffer"); memset(ptr, 0, tmpHistogramBins*sizeof(cl_uint)); - clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0, NULL, NULL); + clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0, + nullptr, nullptr); /* set kernel 1 arguments */ - clStatus = clSetKernelArg( histKern.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer ); + clStatus = + clSetKernelArg(histKern.mpkKernel, 0, sizeof(cl_mem), &imageBuffer); CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer"); cl_uint numPixels = width*height; - clStatus = clSetKernelArg( histKern.mpkKernel, 1, sizeof(cl_uint), (void *)&numPixels ); + clStatus = + clSetKernelArg(histKern.mpkKernel, 1, sizeof(cl_uint), &numPixels); CHECK_OPENCL( clStatus, "clSetKernelArg numPixels" ); - clStatus = clSetKernelArg( histKern.mpkKernel, 2, sizeof(cl_mem), (void *)&tmpHistogramBuffer ); + clStatus = clSetKernelArg(histKern.mpkKernel, 2, sizeof(cl_mem), + &tmpHistogramBuffer); CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer"); /* set kernel 2 arguments */ int n = numThreads/bytes_per_pixel; - clStatus = clSetKernelArg( histRedKern.mpkKernel, 0, sizeof(cl_int), (void *)&n ); + clStatus = clSetKernelArg(histRedKern.mpkKernel, 0, sizeof(cl_int), &n); CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer"); - clStatus = clSetKernelArg( histRedKern.mpkKernel, 1, sizeof(cl_mem), (void *)&tmpHistogramBuffer ); + clStatus = clSetKernelArg(histRedKern.mpkKernel, 1, sizeof(cl_mem), + &tmpHistogramBuffer); CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer"); - clStatus = clSetKernelArg( histRedKern.mpkKernel, 2, sizeof(cl_mem), (void *)&histogramBuffer ); + clStatus = clSetKernelArg(histRedKern.mpkKernel, 2, sizeof(cl_mem), + &histogramBuffer); CHECK_OPENCL( clStatus, "clSetKernelArg histogramBuffer"); /* launch histogram */ PERF_COUNT_SUB("before") - clStatus = clEnqueueNDRangeKernel( - histKern.mpkCmdQueue, - 
histKern.mpkKernel, - 1, NULL, global_work_size, local_work_size, - 0, NULL, NULL ); - CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels" ); - clFinish( histKern.mpkCmdQueue ); - if(clStatus !=0) - { - retVal = -1; - } +clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1, + nullptr, global_work_size, local_work_size, 0, + nullptr, nullptr); +CHECK_OPENCL(clStatus, + "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels"); +clFinish(histKern.mpkCmdQueue); +if (clStatus != 0) { + retVal = -1; + } /* launch histogram */ clStatus = clEnqueueNDRangeKernel( - histRedKern.mpkCmdQueue, - histRedKern.mpkKernel, - 1, NULL, red_global_work_size, local_work_size, - 0, NULL, NULL ); + histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1, nullptr, + red_global_work_size, local_work_size, 0, nullptr, nullptr); CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" ); clFinish( histRedKern.mpkCmdQueue ); - if(clStatus !=0) - { - retVal = -1; - } -PERF_COUNT_SUB("redKernel") + if (clStatus != 0) { + retVal = -1; + } + PERF_COUNT_SUB("redKernel") /* map results back from gpu */ - ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, CL_MAP_READ, 0, kHistogramSize*bytes_per_pixel*sizeof(int), 0, NULL, NULL, &clStatus); + ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE, + CL_MAP_READ, 0, + kHistogramSize * bytes_per_pixel * sizeof(int), 0, + nullptr, nullptr, &clStatus); CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer"); - if(clStatus !=0) - { - retVal = -1; - } - clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0, NULL, NULL); + if (clStatus != 0) { + retVal = -1; + } + clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0, + nullptr, nullptr); clReleaseMemObject(histogramBuffer); clReleaseMemObject(imageBuffer); PERF_COUNT_SUB("after") PERF_COUNT_END - return retVal; - +return retVal; } 
/************************************************************************* @@ -2539,111 +2012,118 @@ PERF_COUNT_END * from the class, using thresholds/hi_values to the output IMAGE. * only supports 1 or 4 channels ************************************************************************/ -int OpenclDevice::ThresholdRectToPixOCL( - const unsigned char* imageData, - int bytes_per_pixel, - int bytes_per_line, - const int* thresholds, - const int* hi_values, - Pix** pix, - int height, - int width, - int top, - int left) { -PERF_COUNT_START("ThresholdRectToPixOCL") - int retVal =0; - /* create pix result buffer */ - *pix = pixCreate(width, height, 1); - uinT32* pixData = pixGetData(*pix); - int wpl = pixGetWpl(*pix); - int pixSize = wpl*height*sizeof(uinT32); // number of pixels +int OpenclDevice::ThresholdRectToPixOCL(unsigned char *imageData, + int bytes_per_pixel, int bytes_per_line, + int *thresholds, int *hi_values, + Pix **pix, int height, int width, + int top, int left) { + PERF_COUNT_START("ThresholdRectToPixOCL") + int retVal = 0; + /* create pix result buffer */ + *pix = pixCreate(width, height, 1); + uint32_t *pixData = pixGetData(*pix); + int wpl = pixGetWpl(*pix); + int pixSize = wpl * height * sizeof(uint32_t); // number of pixels - cl_int clStatus; - KernelEnv rEnv; - SetKernelEnv( &rEnv ); + cl_int clStatus; + KernelEnv rEnv; + SetKernelEnv(&rEnv); - /* setup work group size parameters */ - int block_size = 256; - cl_uint numCUs = 6; - clStatus = clGetDeviceInfo( gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, sizeof(numCUs), &numCUs, NULL); - CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer"); + /* setup work group size parameters */ + int block_size = 256; + cl_uint numCUs = 6; + clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS, + sizeof(numCUs), &numCUs, nullptr); + CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer"); - int requestedOccupancy = 10; - int numWorkGroups = numCUs * requestedOccupancy; - int numThreads = 
block_size*numWorkGroups; - size_t local_work_size[] = {(size_t) block_size}; - size_t global_work_size[] = {(size_t) numThreads}; + int requestedOccupancy = 10; + int numWorkGroups = numCUs * requestedOccupancy; + int numThreads = block_size * numWorkGroups; + size_t local_work_size[] = {(size_t)block_size}; + size_t global_work_size[] = {(size_t)numThreads}; - /* map imagedata to device as read only */ - // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be coherent which we don't need. - // faster option would be to allocate initial image buffer - // using a garlic bus memory type - cl_mem imageBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, width*height*bytes_per_pixel*sizeof(char), (void *)imageData, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer imageBuffer"); + /* map imagedata to device as read only */ + // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be + // coherent which we don't need. + // faster option would be to allocate initial image buffer + // using a garlic bus memory type + cl_mem imageBuffer = clCreateBuffer( + rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, + width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus); + CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer"); - /* map pix as write only */ - pixThBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, pixSize, (void *)pixData, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer pix"); + /* map pix as write only */ + pixThBuffer = + clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR, + pixSize, pixData, &clStatus); + CHECK_OPENCL(clStatus, "clCreateBuffer pix"); - /* map thresholds and hi_values */ - cl_mem thresholdsBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)thresholds, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer thresholdBuffer"); - 
cl_mem hiValuesBuffer = clCreateBuffer( rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, bytes_per_pixel*sizeof(int), (void *)hi_values, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer hiValuesBuffer"); + /* map thresholds and hi_values */ + cl_mem thresholdsBuffer = + clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, + bytes_per_pixel * sizeof(int), thresholds, &clStatus); + CHECK_OPENCL(clStatus, "clCreateBuffer thresholdBuffer"); + cl_mem hiValuesBuffer = + clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, + bytes_per_pixel * sizeof(int), hi_values, &clStatus); + CHECK_OPENCL(clStatus, "clCreateBuffer hiValuesBuffer"); - /* compile kernel */ - if (bytes_per_pixel == 4) { - rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus ); - CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix"); - } else { - rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus ); - CHECK_OPENCL( clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan"); - } + /* compile kernel */ + if (bytes_per_pixel == 4) { + rEnv.mpkKernel = + clCreateKernel(rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus); + CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix"); + } else { + rEnv.mpkKernel = clCreateKernel( + rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus); + CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan"); + } - /* set kernel arguments */ - clStatus = clSetKernelArg( rEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&imageBuffer ); - CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer"); - cl_uint numPixels = width*height; - clStatus = clSetKernelArg( rEnv.mpkKernel, 1, sizeof(int), (void *)&height ); - CHECK_OPENCL( clStatus, "clSetKernelArg height" ); - clStatus = clSetKernelArg( rEnv.mpkKernel, 2, sizeof(int), (void *)&width ); - CHECK_OPENCL( clStatus, "clSetKernelArg width" ); - 
clStatus = clSetKernelArg( rEnv.mpkKernel, 3, sizeof(int), (void *)&wpl ); - CHECK_OPENCL( clStatus, "clSetKernelArg wpl" ); - clStatus = clSetKernelArg( rEnv.mpkKernel, 4, sizeof(cl_mem), (void *)&thresholdsBuffer ); - CHECK_OPENCL( clStatus, "clSetKernelArg thresholdsBuffer" ); - clStatus = clSetKernelArg( rEnv.mpkKernel, 5, sizeof(cl_mem), (void *)&hiValuesBuffer ); - CHECK_OPENCL( clStatus, "clSetKernelArg hiValuesBuffer" ); - clStatus = clSetKernelArg( rEnv.mpkKernel, 6, sizeof(cl_mem), (void *)&pixThBuffer ); - CHECK_OPENCL( clStatus, "clSetKernelArg pixThBuffer"); + /* set kernel arguments */ + clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &imageBuffer); + CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer"); + cl_uint numPixels = width * height; + clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(int), &height); + CHECK_OPENCL(clStatus, "clSetKernelArg height"); + clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(int), &width); + CHECK_OPENCL(clStatus, "clSetKernelArg width"); + clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(int), &wpl); + CHECK_OPENCL(clStatus, "clSetKernelArg wpl"); + clStatus = + clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &thresholdsBuffer); + CHECK_OPENCL(clStatus, "clSetKernelArg thresholdsBuffer"); + clStatus = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(cl_mem), &hiValuesBuffer); + CHECK_OPENCL(clStatus, "clSetKernelArg hiValuesBuffer"); + clStatus = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(cl_mem), &pixThBuffer); + CHECK_OPENCL(clStatus, "clSetKernelArg pixThBuffer"); - /* launch kernel & wait */ -PERF_COUNT_SUB("before") - clStatus = clEnqueueNDRangeKernel( - rEnv.mpkCmdQueue, - rEnv.mpkKernel, - 1, NULL, global_work_size, local_work_size, - 0, NULL, NULL ); - CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix" ); - clFinish( rEnv.mpkCmdQueue ); -PERF_COUNT_SUB("kernel") - if(clStatus !=0) - { - printf("Setting return value to -1\n"); - retVal = -1; - } - /* map results back from 
gpu */ - void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, pixSize, 0, NULL, NULL, &clStatus); - CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer"); - clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, NULL, NULL); + /* launch kernel & wait */ + PERF_COUNT_SUB("before") + clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 1, + nullptr, global_work_size, local_work_size, + 0, nullptr, nullptr); + CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix"); + clFinish(rEnv.mpkCmdQueue); + PERF_COUNT_SUB("kernel") + if (clStatus != 0) { + printf("Setting return value to -1\n"); + retVal = -1; + } + /* map results back from gpu */ + void *ptr = + clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0, + pixSize, 0, nullptr, nullptr, &clStatus); + CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer"); + clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, nullptr, + nullptr); - clReleaseMemObject(imageBuffer); - clReleaseMemObject(thresholdsBuffer); - clReleaseMemObject(hiValuesBuffer); + clReleaseMemObject(imageBuffer); + clReleaseMemObject(thresholdsBuffer); + clReleaseMemObject(hiValuesBuffer); -PERF_COUNT_SUB("after") -PERF_COUNT_END -return retVal; + PERF_COUNT_SUB("after") + PERF_COUNT_END + return retVal; } @@ -2720,7 +2200,6 @@ void populateTessScoreEvaluationInputData( TessScoreEvaluationInputData *input ) float fractionBlack = 0.1; // how much of the image should be blackened int numSpots = (height*width)*fractionBlack/(maxLineWidth*maxLineWidth/2/2); for (int i = 0; i < numSpots; i++) { - int lineWidth = rand()%maxLineWidth; int col = lineWidth + rand()%(width-2*lineWidth); int row = lineWidth + rand()%(height-2*lineWidth); @@ -2751,15 +2230,14 @@ typedef struct _TessDeviceScore { *****************************************************************************/ double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData 
input, ds_device_type type ) { - double time = 0; #if ON_WINDOWS LARGE_INTEGER freq, time_funct_start, time_funct_end; QueryPerformanceFrequency(&freq); #elif ON_APPLE - mach_timebase_info_data_t info = { 0, 0 }; + mach_timebase_info_data_t info = {0, 0}; mach_timebase_info(&info); - long long start,stop; + long long start, stop; #else timespec time_funct_start, time_funct_end; #endif @@ -2770,21 +2248,22 @@ double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData inpu if (type == DS_DEVICE_OPENCL_DEVICE) { #if ON_WINDOWS QueryPerformanceCounter(&time_funct_start); -#elif ON_APPLE - start = mach_absolute_time(); +#elif ON_APPLE + start = mach_absolute_time(); #else clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); #endif OpenclDevice::gpuEnv = *env; int wpl = pixGetWpl(input.pix); - OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height, wpl, NULL); + OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height, + wpl, nullptr); #if ON_WINDOWS QueryPerformanceCounter(&time_funct_end); time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); -#elif ON_APPLE - stop = mach_absolute_time(); - time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9; +#elif ON_APPLE + stop = mach_absolute_time(); + time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; #else clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; @@ -2793,8 +2272,8 @@ double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData inpu } else { #if ON_WINDOWS QueryPerformanceCounter(&time_funct_start); -#elif ON_APPLE - start = mach_absolute_time(); +#elif ON_APPLE + start = mach_absolute_time(); #else clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); #endif @@ -2807,7 +2286,6 @@ double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData inpu int idx = 
0; for (i = 0; i < input.height ; i++) { for (j = 0; j < input.width; j++) { - l_uint32 tiffword = tiffdata[i * input.width + j]; l_int32 rval = ((tiffword) & 0xff); l_int32 gval = (((tiffword) >> 8) & 0xff); @@ -2820,9 +2298,9 @@ double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData inpu #if ON_WINDOWS QueryPerformanceCounter(&time_funct_end); time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); -#elif ON_APPLE - stop = mach_absolute_time(); - time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9; +#elif ON_APPLE + stop = mach_absolute_time(); + time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; #else clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; @@ -2837,15 +2315,14 @@ double composeRGBPixelMicroBench( GPUEnv *env, TessScoreEvaluationInputData inpu } double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) { - double time; #if ON_WINDOWS LARGE_INTEGER freq, time_funct_start, time_funct_end; QueryPerformanceFrequency(&freq); -#elif ON_APPLE - mach_timebase_info_data_t info = { 0, 0 }; +#elif ON_APPLE + mach_timebase_info_data_t info = {0, 0}; mach_timebase_info(&info); - long long start,stop; + long long start, stop; #else timespec time_funct_start, time_funct_end; #endif @@ -2857,58 +2334,56 @@ double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, int kHistogramSize = 256; int bytes_per_line = input.width*input.numChannels; int *histogramAllChannels = new int[kHistogramSize*input.numChannels]; - int retVal= 0; + int retVal = 0; // function call if (type == DS_DEVICE_OPENCL_DEVICE) { #if ON_WINDOWS QueryPerformanceCounter(&time_funct_start); -#elif ON_APPLE - start = mach_absolute_time(); +#elif ON_APPLE + start = mach_absolute_time(); #else clock_gettime( CLOCK_MONOTONIC, 
&time_funct_start ); #endif OpenclDevice::gpuEnv = *env; int wpl = pixGetWpl(input.pix); - retVal= OpenclDevice::HistogramRectOCL(input.imageData, input.numChannels, bytes_per_line, top, left, input.width, input.height, kHistogramSize, histogramAllChannels); + retVal = OpenclDevice::HistogramRectOCL( + input.imageData, input.numChannels, bytes_per_line, top, left, + input.width, input.height, kHistogramSize, histogramAllChannels); #if ON_WINDOWS QueryPerformanceCounter(&time_funct_end); time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); -#elif ON_APPLE - stop = mach_absolute_time(); - if(retVal ==0) - { - time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9; - } - else - { - time= FLT_MAX; - } +#elif ON_APPLE + stop = mach_absolute_time(); + if (retVal == 0) { + time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; + } else { + time = FLT_MAX; + } #else clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; #endif } else { - int *histogram = new int[kHistogramSize]; #if ON_WINDOWS QueryPerformanceCounter(&time_funct_start); -#elif ON_APPLE - start = mach_absolute_time(); +#elif ON_APPLE + start = mach_absolute_time(); #else clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); #endif for (int ch = 0; ch < input.numChannels; ++ch) { - tesseract::HistogramRect(input.pix, input.numChannels, - left, top, input.width, input.height, histogram); + tesseract::HistogramRect(input.pix, input.numChannels, left, top, + input.width, input.height, histogram); } #if ON_WINDOWS QueryPerformanceCounter(&time_funct_end); time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); -#elif ON_APPLE - stop = mach_absolute_time(); - time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9; +#elif ON_APPLE + stop = mach_absolute_time(); + time = ((stop - start) * 
(double)info.numer / info.denom) / 1.0E9; #else clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; @@ -2934,13 +2409,13 @@ void ThresholdRectToPix_Native(const unsigned char* imagedata, int height = pixGetHeight(*pix); *pix = pixCreate(width, height, 1); - uinT32* pixdata = pixGetData(*pix); + uint32_t *pixdata = pixGetData(*pix); int wpl = pixGetWpl(*pix); const unsigned char* srcdata = imagedata + top * bytes_per_line + left * bytes_per_pixel; for (int y = 0; y < height; ++y) { - const uinT8* linedata = srcdata; - uinT32* pixline = pixdata + y * wpl; + const uint8_t *linedata = srcdata; + uint32_t *pixline = pixdata + y * wpl; for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) { bool white_result = true; for (int ch = 0; ch < bytes_per_pixel; ++ch) { @@ -2960,16 +2435,15 @@ void ThresholdRectToPix_Native(const unsigned char* imagedata, } double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) { - double time; - int retVal =0; + int retVal = 0; #if ON_WINDOWS LARGE_INTEGER freq, time_funct_start, time_funct_end; QueryPerformanceFrequency(&freq); -#elif ON_APPLE - mach_timebase_info_data_t info = { 0, 0 }; +#elif ON_APPLE + mach_timebase_info_data_t info = {0, 0}; mach_timebase_info(&info); - long long start,stop; + long long start, stop; #else timespec time_funct_start, time_funct_end; #endif @@ -2995,29 +2469,29 @@ double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData i if (type == DS_DEVICE_OPENCL_DEVICE) { #if ON_WINDOWS QueryPerformanceCounter(&time_funct_start); -#elif ON_APPLE - start = mach_absolute_time(); +#elif ON_APPLE + start = mach_absolute_time(); #else clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); #endif OpenclDevice::gpuEnv = *env; int wpl = pixGetWpl(input.pix); - retVal= OpenclDevice::ThresholdRectToPixOCL(input.imageData, 
input.numChannels, bytes_per_line, thresholds, hi_values, &input.pix, input.height, input.width, top, left); + retVal = OpenclDevice::ThresholdRectToPixOCL( + input.imageData, input.numChannels, bytes_per_line, thresholds, + hi_values, &input.pix, input.height, input.width, top, left); #if ON_WINDOWS QueryPerformanceCounter(&time_funct_end); time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); -#elif ON_APPLE - stop = mach_absolute_time(); - if(retVal ==0) - { - time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9;; - } - else - { - time= FLT_MAX; - } +#elif ON_APPLE + stop = mach_absolute_time(); + if (retVal == 0) { + time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; + ; + } else { + time = FLT_MAX; + } #else clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); @@ -3030,8 +2504,8 @@ double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData i thresholder.SetImage( input.pix ); #if ON_WINDOWS QueryPerformanceCounter(&time_funct_start); -#elif ON_APPLE - start = mach_absolute_time(); +#elif ON_APPLE + start = mach_absolute_time(); #else clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); #endif @@ -3041,9 +2515,9 @@ double thresholdRectToPixMicroBench( GPUEnv *env, TessScoreEvaluationInputData i #if ON_WINDOWS QueryPerformanceCounter(&time_funct_end); time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); -#elif ON_APPLE - stop = mach_absolute_time(); - time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9; +#elif ON_APPLE + stop = mach_absolute_time(); + time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; #else clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; @@ -3062,10 +2536,10 @@ double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData in #if ON_WINDOWS LARGE_INTEGER freq, 
time_funct_start, time_funct_end; QueryPerformanceFrequency(&freq); -#elif ON_APPLE - mach_timebase_info_data_t info = { 0, 0 }; +#elif ON_APPLE + mach_timebase_info_data_t info = {0, 0}; mach_timebase_info(&info); - long long start,stop; + long long start, stop; #else timespec time_funct_start, time_funct_end; #endif @@ -3083,25 +2557,28 @@ double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData in if (type == DS_DEVICE_OPENCL_DEVICE) { #if ON_WINDOWS QueryPerformanceCounter(&time_funct_start); -#elif ON_APPLE - start = mach_absolute_time(); +#elif ON_APPLE + start = mach_absolute_time(); #else clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); #endif Pix *src_pix = input.pix; OpenclDevice::gpuEnv = *env; OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix); - Pix *pix_vline = NULL, *pix_hline = NULL, *pix_closed = NULL; - OpenclDevice::pixGetLinesCL(NULL, input.pix, &pix_vline, &pix_hline, &pix_closed, true, closing_brick, closing_brick, max_line_width, max_line_width, min_line_length, min_line_length); + Pix *pix_vline = nullptr, *pix_hline = nullptr, *pix_closed = nullptr; + OpenclDevice::pixGetLinesCL( + nullptr, input.pix, &pix_vline, &pix_hline, &pix_closed, true, + closing_brick, closing_brick, max_line_width, max_line_width, + min_line_length, min_line_length); OpenclDevice::releaseMorphCLBuffers(); #if ON_WINDOWS QueryPerformanceCounter(&time_funct_end); time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); -#elif ON_APPLE - stop = mach_absolute_time(); - time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9; +#elif ON_APPLE + stop = mach_absolute_time(); + time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; #else clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; @@ -3109,28 +2586,30 @@ double getLineMasksMorphMicroBench( GPUEnv 
*env, TessScoreEvaluationInputData in } else { #if ON_WINDOWS QueryPerformanceCounter(&time_funct_start); -#elif ON_APPLE - start = mach_absolute_time(); +#elif ON_APPLE + start = mach_absolute_time(); #else clock_gettime( CLOCK_MONOTONIC, &time_funct_start ); #endif // native serial code Pix *src_pix = input.pix; - Pix *pix_closed = pixCloseBrick(NULL, src_pix, closing_brick, closing_brick); - Pix *pix_solid = pixOpenBrick(NULL, pix_closed, max_line_width, max_line_width); - Pix *pix_hollow = pixSubtract(NULL, pix_closed, pix_solid); + Pix *pix_closed = + pixCloseBrick(nullptr, src_pix, closing_brick, closing_brick); + Pix *pix_solid = + pixOpenBrick(nullptr, pix_closed, max_line_width, max_line_width); + Pix *pix_hollow = pixSubtract(nullptr, pix_closed, pix_solid); pixDestroy(&pix_solid); - Pix *pix_vline = pixOpenBrick(NULL, pix_hollow, 1, min_line_length); - Pix *pix_hline = pixOpenBrick(NULL, pix_hollow, min_line_length, 1); + Pix *pix_vline = pixOpenBrick(nullptr, pix_hollow, 1, min_line_length); + Pix *pix_hline = pixOpenBrick(nullptr, pix_hollow, min_line_length, 1); pixDestroy(&pix_hollow); #if ON_WINDOWS QueryPerformanceCounter(&time_funct_end); time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); -#elif ON_APPLE - stop = mach_absolute_time(); - time = ((stop - start) * (double) info.numer / info.denom) / 1.0E9; +#elif ON_APPLE + stop = mach_absolute_time(); + time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9; #else clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; @@ -3148,11 +2627,10 @@ double getLineMasksMorphMicroBench( GPUEnv *env, TessScoreEvaluationInputData in #include "stdlib.h" - // encode score object as byte string ds_status serializeScore( ds_device* device, void **serializedScore, unsigned int* serializedScoreSize ) { *serializedScoreSize = sizeof(TessDeviceScore); - 
*serializedScore = (void *) new unsigned char[*serializedScoreSize]; + *serializedScore = new unsigned char[*serializedScoreSize]; memcpy(*serializedScore, device->score, *serializedScoreSize); return DS_SUCCESS; } @@ -3165,18 +2643,17 @@ ds_status deserializeScore( ds_device* device, const unsigned char* serializedSc return DS_SUCCESS; } -ds_status releaseScore( void* score ) { - delete[] score; +ds_status releaseScore(void *score) { + delete (TessDeviceScore *)score; return DS_SUCCESS; } // evaluate devices ds_status evaluateScoreForDevice( ds_device *device, void *inputData) { - // overwrite statuc gpuEnv w/ current device // so native opencl calls can be used; they use static gpuEnv printf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" ); - GPUEnv *env = NULL; + GPUEnv *env = nullptr; if (device->type == DS_DEVICE_OPENCL_DEVICE) { env = new GPUEnv; //printf("[DS] populating tmp GPUEnv from device\n"); @@ -3208,15 +2685,13 @@ ds_status evaluateScoreForDevice( ds_device *device, void *inputData) { float composeRGBPixelWeight = 1.2f; float histogramRectWeight = 2.4f; float thresholdRectToPixWeight = 4.5f; - float getLineMasksMorphWeight = 5.0f; + float getLineMasksMorphWeight = 5.0f; - float weightedTime = - composeRGBPixelWeight * composeRGBPixelTime + - histogramRectWeight * histogramRectTime + - thresholdRectToPixWeight * thresholdRectToPixTime + - getLineMasksMorphWeight * getLineMasksMorphTime - ; - device->score = (void *)new TessDeviceScore; + float weightedTime = composeRGBPixelWeight * composeRGBPixelTime + + histogramRectWeight * histogramRectTime + + thresholdRectToPixWeight * thresholdRectToPixTime + + getLineMasksMorphWeight * getLineMasksMorphTime; + device->score = new TessDeviceScore; ((TessDeviceScore *)device->score)->time = weightedTime; printf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? 
"OpenCL" : "Native" ); @@ -3231,83 +2706,104 @@ ds_status evaluateScoreForDevice( ds_device *device, void *inputData) { // initial call to select device ds_device OpenclDevice::getDeviceSelection( ) { if (!deviceIsSelected) { -PERF_COUNT_START("getDeviceSelection") - // check if opencl is available at runtime - if( 1 == LoadOpencl() ) { - // opencl is available -//PERF_COUNT_SUB("LoadOpencl") - // setup devices - ds_status status; - ds_profile *profile; - status = initDSProfile( &profile, "v0.1" ); -PERF_COUNT_SUB("initDSProfile") - // try reading scores from file - char *fileName = "tesseract_opencl_profile_devices.dat"; - status = readProfileFromFile( profile, deserializeScore, fileName); - if (status != DS_SUCCESS) { - // need to run evaluation - printf("[DS] Profile file not available (%s); performing profiling.\n", fileName); + PERF_COUNT_START("getDeviceSelection") + // check if opencl is available at runtime + if (1 == LoadOpencl()) { + // opencl is available + // PERF_COUNT_SUB("LoadOpencl") + // setup devices + ds_status status; + ds_profile *profile; + status = initDSProfile(&profile, "v0.1"); + PERF_COUNT_SUB("initDSProfile") + // try reading scores from file + const char *fileName = "tesseract_opencl_profile_devices.dat"; + status = readProfileFromFile(profile, deserializeScore, fileName); + if (status != DS_SUCCESS) { + // need to run evaluation + printf("[DS] Profile file not available (%s); performing profiling.\n", + fileName); - // create input data - TessScoreEvaluationInputData input; - populateTessScoreEvaluationInputData( &input ); -//PERF_COUNT_SUB("populateTessScoreEvaluationInputData") - // perform evaluations - unsigned int numUpdates; - status = profileDevices( profile, DS_EVALUATE_ALL, evaluateScoreForDevice, (void *)&input, &numUpdates ); -PERF_COUNT_SUB("profileDevices") - // write scores to file - if ( status == DS_SUCCESS ) { - status = writeProfileToFile( profile, serializeScore, fileName); -PERF_COUNT_SUB("writeProfileToFile") - if 
( status == DS_SUCCESS ) { - printf("[DS] Scores written to file (%s).\n", fileName); + // create input data + TessScoreEvaluationInputData input; + populateTessScoreEvaluationInputData(&input); + // PERF_COUNT_SUB("populateTessScoreEvaluationInputData") + // perform evaluations + unsigned int numUpdates; + status = profileDevices(profile, DS_EVALUATE_ALL, + evaluateScoreForDevice, &input, &numUpdates); + PERF_COUNT_SUB("profileDevices") + // write scores to file + if (status == DS_SUCCESS) { + status = writeProfileToFile(profile, serializeScore, fileName); + PERF_COUNT_SUB("writeProfileToFile") + if (status == DS_SUCCESS) { + printf("[DS] Scores written to file (%s).\n", fileName); + } else { + printf( + "[DS] Error saving scores to file (%s); scores not written to " + "file.\n", + fileName); + } } else { - printf("[DS] Error saving scores to file (%s); scores not written to file.\n", fileName); + printf( + "[DS] Unable to evaluate performance; scores not written to " + "file.\n"); } } else { - printf("[DS] Unable to evaluate performance; scores not written to file.\n"); + PERF_COUNT_SUB("readProfileFromFile") + printf("[DS] Profile read from file (%s).\n", fileName); } - } else { -PERF_COUNT_SUB("readProfileFromFile") - printf("[DS] Profile read from file (%s).\n", fileName); - } + // we now have device scores either from file or evaluation + // select fastest using custom Tesseract selection algorithm + float bestTime = FLT_MAX; // begin search with worst possible time + int bestDeviceIdx = -1; + for (int d = 0; d < profile->numDevices; d++) { + ds_device device = profile->devices[d]; + TessDeviceScore score = *(TessDeviceScore *)device.score; - // we now have device scores either from file or evaluation - // select fastest using custom Tesseract selection algorithm - float bestTime = FLT_MAX; // begin search with worst possible time - int bestDeviceIdx = -1; - for (int d = 0; d < profile->numDevices; d++) { - ds_device device = profile->devices[d]; - 
TessDeviceScore score = *(TessDeviceScore *)device.score; - - float time = score.time; - printf("[DS] Device[%i] %i:%s score is %f\n", d+1, device.type, device.oclDeviceName, time); - if (time < bestTime) { - bestTime = time; + float time = score.time; + printf("[DS] Device[%i] %i:%s score is %f\n", d + 1, device.type, + device.oclDeviceName, time); + if (time < bestTime) { + bestTime = time; bestDeviceIdx = d; + } } - } - printf("[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native"); - // cleanup - // TODO: call destructor for profile object? + printf("[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1, + profile->devices[bestDeviceIdx].oclDeviceName, + profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE + ? "OpenCL" + : "Native"); + // cleanup + // TODO: call destructor for profile object? - bool overrided = false; + bool overridden = false; char *overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE"); - if (overrideDeviceStr != NULL) { + if (overrideDeviceStr != nullptr) { int overrideDeviceIdx = atoi(overrideDeviceStr); - if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices ) { - printf("[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, %i)\n", overrideDeviceStr, overrideDeviceIdx); + if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) { + printf( + "[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, " + "%i)\n", + overrideDeviceStr, overrideDeviceIdx); bestDeviceIdx = overrideDeviceIdx - 1; - overrided = true; + overridden = true; } else { - printf("[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are valid devices).\n", overrideDeviceStr, profile->numDevices); + printf( + "[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are " + "valid devices).\n", + overrideDeviceStr, profile->numDevices); } } - if (overrided) { - printf("[DS] 
Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx+1, profile->devices[bestDeviceIdx].oclDeviceName, profile->devices[bestDeviceIdx].type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native"); + if (overridden) { + printf("[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1, + profile->devices[bestDeviceIdx].oclDeviceName, + profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE + ? "OpenCL" + : "Native"); } selectedDevice = profile->devices[bestDeviceIdx]; // cleanup @@ -3317,15 +2813,15 @@ PERF_COUNT_SUB("readProfileFromFile") printf("[DS] OpenCL runtime not available.\n"); selectedDevice.type = DS_DEVICE_NATIVE_CPU; selectedDevice.oclDeviceName = "(null)"; - selectedDevice.score = NULL; - selectedDevice.oclDeviceID = NULL; - selectedDevice.oclDriverVersion = NULL; + selectedDevice.score = nullptr; + selectedDevice.oclDeviceID = nullptr; + selectedDevice.oclDriverVersion = nullptr; } deviceIsSelected = true; -PERF_COUNT_SUB("select from Profile") -PERF_COUNT_END + PERF_COUNT_SUB("select from Profile") + PERF_COUNT_END } -//PERF_COUNT_END + // PERF_COUNT_END return selectedDevice; } @@ -3335,172 +2831,4 @@ bool OpenclDevice::selectedDeviceIsOpenCL() { return (device.type == DS_DEVICE_OPENCL_DEVICE); } -bool OpenclDevice::selectedDeviceIsNativeCPU() { - ds_device device = getDeviceSelection(); - return (device.type == DS_DEVICE_NATIVE_CPU); -} - - - -/*! - * pixConvertRGBToGray() from leptonica, converted to opencl kernel - * - * Input: pix (32 bpp RGB) - * rwt, gwt, bwt (non-negative; these should add to 1.0, - * or use 0.0 for default) - * Return: 8 bpp pix, or null on error - * - * Notes: - * (1) Use a weighted average of the RGB values. 
- */ -#define SET_DATA_BYTE( pdata, n, val ) (*(l_uint8 *)((l_uintptr_t)((l_uint8 *)(pdata) + (n)) ^ 3) = (val)) - -Pix * OpenclDevice::pixConvertRGBToGrayOCL( - Pix *srcPix, // 32-bit source - float rwt, - float gwt, - float bwt ) -{ -PERF_COUNT_START("pixConvertRGBToGrayOCL") - Pix *dstPix; // 8-bit destination - - if (rwt < 0.0 || gwt < 0.0 || bwt < 0.0) return NULL; - - if (rwt == 0.0 && gwt == 0.0 && bwt == 0.0) { - // magic numbers from leptonica - rwt = 0.3; - gwt = 0.5; - bwt = 0.2; - } - // normalize - float sum = rwt + gwt + bwt; - rwt /= sum; - gwt /= sum; - bwt /= sum; - - // source pix - int w, h; - pixGetDimensions(srcPix, &w, &h, NULL); - //printf("Image is %i x %i\n", w, h); - unsigned int *srcData = pixGetData(srcPix); - int srcWPL = pixGetWpl(srcPix); - int srcSize = srcWPL * h * sizeof(unsigned int); - - // destination pix - if ((dstPix = pixCreate(w, h, 8)) == NULL) - return NULL; - pixCopyResolution(dstPix, srcPix); - unsigned int *dstData = pixGetData(dstPix); - int dstWPL = pixGetWpl(dstPix); - int dstWords = dstWPL * h; - int dstSize = dstWords * sizeof(unsigned int); - //printf("dstSize = %i\n", dstSize); -PERF_COUNT_SUB("pix setup") - - // opencl objects - cl_int clStatus; - KernelEnv kEnv; - SetKernelEnv( &kEnv ); - - // source buffer - cl_mem srcBuffer = clCreateBuffer( kEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, srcSize, (void *)srcData, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer srcBuffer"); - - // destination buffer - cl_mem dstBuffer = clCreateBuffer( kEnv.mpkContext, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, dstSize, (void *)dstData, &clStatus ); - CHECK_OPENCL( clStatus, "clCreateBuffer dstBuffer"); - - // setup work group size parameters - int block_size = 256; - int numWorkGroups = ((h*w+block_size-1) / block_size ); - int numThreads = block_size*numWorkGroups; - size_t local_work_size[] = {static_cast(block_size)}; - size_t global_work_size[] = {static_cast(numThreads)}; - //printf("Enqueueing %i threads 
for %i output pixels\n", numThreads, w*h); - - /* compile kernel */ - kEnv.mpkKernel = clCreateKernel( kEnv.mpkProgram, "kernel_RGBToGray", &clStatus ); - CHECK_OPENCL( clStatus, "clCreateKernel kernel_RGBToGray"); - - - /* set kernel arguments */ - clStatus = clSetKernelArg( kEnv.mpkKernel, 0, sizeof(cl_mem), (void *)&srcBuffer ); - CHECK_OPENCL( clStatus, "clSetKernelArg srcBuffer"); - clStatus = clSetKernelArg( kEnv.mpkKernel, 1, sizeof(cl_mem), (void *)&dstBuffer ); - CHECK_OPENCL( clStatus, "clSetKernelArg dstBuffer"); - clStatus = clSetKernelArg( kEnv.mpkKernel, 2, sizeof(int), (void *)&srcWPL ); - CHECK_OPENCL( clStatus, "clSetKernelArg srcWPL" ); - clStatus = clSetKernelArg( kEnv.mpkKernel, 3, sizeof(int), (void *)&dstWPL ); - CHECK_OPENCL( clStatus, "clSetKernelArg dstWPL" ); - clStatus = clSetKernelArg( kEnv.mpkKernel, 4, sizeof(int), (void *)&h ); - CHECK_OPENCL( clStatus, "clSetKernelArg height" ); - clStatus = clSetKernelArg( kEnv.mpkKernel, 5, sizeof(int), (void *)&w ); - CHECK_OPENCL( clStatus, "clSetKernelArg width" ); - clStatus = clSetKernelArg( kEnv.mpkKernel, 6, sizeof(float), (void *)&rwt ); - CHECK_OPENCL( clStatus, "clSetKernelArg rwt" ); - clStatus = clSetKernelArg( kEnv.mpkKernel, 7, sizeof(float), (void *)&gwt ); - CHECK_OPENCL( clStatus, "clSetKernelArg gwt"); - clStatus = clSetKernelArg( kEnv.mpkKernel, 8, sizeof(float), (void *)&bwt ); - CHECK_OPENCL( clStatus, "clSetKernelArg bwt"); - - /* launch kernel & wait */ -PERF_COUNT_SUB("before") - clStatus = clEnqueueNDRangeKernel( - kEnv.mpkCmdQueue, - kEnv.mpkKernel, - 1, NULL, global_work_size, local_work_size, - 0, NULL, NULL ); - CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_RGBToGray" ); - clFinish( kEnv.mpkCmdQueue ); -PERF_COUNT_SUB("kernel") - - /* map results back from gpu */ - void *ptr = clEnqueueMapBuffer(kEnv.mpkCmdQueue, dstBuffer, CL_TRUE, CL_MAP_READ, 0, dstSize, 0, NULL, NULL, &clStatus); - CHECK_OPENCL( clStatus, "clEnqueueMapBuffer dstBuffer"); - 
clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, dstBuffer, ptr, 0, NULL, NULL); - -#if 0 - // validate: compute on cpu - Pix *cpuPix = pixCreate(w, h, 8); - pixCopyResolution(cpuPix, srcPix); - unsigned int *cpuData = pixGetData(cpuPix); - int cpuWPL = pixGetWpl(cpuPix); - unsigned int *cpuLine, *srcLine; - int i, j; - for (i = 0, srcLine = srcData, cpuLine = cpuData; i < h; i++) { - for (j = 0; j < w; j++) { - unsigned int word = *(srcLine + j); - int val = (l_int32)(rwt * ((word >> L_RED_SHIFT) & 0xff) + - gwt * ((word >> L_GREEN_SHIFT) & 0xff) + - bwt * ((word >> L_BLUE_SHIFT) & 0xff) + 0.5); - SET_DATA_BYTE(cpuLine, j, val); - } - srcLine += srcWPL; - cpuLine += cpuWPL; - } - - // validate: compare - printf("converted 32-bit -> 8-bit image\n"); - for (int row = 0; row < h; row++) { - for (int col = 0; col < w; col++) { - int idx = row*w + col; - unsigned int srcVal = srcData[idx]; - unsigned char cpuVal = ((unsigned char *)cpuData)[idx]; - unsigned char oclVal = ((unsigned char *)dstData)[idx]; - if (srcVal > 0) { - printf("%4i,%4i: %u, %u, %u\n", row, col, srcVal, cpuVal, oclVal); - } - } - //printf("\n"); - } -#endif - // release opencl objects - clReleaseMemObject(srcBuffer); - clReleaseMemObject(dstBuffer); - - -PERF_COUNT_END - // success - return dstPix; -} #endif diff --git a/opencl/openclwrapper.h b/opencl/openclwrapper.h index e422a09e..ad79fa6c 100644 --- a/opencl/openclwrapper.h +++ b/opencl/openclwrapper.h @@ -1,3 +1,16 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef TESSERACT_OPENCL_OPENCLWRAPPER_H_ +#define TESSERACT_OPENCL_OPENCLWRAPPER_H_ + #include #include "allheaders.h" #include "pix.h" @@ -10,7 +23,8 @@ // including CL/cl.h doesn't occur until USE_OPENCL defined below // platform preprocessor commands -#if defined( WIN32 ) || defined( __WIN32__ ) || defined( _WIN32 ) || defined( __CYGWIN__ ) || defined( __MINGW32__ ) +#if defined(WIN32) || defined(__WIN32__) || defined(_WIN32) || \ + defined(__CYGWIN__) || defined(__MINGW32__) #define ON_WINDOWS 1 #define ON_LINUX 0 #define ON_APPLE 0 @@ -80,21 +94,23 @@ time_sub_start = time_funct_start; \ time_sub_end = time_funct_start; -#define PERF_COUNT_END \ - QueryPerformanceCounter(&time_funct_end); \ - elapsed_time_sec = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart); \ - printf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec); +#define PERF_COUNT_END \ + QueryPerformanceCounter(&time_funct_end); \ + elapsed_time_sec = (time_funct_end.QuadPart - time_funct_start.QuadPart) / \ + (double)(freq.QuadPart); \ + printf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec); #else #define PERF_COUNT_START(FUNCT_NAME) #define PERF_COUNT_END #endif #if PERF_COUNT_VERBOSE >= 3 -#define PERF_COUNT_SUB(SUB) \ - QueryPerformanceCounter(&time_sub_end); \ - elapsed_time_sec = (time_sub_end.QuadPart-time_sub_start.QuadPart)/(double)(freq.QuadPart); \ - printf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \ - time_sub_start = time_sub_end; +#define PERF_COUNT_SUB(SUB) \ + QueryPerformanceCounter(&time_sub_end); \ + elapsed_time_sec = (time_sub_end.QuadPart - time_sub_start.QuadPart) / \ + (double)(freq.QuadPart); \ + printf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \ + time_sub_start = time_sub_end; #else #define PERF_COUNT_SUB(SUB) #endif @@ -112,21 +128,25 @@ time_sub_start = time_funct_start; \ 
time_sub_end = time_funct_start; -#define PERF_COUNT_END \ - clock_gettime( CLOCK_MONOTONIC, &time_funct_end ); \ - elapsed_time_sec = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0; \ - printf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec); +#define PERF_COUNT_END \ + clock_gettime(CLOCK_MONOTONIC, &time_funct_end); \ + elapsed_time_sec = \ + (time_funct_end.tv_sec - time_funct_start.tv_sec) * 1.0 + \ + (time_funct_end.tv_nsec - time_funct_start.tv_nsec) / 1000000000.0; \ + printf(PERF_COUNT_REPORT_STR, funct_name, "total", elapsed_time_sec); #else #define PERF_COUNT_START(FUNCT_NAME) #define PERF_COUNT_END #endif #if PERF_COUNT_VERBOSE >= 3 -#define PERF_COUNT_SUB(SUB) \ - clock_gettime( CLOCK_MONOTONIC, &time_sub_end ); \ - elapsed_time_sec = (time_sub_end.tv_sec - time_sub_start.tv_sec)*1.0 + (time_sub_end.tv_nsec - time_sub_start.tv_nsec)/1000000000.0; \ - printf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \ - time_sub_start = time_sub_end; +#define PERF_COUNT_SUB(SUB) \ + clock_gettime(CLOCK_MONOTONIC, &time_sub_end); \ + elapsed_time_sec = \ + (time_sub_end.tv_sec - time_sub_start.tv_sec) * 1.0 + \ + (time_sub_end.tv_nsec - time_sub_start.tv_nsec) / 1000000000.0; \ + printf(PERF_COUNT_REPORT_STR, funct_name, SUB, elapsed_time_sec); \ + time_sub_start = time_sub_end; #else #define PERF_COUNT_SUB(SUB) #endif @@ -171,29 +191,6 @@ typedef struct _OpenCLEnv } OpenCLEnv; typedef int ( *cl_kernel_function )( void **userdata, KernelEnv *kenv ); - -static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC; - -static const l_uint32 lmask32[] = {0x0, - 0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, - 0xf8000000, 0xfc000000, 0xfe000000, 0xff000000, - 0xff800000, 0xffc00000, 0xffe00000, 0xfff00000, - 0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000, - 0xffff8000, 0xffffc000, 0xffffe000, 0xfffff000, - 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00, - 0xffffff80, 0xffffffc0, 
0xffffffe0, 0xfffffff0, - 0xfffffff8, 0xfffffffc, 0xfffffffe, 0xffffffff}; - -static const l_uint32 rmask32[] = {0x0, - 0x00000001, 0x00000003, 0x00000007, 0x0000000f, - 0x0000001f, 0x0000003f, 0x0000007f, 0x000000ff, - 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff, - 0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, - 0x0001ffff, 0x0003ffff, 0x0007ffff, 0x000fffff, - 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff, - 0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, - 0x1fffffff, 0x3fffffff, 0x7fffffff, 0xffffffff}; - #define CHECK_OPENCL(status,name) \ if( status != CL_SUCCESS ) \ { \ @@ -244,13 +241,7 @@ public: static int BinaryGenerated( const char * clFileName, FILE ** fhandle ); //static int CompileKernelFile( const char *filename, GPUEnv *gpuInfo, const char *buildOption ); static l_uint32* pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl, l_uint32 *line); - static Pix* pixReadTiffCl( const char *filename, l_int32 n ); - static PIX * pixReadStreamTiffCl ( FILE *fp, l_int32 n ); - static PIX * pixReadMemTiffCl(const l_uint8 *data, size_t size, l_int32 n); - static PIX* pixReadFromTiffStreamCl(TIFF *tif); static int composeRGBPixelCl(int *tiffdata,int *line,int h,int w); - static l_int32 getTiffStreamResolutionCl(TIFF *tif,l_int32 *pxres,l_int32 *pyres); - static TIFF* fopenTiffCl(FILE *fp,const char *modestring); /* OpenCL implementations of Morphological operations*/ @@ -258,30 +249,12 @@ public: static int initMorphCLAllocations(l_int32 wpl, l_int32 h, PIX* pixs); static void releaseMorphCLBuffers(); - // OpenCL implementation of Morphology Dilate - static PIX* pixDilateBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy); - - // OpenCL implementation of Morphology Erode - static PIX* pixErodeBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy); - - // OpenCL implementation of Morphology Close - static PIX* pixCloseBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool 
reqDataCopy); - - // OpenCL implementation of Morphology Open - static PIX* pixOpenBrickCL(PIX *pixd, PIX *pixs, l_int32 hsize, l_int32 vsize, bool reqDataCopy); - - // OpenCL implementation of Morphology Open - static PIX* pixSubtractCL(PIX *pixd, PIX *pixs1, PIX *pixs2, bool reqDataCopy); - - // OpenCL implementation of Morphology (Hollow = Closed - Open) - static PIX* pixHollowCL(PIX *pixd, PIX *pixs, l_int32 close_hsize, l_int32 close_vsize, l_int32 open_hsize, l_int32 open_vsize, bool reqDataCopy); - - static void pixGetLinesCL(PIX *pixd, PIX *pixs, - PIX** pix_vline, PIX** pix_hline, - PIX** pixClosed, bool getpixClosed, - l_int32 close_hsize, l_int32 close_vsize, - l_int32 open_hsize, l_int32 open_vsize, - l_int32 line_hsize, l_int32 line_vsize); + static void pixGetLinesCL(PIX *pixd, PIX *pixs, PIX **pix_vline, + PIX **pix_hline, PIX **pixClosed, + bool getpixClosed, l_int32 close_hsize, + l_int32 close_vsize, l_int32 open_hsize, + l_int32 open_vsize, l_int32 line_hsize, + l_int32 line_vsize); //int InitOpenclAttr( OpenCLEnv * env ); //int ReleaseKernel( KernelEnv * env ); @@ -293,8 +266,6 @@ public: //int RegisterKernelWrapper( const char *kernelName, cl_kernel_function function ); //int RunKernelWrapper( cl_kernel_function function, const char * kernelName, void **usrdata ); //int GetKernelEnvAndFunc( const char *kernelName, KernelEnv *env, cl_kernel_function *function ); - // static cl_device_id performDeviceSelection( ); - //static bool thresholdRectToPixMicroBench( TessScoreEvaluationInputData input, ds_device_type type); static int LoadOpencl(); #ifdef WIN32 @@ -302,42 +273,25 @@ public: static void FreeOpenclDll(); #endif - inline static int AddKernelConfig( int kCount, const char *kName ); /* for binarization */ - static int HistogramRectOCL( - const unsigned char *imagedata, - int bytes_per_pixel, - int bytes_per_line, - int left, - int top, - int width, - int height, - int kHistogramSize, - int *histogramAllChannels); + static int 
HistogramRectOCL(unsigned char *imagedata, int bytes_per_pixel, + int bytes_per_line, int left, int top, + int width, int height, int kHistogramSize, + int *histogramAllChannels); - static int ThresholdRectToPixOCL( - const unsigned char* imagedata, - int bytes_per_pixel, - int bytes_per_line, - const int* thresholds, - const int* hi_values, - Pix** pix, - int rect_height, - int rect_width, - int rect_top, - int rect_left); - - static Pix * pixConvertRGBToGrayOCL( Pix *pix, float weightRed = 0.3, float weightGreen = 0.5, float weightBlue = 0.2 ); + static int ThresholdRectToPixOCL(unsigned char *imagedata, + int bytes_per_pixel, int bytes_per_line, + int *thresholds, int *hi_values, Pix **pix, + int rect_height, int rect_width, + int rect_top, int rect_left); static ds_device getDeviceSelection(); static ds_device selectedDevice; static bool deviceIsSelected; static bool selectedDeviceIsOpenCL(); - static bool selectedDeviceIsNativeCPU(); - }; - -#endif +#endif // USE_OPENCL +#endif // TESSERACT_OPENCL_OPENCLWRAPPER_H_ diff --git a/snap/snapcraft.yaml b/snap/snapcraft.yaml new file mode 100644 index 00000000..4dd68d4a --- /dev/null +++ b/snap/snapcraft.yaml @@ -0,0 +1,34 @@ +name: tesseract +version: master +summary: open source optical character recognition engine +description: | + Tesseract has unicode (UTF-8) support, and can recognize more than 100 + languages "out of the box". It can be trained to recognize other languages. + Tesseract supports various output formats: plain-text, hocr(html), pdf. + +grade: devel # must be 'stable' to release into candidate/stable channels +confinement: strict + +apps: + tesseract: + command: env TESSDATA_PREFIX=$SNAP_USER_COMMON tesseract + plugs: [home] + +parts: + tesseract: + source: . 
+ plugin: autotools + build-packages: + - autoconf-archive + - pkg-config + - libpng12-dev + - libjpeg8-dev + - libtiff5-dev + - zlib1g-dev + - libicu-dev + - libpango1.0-dev + - libcairo2-dev + after: [leptonica] + leptonica: + source: http://www.leptonica.org/source/leptonica-1.74.1.tar.gz + plugin: autotools diff --git a/tessdata/Makefile.am b/tessdata/Makefile.am index 184c94c6..f1fc9757 100644 --- a/tessdata/Makefile.am +++ b/tessdata/Makefile.am @@ -27,22 +27,7 @@ langdata = bul.traineddata mlt.traineddata chr.traineddata \ tur.traineddata epo.traineddata msa.traineddata \ kor.traineddata isl.traineddata jpn.traineddata \ chi_tra.traineddata ita.traineddata spa_old.traineddata \ - deu-frak.traineddata aze.traineddata fra.cube.lm \ - ita.tesseract_cube.nn eng.cube.word-freq rus.cube.lm \ - spa.cube.size fra.cube.nn fra.cube.params rus.cube.size \ - fra.cube.fold eng.cube.size ita.cube.bigrams \ - eng.tesseract_cube.nn rus.cube.params hin.cube.nn \ - spa.cube.params hin.cube.lm fra.cube.word-freq \ - spa.cube.word-freq ara.cube.nn ara.cube.word-freq \ - spa.cube.fold eng.cube.nn eng.cube.params eng.cube.lm \ - ita.cube.size hin.tesseract_cube.nn ita.cube.lm \ - fra.cube.bigrams ara.cube.fold spa.cube.bigrams \ - hin.cube.word-freq rus.cube.word-freq ita.cube.word-freq \ - fra.tesseract_cube.nn rus.cube.fold ara.cube.size \ - eng.cube.fold ita.cube.params ara.cube.params ita.cube.fold \ - ara.cube.bigrams hin.cube.params hin.cube.fold spa.cube.lm \ - ita.cube.nn fra.cube.size eng.cube.bigrams ara.cube.lm \ - rus.cube.nn spa.cube.nn hin.cube.bigrams + deu-frak.traineddata aze.traineddata uninstall-local: cd $(DESTDIR)$(datadir); \ diff --git a/tessdata/configs/Makefile.am b/tessdata/configs/Makefile.am index a4f1d675..461ac176 100644 --- a/tessdata/configs/Makefile.am +++ b/tessdata/configs/Makefile.am @@ -1,3 +1,3 @@ datadir = @datadir@/tessdata/configs -data_DATA = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr quiet logfile 
digits hocr tsv linebox pdf rebox strokewidth bigram txt -EXTRA_DIST = inter makebox box.train unlv ambigs.train api_config kannada box.train.stderr quiet logfile digits hocr tsv linebox pdf rebox strokewidth bigram txt +data_DATA = inter makebox box.train unlv ambigs.train lstm.train api_config kannada box.train.stderr quiet logfile digits hocr tsv linebox pdf rebox strokewidth bigram txt +EXTRA_DIST = inter makebox box.train unlv ambigs.train lstm.train api_config kannada box.train.stderr quiet logfile digits hocr tsv linebox pdf rebox strokewidth bigram txt diff --git a/tessdata/configs/box.train.stderr b/tessdata/configs/box.train.stderr index 6fc51fdd..d44ff2b2 100644 --- a/tessdata/configs/box.train.stderr +++ b/tessdata/configs/box.train.stderr @@ -1,7 +1,7 @@ -file_type .bl -#tessedit_use_nn F -textord_fast_pitch_test T -tessedit_single_match 0 +file_type .bl +#tessedit_use_nn F +textord_fast_pitch_test T +tessedit_single_match 0 tessedit_zero_rejection T tessedit_minimal_rejection F tessedit_write_rep_codes F diff --git a/tessdata/configs/lstm.train b/tessdata/configs/lstm.train new file mode 100755 index 00000000..3cb172d5 --- /dev/null +++ b/tessdata/configs/lstm.train @@ -0,0 +1,13 @@ +disable_character_fragments T +file_type .bl +textord_fast_pitch_test T +tessedit_single_match 0 +tessedit_zero_rejection T +tessedit_minimal_rejection F +tessedit_write_rep_codes F +il1_adaption_test 1 +edges_children_fix F +edges_childarea 0.65 +edges_boxarea 0.9 +tessedit_train_line_recognizer T +textord_no_rejects T diff --git a/tessdata/pdf.ttf b/tessdata/pdf.ttf index 578974a9..d1472b20 100644 Binary files a/tessdata/pdf.ttf and b/tessdata/pdf.ttf differ diff --git a/tesseract.pc.cmake b/tesseract.pc.cmake new file mode 100644 index 00000000..f9f64f6c --- /dev/null +++ b/tesseract.pc.cmake @@ -0,0 +1,12 @@ +prefix=@CMAKE_INSTALL_PREFIX@ +exec_prefix=${prefix}/bin +libdir=${prefix}/lib +includedir=${prefix}/include + +Name: @tesseract_NAME@ +Description: An OCR 
Engine that was developed at HP Labs between 1985 and 1995... and now at Google. +URL: https://github.com/tesseract-ocr/tesseract +Version: @tesseract_VERSION@ +Libs: -L${libdir} -l@tesseract_OUTPUT_NAME@ +Libs.private: +Cflags: -I${includedir} -I${includedir}/tesseract diff --git a/testing/counttestset.sh b/testing/counttestset.sh index d44b7e83..4bce68da 100755 --- a/testing/counttestset.sh +++ b/testing/counttestset.sh @@ -49,13 +49,13 @@ do fi # echo "$srcdir/$page.tif" # Count character errors. - testing/unlv/accuracy $srcdir/$page.txt $resdir/$page.txt $resdir/$page.acc + testing/unlv/accuracy "$srcdir/$page.txt" "$resdir/$page.txt" "$resdir/$page.acc" accfiles="$accfiles $resdir/$page.acc" # Count word errors. - testing/unlv/wordacc $srcdir/$page.txt $resdir/$page.txt $resdir/$page.wa + testing/unlv/wordacc "$srcdir/$page.txt" "$resdir/$page.txt" "$resdir/$page.wa" wafiles="$wafiles $resdir/$page.wa" -done <$pages -testing/unlv/accsum $accfiles >testing/reports/$setname.characc -testing/unlv/wordaccsum $wafiles >testing/reports/$setname.wordacc +done <"$pages" +testing/unlv/accsum "$accfiles" >"testing/reports/$setname.characc" +testing/unlv/wordaccsum "$wafiles" >"testing/reports/$setname.wordacc" diff --git a/testing/reorgdata.sh b/testing/reorgdata.sh index 141de4a6..34ad6d69 100755 --- a/testing/reorgdata.sh +++ b/testing/reorgdata.sh @@ -1,4 +1,13 @@ #!/bin/bash +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
if [ $# -ne 1 ] then @@ -24,21 +33,21 @@ do if [ -r "$old/PAGES" ] then new=${s#*/}.$ext - mkdir -p $new + mkdir -p "$new" echo "Set $old -> $new" #The pages file had - instead of _ so fix it and add the extension. - for page in `cat $old/PAGES` + for page in $(cat $old/PAGES) do echo "${page%-*}_${page#*-}.$ext" - done >$new/pages - for f in `cat $new/pages` + done >"$new/pages" + for f in $(cat "$new/pages") do #Put a tif extension on the tif files. - cp $old/${old}_B/$f $new/$f.tif + cp "$old/${old}_B/$f" "$new/$f.tif" #Put a uzn extension on the zone files. - cp $old/${old}_B/${f}Z $new/$f.uzn + cp "$old/${old}_B/${f}Z" "$new/$f.uzn" #Cat all the truth files together and put into a single txt file. - cat $old/${old}_GT/${f%.$ext}.Z* >$new/$f.txt + cat "$old/${old}_GT/${f%.$ext}".Z* >"$new/$f.txt" done fi done diff --git a/testing/runalltests.sh b/testing/runalltests.sh index b23ebc96..f458bdea 100755 --- a/testing/runalltests.sh +++ b/testing/runalltests.sh @@ -25,12 +25,12 @@ then echo "Run $0 from the tesseract-ocr root directory!" exit 1 fi -if [ ! -r api/tesseract -a ! -r tesseract.exe ] +if [ ! -r api/tesseract ] && [ ! -r tesseract.exe ] then echo "Please build tesseract before running $0" exit 1 fi -if [ ! -r testing/unlv/accuracy -a ! -r testing/unlv/accuracy.exe ] +if [ ! -r testing/unlv/accuracy ] && [ ! -r testing/unlv/accuracy.exe ] then echo "Please download the UNLV accuracy tools (and build) to testing/unlv" exit 1 @@ -39,7 +39,7 @@ fi #deltapc new old calculates the %change from old to new deltapc() { awk ' BEGIN { -printf("%.2f", 100.0*('$1'-'$2')/'$2'); +printf("%.2f", 100.0*('"$1"'-'"$2"')/'"$2"'); }' } @@ -53,7 +53,7 @@ total = 0.0; } END { printf("%.2f\n", total); -}' $1 +}' "$1" } imdir="$1" @@ -74,47 +74,47 @@ totaloldwerrs=0 totaloldnswerrs=0 for set in $testsets do - if [ -r $imdir/$set/pages ] + if [ -r "$imdir/$set/pages" ] then # Run tesseract on all the pages. 
- $bindir/runtestset.sh $imdir/$set/pages + $bindir/runtestset.sh "$imdir/$set/pages" # Count the errors on all the pages. - $bindir/counttestset.sh $imdir/$set/pages + $bindir/counttestset.sh "$imdir/$set/pages" # Get the old character word and nonstop word errors. - olderrs=`cat testing/reports/1995.$set.sum | cut -f3` - oldwerrs=`cat testing/reports/1995.$set.sum | cut -f6` - oldnswerrs=`cat testing/reports/1995.$set.sum | cut -f9` + olderrs=$(cut -f3 "testing/reports/1995.$set.sum") + oldwerrs=$(cut -f6 "testing/reports/1995.$set.sum") + oldnswerrs=$(cut -f9 "testing/reports/1995.$set.sum") # Get the new character word and nonstop word errors and accuracy. - cherrs=`head -4 testing/reports/$set.characc |tail -1 |cut -c1-9 | - tr -d '[:blank:]'` - chacc=`head -5 testing/reports/$set.characc |tail -1 |cut -c1-9 | - tr -d '[:blank:]'` - wderrs=`head -4 testing/reports/$set.wordacc |tail -1 |cut -c1-9 | - tr -d '[:blank:]'` - wdacc=`head -5 testing/reports/$set.wordacc |tail -1 |cut -c1-9 | - tr -d '[:blank:]'` - nswderrs=`grep Total testing/reports/$set.wordacc |head -2 |tail -1 | - cut -c10-17 |tr -d '[:blank:]'` - nswdacc=`grep Total testing/reports/$set.wordacc |head -2 |tail -1 | - cut -c19-26 |tr -d '[:blank:]'` + cherrs=$(head -4 "testing/reports/$set.characc" |tail -1 |cut -c1-9 | + tr -d '[:blank:]') + chacc=$(head -5 "testing/reports/$set.characc" |tail -1 |cut -c1-9 | + tr -d '[:blank:]') + wderrs=$(head -4 "testing/reports/$set.wordacc" |tail -1 |cut -c1-9 | + tr -d '[:blank:]') + wdacc=$(head -5 "testing/reports/$set.wordacc" |tail -1 |cut -c1-9 | + tr -d '[:blank:]') + nswderrs=$(grep Total "testing/reports/$set.wordacc" |head -2 |tail -1 | + cut -c10-17 |tr -d '[:blank:]') + nswdacc=$(grep Total "testing/reports/$set.wordacc" |head -2 |tail -1 | + cut -c19-26 |tr -d '[:blank:]') # Compute the percent change. 
- chdelta=`deltapc $cherrs $olderrs` - wdelta=`deltapc $wderrs $oldwerrs` - nswdelta=`deltapc $nswderrs $oldnswerrs` + chdelta=$(deltapc "$cherrs" "$olderrs") + wdelta=$(deltapc "$wderrs" "$oldwerrs") + nswdelta=$(deltapc "$nswderrs" "$oldnswerrs") sumfile=$rdir/$vid.$set.sum - if [ -r testing/reports/$set.times ] + if [ -r "testing/reports/$set.times" ] then - total_time=`timesum testing/reports/$set.times` - if [ -r testing/reports/prev/$set.times ] + total_time=$(timesum "testing/reports/$set.times") + if [ -r "testing/reports/prev/$set.times" ] then - paste testing/reports/prev/$set.times testing/reports/$set.times | - awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >testing/reports/$set.timedelta + paste "testing/reports/prev/$set.times" "testing/reports/$set.times" | + awk '{ printf("%s %.2f\n", $1, $4-$2); }' |sort -k2n >"testing/reports/$set.timedelta" fi else total_time='0.0' fi echo "$vid $set $cherrs $chacc $chdelta% $wderrs $wdacc\ - $wdelta% $nswderrs $nswdacc $nswdelta% ${total_time}s" >$sumfile + $wdelta% $nswderrs $nswdacc $nswdelta% ${total_time}s" >"$sumfile" # Sum totals over all the testsets. let totalerrs=totalerrs+cherrs let totalwerrs=totalwerrs+wderrs @@ -125,10 +125,10 @@ do fi done # Compute grand total percent change. 
-chdelta=`deltapc $totalerrs $totalolderrs` -wdelta=`deltapc $totalwerrs $totaloldwerrs` -nswdelta=`deltapc $totalnswerrs $totaloldnswerrs ` +chdelta=$(deltapc $totalerrs $totalolderrs) +wdelta=$(deltapc $totalwerrs $totaloldwerrs) +nswdelta=$(deltapc $totalnswerrs $totaloldnswerrs) tfile=$rdir/$vid.total.sum echo "$vid Total $totalerrs - $chdelta% $totalwerrs\ - - $wdelta% $totalnswerrs - $nswdelta%" >$tfile -cat $rdir/1995.*.sum $rdir/$vid.*.sum >$rdir/$vid.summary + - $wdelta% $totalnswerrs - $nswdelta%" >"$tfile" +cat $rdir/1995.*.sum "$rdir/$vid".*.sum >"$rdir/$vid".summary diff --git a/testing/runtestset.sh b/testing/runtestset.sh index 0c9595f9..b53e15aa 100755 --- a/testing/runtestset.sh +++ b/testing/runtestset.sh @@ -15,7 +15,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -if [ $# -ne 1 -a $# -ne 2 ] +if [ $# -ne 1 ] && [ $# -ne 2 ] then echo "Usage:$0 pagesfile [-zoning]" exit 1 @@ -42,7 +42,7 @@ fi pages=$1 imdir=${pages%/pages} setname=${imdir##*/} -if [ $# -eq 2 -a "$2" = "-zoning" ] +if [ $# -eq 2 ] && [ "$2" = "-zoning" ] then config=unlv.auto resdir=testing/results/zoning.$setname @@ -51,8 +51,8 @@ else resdir=testing/results/$setname fi echo -e "Testing on set $setname in directory $imdir to $resdir\n" -mkdir -p $resdir -rm -f testing/reports/$setname.times +mkdir -p "$resdir" +rm -f "testing/reports/$setname.times" while read page dir do # A pages file may be a list of files with subdirs or maybe just @@ -64,15 +64,15 @@ do srcdir="$imdir" fi # echo "$srcdir/$page.tif" - $tess $srcdir/$page.tif $resdir/$page -psm 6 $config 2>&1 |grep -v "OCR Engine" + $tess "$srcdir/$page.tif" "$resdir/$page" --psm 6 $config 2>&1 |grep -v "OCR Engine" if [ -r times.txt ] then read t >testing/reports/$setname.times + echo "$page $t" >>"testing/reports/$setname.times" echo -e "\033M$page $t" if [ "$t" = "Command terminated by signal 2" ] then exit 0 fi fi -done <$pages +done <"$pages" diff --git 
a/textord/alignedblob.cpp b/textord/alignedblob.cpp index 007d4ad3..1aee958f 100644 --- a/textord/alignedblob.cpp +++ b/textord/alignedblob.cpp @@ -30,7 +30,6 @@ INT_VAR(textord_testregion_left, -1, "Left edge of debug reporting rectangle"); INT_VAR(textord_testregion_top, -1, "Top edge of debug reporting rectangle"); INT_VAR(textord_testregion_right, MAX_INT32, "Right edge of debug rectangle"); INT_VAR(textord_testregion_bottom, MAX_INT32, "Bottom edge of debug rectangle"); -BOOL_VAR(textord_debug_images, false, "Use greyed image background for debug"); BOOL_VAR(textord_debug_printable, false, "Make debug windows printable"); namespace tesseract { @@ -64,25 +63,6 @@ const double kMinTabGradient = 4.0; // If the angle is small, the angle in degrees is roughly 60/kMaxSkewFactor. const int kMaxSkewFactor = 15; -// Constant part of textord_debug_pix_. -const char* kTextordDebugPix = "psdebug_pix"; - -// Name of image file to use if textord_debug_images is true. -STRING AlignedBlob::textord_debug_pix_ = kTextordDebugPix; -// Index to image file to use if textord_debug_images is true. -int AlignedBlob::debug_pix_index_ = 0; - -// Increment the serial number counter and set the string to use -// for a filename if textord_debug_images is true. -void AlignedBlob::IncrementDebugPix() { - ++debug_pix_index_; - textord_debug_pix_ = kTextordDebugPix; - char numbuf[32]; - snprintf(numbuf, sizeof(numbuf), "%d", debug_pix_index_); - textord_debug_pix_ += numbuf; - textord_debug_pix_ += ".pix"; -} - // Constructor to set the parameters for finding aligned and ragged tabs. // Vertical_x and vertical_y are the current estimates of the true vertical // direction (up) in the image. Height is the height of the starter blob. 
@@ -188,7 +168,7 @@ ScrollView* AlignedBlob::DisplayTabs(const char* window_name, gsearch.StartFullSearch(); BLOBNBOX* bbox; while ((bbox = gsearch.NextFullSearch()) != NULL) { - TBOX box = bbox->bounding_box(); + const TBOX& box = bbox->bounding_box(); int left_x = box.left(); int right_x = box.right(); int top_y = box.top(); diff --git a/textord/alignedblob.h b/textord/alignedblob.h index cbc727a1..3930e955 100644 --- a/textord/alignedblob.h +++ b/textord/alignedblob.h @@ -18,8 +18,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_TEXTORD_ALIGNEDBLOB_H__ -#define TESSERACT_TEXTORD_ALIGNEDBLOB_H__ +#ifndef TESSERACT_TEXTORD_ALIGNEDBLOB_H_ +#define TESSERACT_TEXTORD_ALIGNEDBLOB_H_ #include "bbgrid.h" #include "blobbox.h" @@ -29,8 +29,6 @@ extern INT_VAR_H(textord_debug_bugs, 0, "Turn on output related to bugs in tab finding"); extern INT_VAR_H(textord_debug_tabfind, 2, "Debug tab finding"); -extern BOOL_VAR_H(textord_debug_images, false, - "Use greyed image background for debug"); extern BOOL_VAR_H(textord_debug_printable, false, "Make debug windows printable"); @@ -102,17 +100,6 @@ class AlignedBlob : public BlobGrid { BLOBNBOX* bbox, int* vertical_x, int* vertical_y); - // Increment the serial number counter and set the string to use - // for a filename if textord_debug_images is true. - static void IncrementDebugPix(); - - // Return the string to use for a filename if textord_debug_images is true. - // Use IncrementDebugPix first to set the filename, and each time is - // to be incremented. - static const STRING& textord_debug_pix() { - return textord_debug_pix_; - } - private: // Find a set of blobs that are aligned in the given vertical // direction with the given blob. 
Returns a list of aligned @@ -132,14 +119,8 @@ class AlignedBlob : public BlobGrid { BLOBNBOX* FindAlignedBlob(const AlignedBlobParams& p, bool top_to_bottom, BLOBNBOX* bbox, int x_start, int* end_y); - - // Name of image file to use if textord_debug_images is true. - static STRING textord_debug_pix_; - // Index to image file to use if textord_debug_images is true. - static int debug_pix_index_; }; } // namespace tesseract. -#endif // TESSERACT_TEXTORD_ALIGNEDBLOB_H__ - +#endif // TESSERACT_TEXTORD_ALIGNEDBLOB_H_ diff --git a/textord/baselinedetect.cpp b/textord/baselinedetect.cpp index a2b01739..d90f2b1a 100644 --- a/textord/baselinedetect.cpp +++ b/textord/baselinedetect.cpp @@ -782,11 +782,9 @@ double BaselineBlock::FitLineSpacingModel( return rms_error; } - BaselineDetect::BaselineDetect(int debug_level, const FCOORD& page_skew, TO_BLOCK_LIST* blocks) - : page_skew_(page_skew), debug_level_(debug_level), pix_debug_(NULL), - debug_file_prefix_("") { + : page_skew_(page_skew), debug_level_(debug_level) { TO_BLOCK_IT it(blocks); for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { TO_BLOCK* to_block = it.data(); @@ -804,7 +802,6 @@ BaselineDetect::BaselineDetect(int debug_level, const FCOORD& page_skew, } BaselineDetect::~BaselineDetect() { - pixDestroy(&pix_debug_); } // Finds the initial baselines for each TO_ROW in each TO_BLOCK, gathers @@ -847,30 +844,15 @@ void BaselineDetect::ComputeBaselineSplinesAndXheights(const ICOORD& page_tr, bool remove_noise, bool show_final_rows, Textord* textord) { - Pix* pix_spline = pix_debug_ ? 
pixConvertTo32(pix_debug_) : NULL; for (int i = 0; i < blocks_.size(); ++i) { BaselineBlock* bl_block = blocks_[i]; - bl_block->PrepareForSplineFitting(page_tr, remove_noise); + if (enable_splines) + bl_block->PrepareForSplineFitting(page_tr, remove_noise); bl_block->FitBaselineSplines(enable_splines, show_final_rows, textord); - if (pix_spline) { - bl_block->DrawPixSpline(pix_spline); - } if (show_final_rows) { bl_block->DrawFinalRows(page_tr); } } - - if (pix_spline) { - STRING outfile_name = debug_file_prefix_ + "_spline.png"; - pixWrite(outfile_name.string(), pix_spline, IFF_PNG); - pixDestroy(&pix_spline); - } -} - -void BaselineDetect::SetDebugImage(Pix* pixIn, const STRING& output_path) { - pixDestroy(&pix_debug_); - pix_debug_ = pixClone(pixIn); - debug_file_prefix_ = output_path; } } // namespace tesseract. diff --git a/textord/baselinedetect.h b/textord/baselinedetect.h index 7a47931f..8d11bdff 100644 --- a/textord/baselinedetect.h +++ b/textord/baselinedetect.h @@ -262,10 +262,6 @@ class BaselineDetect { bool show_final_rows, Textord* textord); - // Set up the image and filename, so that a debug image with the detected - // baseline rendered will be saved. - void SetDebugImage(Pix* pixIn, const STRING& output_path); - private: // Average (median) skew of the blocks on the page among those that have // a good angle of their own. @@ -274,9 +270,6 @@ class BaselineDetect { int debug_level_; // The blocks that we are working with. PointerVector blocks_; - - Pix* pix_debug_; - STRING debug_file_prefix_; }; } // namespace tesseract diff --git a/textord/bbgrid.cpp b/textord/bbgrid.cpp index 06114748..4cadcdcf 100644 --- a/textord/bbgrid.cpp +++ b/textord/bbgrid.cpp @@ -231,7 +231,7 @@ Pix* GridReducedPix(const TBOX& box, int gridsize, // Note that the Pix is used upside-down, with (0, 0) being the bottom-left. 
Pix* TraceOutlineOnReducedPix(C_OUTLINE* outline, int gridsize, ICOORD bleft, int* left, int* bottom) { - TBOX box = outline->bounding_box(); + const TBOX& box = outline->bounding_box(); Pix* pix = GridReducedPix(box, gridsize, bleft, left, bottom); int wpl = pixGetWpl(pix); l_uint32* data = pixGetData(pix); @@ -257,7 +257,7 @@ Pix* TraceOutlineOnReducedPix(C_OUTLINE* outline, int gridsize, // As TraceOutlineOnReducedPix above, but on a BLOCK instead of a C_OUTLINE. Pix* TraceBlockOnReducedPix(BLOCK* block, int gridsize, ICOORD bleft, int* left, int* bottom) { - TBOX box = block->bounding_box(); + const TBOX& box = block->bounding_box(); Pix* pix = GridReducedPix(box, gridsize, bleft, left, bottom); int wpl = pixGetWpl(pix); l_uint32* data = pixGetData(pix); diff --git a/textord/bbgrid.h b/textord/bbgrid.h index d16b902e..fb175efb 100644 --- a/textord/bbgrid.h +++ b/textord/bbgrid.h @@ -18,12 +18,13 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_TEXTORD_BBGRID_H__ -#define TESSERACT_TEXTORD_BBGRID_H__ +#ifndef TESSERACT_TEXTORD_BBGRID_H_ +#define TESSERACT_TEXTORD_BBGRID_H_ + +#include #include "clst.h" #include "coutln.h" -#include "hashfn.h" #include "rect.h" #include "scrollview.h" @@ -364,7 +365,7 @@ template class GridSearch { // An iterator over the list at (x_, y_) in the grid_. BBC_C_IT it_; // Set of unique returned elements used when unique_mode_ is true. - unordered_set > returns_; + std::unordered_set > returns_; }; // Sort function to sort a BBC by bounding_box().left(). @@ -623,7 +624,7 @@ void BBGrid::DisplayBoxes(ScrollView* tab_win) { gsearch.StartFullSearch(); BBC* bbox; while ((bbox = gsearch.NextFullSearch()) != NULL) { - TBOX box = bbox->bounding_box(); + const TBOX& box = bbox->bounding_box(); int left_x = box.left(); int right_x = box.right(); int top_y = box.top(); @@ -958,4 +959,4 @@ void GridSearch::SetIterator() { } // namespace tesseract. 
-#endif // TESSERACT_TEXTORD_BBGRID_H__ +#endif // TESSERACT_TEXTORD_BBGRID_H_ diff --git a/textord/blkocc.h b/textord/blkocc.h index 89462dc8..f27bb9a5 100644 --- a/textord/blkocc.h +++ b/textord/blkocc.h @@ -52,9 +52,8 @@ class REGION_OCC:public ELIST_LINK float max_x; //Highest x in region inT16 region_type; //Type of crossing - REGION_OCC() { - }; //constructor used - //only in COPIER etc + REGION_OCC() {} // constructor used + // only in COPIER etc REGION_OCC( //constructor float min, float max, @@ -72,12 +71,12 @@ ELISTIZEH (REGION_OCC) Adapted from the following procedure so that it can be used in the bands class in an include file... -BOOL8 range_in_band[ +BOOL8 range_in_band[ range within band? -inT16 band_max, -inT16 band_min, -inT16 range_max, -inT16 range_min] +inT16 band_max, +inT16 band_min, +inT16 range_max, +inT16 range_min] { if ( (range_min >= band_min) && (range_max < band_max) ) return TRUE; @@ -91,12 +90,12 @@ inT16 range_min] Adapted from the following procedure so that it can be used in the bands class in an include file... -BOOL8 range_overlaps_band[ +BOOL8 range_overlaps_band[ range crosses band? -inT16 band_max, -inT16 band_min, -inT16 range_max, -inT16 range_min] +inT16 band_max, +inT16 band_min, +inT16 range_max, +inT16 range_min] { if ( (range_max >= band_min) && (range_min < band_max) ) return TRUE; diff --git a/textord/ccnontextdetect.cpp b/textord/ccnontextdetect.cpp index 1cb0e4c6..f6a7d8f4 100644 --- a/textord/ccnontextdetect.cpp +++ b/textord/ccnontextdetect.cpp @@ -305,7 +305,7 @@ bool CCNonTextDetect::BlobOverlapsTooMuch(BLOBNBOX* blob, int max_overlaps) { // Search the grid to see what intersects it. // Setup a Rectangle search for overlapping this blob. 
BlobGridSearch rsearch(this); - TBOX box = blob->bounding_box(); + const TBOX& box = blob->bounding_box(); rsearch.StartRectSearch(box); rsearch.SetUniqueMode(true); BLOBNBOX* neighbour; diff --git a/textord/colfind.cpp b/textord/colfind.cpp index ae48e9ff..44e27242 100644 --- a/textord/colfind.cpp +++ b/textord/colfind.cpp @@ -290,8 +290,8 @@ void ColumnFinder::CorrectOrientation(TO_BLOCK* block, int ColumnFinder::FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color, int scaled_factor, TO_BLOCK* input_block, Pix* photo_mask_pix, Pix* thresholds_pix, - Pix* grey_pix, BLOCK_LIST* blocks, - BLOBNBOX_LIST* diacritic_blobs, + Pix* grey_pix, DebugPixa* pixa_debug, + BLOCK_LIST* blocks, BLOBNBOX_LIST* diacritic_blobs, TO_BLOCK_LIST* to_blocks) { pixOr(photo_mask_pix, photo_mask_pix, nontext_map_); stroke_width_->FindLeaderPartitions(input_block, &part_grid_); @@ -304,11 +304,13 @@ int ColumnFinder::FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color, &projection_, diacritic_blobs, &part_grid_, &big_parts_); if (!PSM_SPARSE(pageseg_mode)) { ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_, - input_block, this, &part_grid_, &big_parts_); + input_block, this, pixa_debug, &part_grid_, + &big_parts_); ImageFind::TransferImagePartsToImageMask(rerotate_, &part_grid_, photo_mask_pix); ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_, - input_block, this, &part_grid_, &big_parts_); + input_block, this, pixa_debug, &part_grid_, + &big_parts_); } part_grid_.ReTypeBlobs(&image_bblobs_); TidyBlobs(input_block); @@ -441,9 +443,6 @@ int ColumnFinder::FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color, if (textord_tabfind_show_partitions) { ScrollView* window = MakeWindow(400, 300, "Partitions"); if (window != NULL) { - if (textord_debug_images) - window->Image(AlignedBlob::textord_debug_pix().string(), - image_origin().x(), image_origin().y()); part_grid_.DisplayBoxes(window); if (!textord_debug_printable) DisplayTabVectors(window); @@ 
-519,11 +518,7 @@ void ColumnFinder::DisplayBlocks(BLOCK_LIST* blocks) { blocks_win_ = MakeWindow(700, 300, "Blocks"); else blocks_win_->Clear(); - if (textord_debug_images) - blocks_win_->Image(AlignedBlob::textord_debug_pix().string(), - image_origin().x(), image_origin().y()); - else - DisplayBoxes(blocks_win_); + DisplayBoxes(blocks_win_); BLOCK_IT block_it(blocks); int serial = 1; for (block_it.mark_cycle_pt(); !block_it.cycled_list(); @@ -543,11 +538,7 @@ void ColumnFinder::DisplayBlocks(BLOCK_LIST* blocks) { void ColumnFinder::DisplayColumnBounds(PartSetVector* sets) { #ifndef GRAPHICS_DISABLED ScrollView* col_win = MakeWindow(50, 300, "Columns"); - if (textord_debug_images) - col_win->Image(AlignedBlob::textord_debug_pix().string(), - image_origin().x(), image_origin().y()); - else - DisplayBoxes(col_win); + DisplayBoxes(col_win); col_win->Pen(textord_debug_printable ? ScrollView::BLUE : ScrollView::GREEN); for (int i = 0; i < gridheight_; ++i) { ColPartitionSet* columns = best_columns_[i]; diff --git a/textord/colfind.h b/textord/colfind.h index dc40cbb0..1918f41b 100644 --- a/textord/colfind.h +++ b/textord/colfind.h @@ -17,14 +17,15 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_TEXTORD_COLFIND_H__ -#define TESSERACT_TEXTORD_COLFIND_H__ +#ifndef TESSERACT_TEXTORD_COLFIND_H_ +#define TESSERACT_TEXTORD_COLFIND_H_ -#include "tabfind.h" -#include "imagefind.h" #include "colpartitiongrid.h" #include "colpartitionset.h" +#include "debugpixa.h" +#include "imagefind.h" #include "ocrblock.h" +#include "tabfind.h" #include "textlineprojection.h" class BLOCK_LIST; @@ -163,7 +164,7 @@ class ColumnFinder : public TabFind { // in debug mode, which requests a retry with more debug info. 
int FindBlocks(PageSegMode pageseg_mode, Pix* scaled_color, int scaled_factor, TO_BLOCK* block, Pix* photo_mask_pix, Pix* thresholds_pix, - Pix* grey_pix, BLOCK_LIST* blocks, + Pix* grey_pix, DebugPixa* pixa_debug, BLOCK_LIST* blocks, BLOBNBOX_LIST* diacritic_blobs, TO_BLOCK_LIST* to_blocks); // Get the rotation required to deskew, and its inverse rotation. @@ -365,4 +366,4 @@ class ColumnFinder : public TabFind { } // namespace tesseract. -#endif // TESSERACT_TEXTORD_COLFIND_H__ +#endif // TESSERACT_TEXTORD_COLFIND_H_ diff --git a/textord/colpartition.cpp b/textord/colpartition.cpp index effb5a91..0d0b4ca3 100644 --- a/textord/colpartition.cpp +++ b/textord/colpartition.cpp @@ -918,7 +918,7 @@ void ColPartition::ComputeLimits() { for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { bbox = it.data(); if (non_leader_count == 0 || bbox->flow() != BTFT_LEADER) { - TBOX box = bbox->bounding_box(); + const TBOX& box = bbox->bounding_box(); int area = box.area(); top_stats.add(box.top(), area); bottom_stats.add(box.bottom(), area); @@ -1181,8 +1181,8 @@ bool ColPartition::MarkAsLeaderIfMonospaced() { if (best_end == NULL) { tprintf("No path\n"); } else { - tprintf("Total cost = %d vs allowed %d\n", - best_end->total_cost(), blob_count); + tprintf("Total cost = %d vs allowed %d\n", best_end->total_cost(), + blob_count); } } delete [] projection; @@ -1632,6 +1632,10 @@ TO_BLOCK* ColPartition::MakeBlock(const ICOORD& bleft, const ICOORD& tright, ColPartition_LIST* used_parts) { if (block_parts->empty()) return NULL; // Nothing to do. + // If the block_parts are not in reading order, then it will make an invalid + // block polygon and bounding_box, so sort by bounding box now just to make + // sure. 
+ block_parts->sort(&ColPartition::SortByBBox); ColPartition_IT it(block_parts); ColPartition* part = it.data(); PolyBlockType type = part->type(); @@ -2121,7 +2125,7 @@ void ColPartition::RefinePartnersByOverlap(bool upper, // Return true if bbox belongs better in this than other. bool ColPartition::ThisPartitionBetter(BLOBNBOX* bbox, const ColPartition& other) { - TBOX box = bbox->bounding_box(); + const TBOX& box = bbox->bounding_box(); // Margins take priority. int left = box.left(); int right = box.right(); diff --git a/textord/colpartition.h b/textord/colpartition.h index 5c941cce..ed126299 100644 --- a/textord/colpartition.h +++ b/textord/colpartition.h @@ -18,8 +18,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_TEXTORD_COLPARTITION_H__ -#define TESSERACT_TEXTORD_COLPARTITION_H__ +#ifndef TESSERACT_TEXTORD_COLPARTITION_H_ +#define TESSERACT_TEXTORD_COLPARTITION_H_ #include "bbgrid.h" #include "blobbox.h" // For BlobRegionType. @@ -704,6 +704,25 @@ class ColPartition : public ELIST2_LINK { // doing a SideSearch when you want things in the same page column. bool IsInSameColumnAs(const ColPartition& part) const; + // Sort function to sort by bounding box. + static int SortByBBox(const void* p1, const void* p2) { + const ColPartition* part1 = + *reinterpret_cast(p1); + const ColPartition* part2 = + *reinterpret_cast(p2); + int mid_y1 = part1->bounding_box_.y_middle(); + int mid_y2 = part2->bounding_box_.y_middle(); + if ((part2->bounding_box_.bottom() <= mid_y1 && + mid_y1 <= part2->bounding_box_.top()) || + (part1->bounding_box_.bottom() <= mid_y2 && + mid_y2 <= part1->bounding_box_.top())) { + // Sort by increasing x. + return part1->bounding_box_.x_middle() - part2->bounding_box_.x_middle(); + } + // Sort by decreasing y. + return mid_y2 - mid_y1; + } + // Sets the column bounds. Primarily used in testing. 
void set_first_column(int column) { first_column_ = column; @@ -914,4 +933,4 @@ typedef GridSearchbounding_box(); + const TBOX& nbox = neighbour->bounding_box(); if (nbox.contains(click)) { tprintf("Block box:"); neighbour->bounding_box().print(); @@ -645,46 +645,6 @@ bool ColPartitionGrid::GridSmoothNeighbours(BlobTextFlowType source_type, return any_changed; } -// Compute the mean RGB of the light and dark pixels in each ColPartition -// and also the rms error in the linearity of color. -void ColPartitionGrid::ComputePartitionColors(Pix* scaled_color, - int scaled_factor, - const FCOORD& rerotation) { - if (scaled_color == NULL) - return; - Pix* color_map1 = NULL; - Pix* color_map2 = NULL; - Pix* rms_map = NULL; - if (textord_tabfind_show_color_fit) { - int width = pixGetWidth(scaled_color); - int height = pixGetHeight(scaled_color); - color_map1 = pixCreate(width, height, 32); - color_map2 = pixCreate(width, height, 32); - rms_map = pixCreate(width, height, 8); - } - // Iterate the ColPartitions in the grid. - ColPartitionGridSearch gsearch(this); - gsearch.StartFullSearch(); - ColPartition* part; - while ((part = gsearch.NextFullSearch()) != NULL) { - TBOX part_box = part->bounding_box(); - part_box.rotate_large(rerotation); - ImageFind::ComputeRectangleColors(part_box, scaled_color, - scaled_factor, - color_map1, color_map2, rms_map, - part->color1(), part->color2()); - } - if (color_map1 != NULL) { - pixWrite("swcolorinput.png", scaled_color, IFF_PNG); - pixWrite("swcolor1.png", color_map1, IFF_PNG); - pixWrite("swcolor2.png", color_map2, IFF_PNG); - pixWrite("swrms.png", rms_map, IFF_PNG); - pixDestroy(&color_map1); - pixDestroy(&color_map2); - pixDestroy(&rms_map); - } -} - // Reflects the grid and its colpartitions in the y-axis, assuming that // all blob boxes have already been done. 
void ColPartitionGrid::ReflectInYAxis() { @@ -1037,7 +997,7 @@ void ColPartitionGrid::ListFindMargins(ColPartitionSet** best_columns, ColPartition* part = part_it.data(); ColPartitionSet* columns = NULL; if (best_columns != NULL) { - TBOX part_box = part->bounding_box(); + const TBOX& part_box = part->bounding_box(); // Get the columns from the y grid coord. int grid_x, grid_y; GridCoords(part_box.left(), part_box.bottom(), &grid_x, &grid_y); @@ -1376,7 +1336,7 @@ void ColPartitionGrid::FindMergeCandidates(const ColPartition* part, // combined box to see if anything else is inappropriately overlapped. if (!part_box.contains(c_box) && !c_box.contains(part_box)) { // Search the combined rectangle to see if anything new is overlapped. - // This is a preliminary test designed to quickly weed-out stupid + // This is a preliminary test designed to quickly weed-out poor // merge candidates that would create a big list of overlapped objects // for the squared-order overlap analysis. Eg. vertical and horizontal // line-like objects that overlap real text when merged: @@ -1569,7 +1529,7 @@ BlobRegionType ColPartitionGrid::SmoothInOneDirection( const TBOX& im_box, const FCOORD& rerotation, bool debug, const ColPartition& part, int* best_distance) { // Set up a rectangle search bounded by the part. 
- TBOX part_box = part.bounding_box(); + const TBOX& part_box = part.bounding_box(); TBOX search_box; ICOORD dist_scaling; ComputeSearchBoxAndScaling(direction, part_box, gridsize(), @@ -1619,10 +1579,10 @@ BlobRegionType ColPartitionGrid::SmoothInOneDirection( image_bias - htext_score >= kSmoothDecisionMargin && image_bias - vtext_score >= kSmoothDecisionMargin) { *best_distance = dists[NPT_IMAGE][0]; - if (dists[NPT_WEAK_VTEXT].size() > 0 && + if (!dists[NPT_WEAK_VTEXT].empty() && *best_distance > dists[NPT_WEAK_VTEXT][0]) *best_distance = dists[NPT_WEAK_VTEXT][0]; - if (dists[NPT_WEAK_HTEXT].size() > 0 && + if (!dists[NPT_WEAK_HTEXT].empty() && *best_distance > dists[NPT_WEAK_HTEXT][0]) *best_distance = dists[NPT_WEAK_HTEXT][0]; return BRT_POLYIMAGE; diff --git a/textord/colpartitiongrid.h b/textord/colpartitiongrid.h index 42f7d4ed..1e5b756e 100644 --- a/textord/colpartitiongrid.h +++ b/textord/colpartitiongrid.h @@ -17,8 +17,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_TEXTORD_COLPARTITIONGRID_H__ -#define TESSERACT_TEXTORD_COLPARTITIONGRID_H__ +#ifndef TESSERACT_TEXTORD_COLPARTITIONGRID_H_ +#define TESSERACT_TEXTORD_COLPARTITIONGRID_H_ #include "bbgrid.h" #include "colpartition.h" @@ -106,11 +106,6 @@ class ColPartitionGrid : public BBGridboxa->box[i]; Pix* word_pix = pixClipRectangle(orig_pix_, box, NULL); ASSERT_HOST(word_pix); @@ -161,9 +159,9 @@ bool ShiroRekhaSplitter::Split(bool split_for_pageseg) { } boxaDestroy(®ions_to_clear); pixaDestroy(&ccs); - if (devanagari_split_debugimage) { - DumpDebugImage(split_for_pageseg ? "pageseg_split_debug.png" : - "ocr_split_debug.png"); + if (devanagari_split_debugimage && pixa_debug != nullptr) { + pixa_debug->AddPix(debug_image_, + split_for_pageseg ? 
"pageseg_split" : "ocr_split"); } return true; } diff --git a/textord/devanagari_processing.h b/textord/devanagari_processing.h index 990a5dfe..f95ef25a 100644 --- a/textord/devanagari_processing.h +++ b/textord/devanagari_processing.h @@ -1,9 +1,20 @@ // Copyright 2008 Google Inc. All Rights Reserved. // Author: shobhitsaxena@google.com (Shobhit Saxena) +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #ifndef TESSERACT_TEXTORD_DEVNAGARI_PROCESSING_H_ #define TESSERACT_TEXTORD_DEVNAGARI_PROCESSING_H_ +#include "allheaders.h" +#include "debugpixa.h" #include "ocrblock.h" #include "params.h" @@ -41,9 +52,7 @@ class PixelHistogram { length_ = 0; } - int* hist() const { - return hist_; - } + int* hist() const { return hist_; } int length() const { return length_; @@ -77,7 +86,7 @@ class ShiroRekhaSplitter { // Returns true if a split was actually performed. // If split_for_pageseg is true, the pageseg_split_strategy_ is used for // splitting. If false, the ocr_split_strategy_ is used. - bool Split(bool split_for_pageseg); + bool Split(bool split_for_pageseg, DebugPixa* pixa_debug); // Clears the memory held by this object. void Clear(); @@ -145,9 +154,6 @@ class ShiroRekhaSplitter { return segmentation_block_list_; } - // This method dumps a debug image to the specified location. - void DumpDebugImage(const char* filename) const; - // This method returns the computed mode-height of blobs in the pix. // It also prunes very small blobs from calculation. 
Could be used to provide // a global xheight estimate for images which have the same point-size text. diff --git a/textord/drawedg.h b/textord/drawedg.h index 6bf062d4..ef5ed5e2 100644 --- a/textord/drawedg.h +++ b/textord/drawedg.h @@ -1,8 +1,9 @@ /********************************************************************** * File: drawedg.h (Formerly drawedge.h) - * Description: Collection of functions to draw things to do with edge detection. - * Author: Ray Smith - * Created: Thu Jun 06 13:29:20 BST 1991 + * Description: Collection of functions to draw things to do with edge + *detection. + * Author: Ray Smith + * Created: Thu Jun 06 13:29:20 BST 1991 * * (C) Copyright 1991, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); diff --git a/textord/equationdetectbase.h b/textord/equationdetectbase.h index d47c74a5..06a26cc7 100644 --- a/textord/equationdetectbase.h +++ b/textord/equationdetectbase.h @@ -17,8 +17,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_TEXTORD_EQUATIONDETECTBASE_H__ -#define TESSERACT_TEXTORD_EQUATIONDETECTBASE_H__ +#ifndef TESSERACT_TEXTORD_EQUATIONDETECTBASE_H_ +#define TESSERACT_TEXTORD_EQUATIONDETECTBASE_H_ class BLOBNBOX_LIST; class TO_BLOCK; @@ -56,4 +56,4 @@ class EquationDetectBase { }; // namespace tesseract -#endif // TESSERACT_TEXTORD_EQUATIONDETECTBASE_H__ +#endif // TESSERACT_TEXTORD_EQUATIONDETECTBASE_H_ diff --git a/textord/fpchop.cpp b/textord/fpchop.cpp index 4c18338b..699d4196 100644 --- a/textord/fpchop.cpp +++ b/textord/fpchop.cpp @@ -1,8 +1,8 @@ /********************************************************************** * File: fpchop.cpp (Formerly fp_chop.c) * Description: Code to chop fixed pitch text into character cells. - * Author: Ray Smith - * Created: Thu Sep 16 11:14:15 BST 1993 + * Author: Ray Smith + * Created: Thu Sep 16 11:14:15 BST 1993 * * (C) Copyright 1993, Hewlett-Packard Ltd. 
** Licensed under the Apache License, Version 2.0 (the "License"); @@ -259,8 +259,8 @@ void split_to_blob( //split the blob pitch_error, left_coutlines, right_coutlines); - if (blob != NULL) - delete blob; //free it + + delete blob; } /********************************************************************** @@ -730,7 +730,6 @@ C_OUTLINE *join_chopped_fragments( //join pieces return NULL; } - /********************************************************************** * join_segments * diff --git a/textord/gap_map.cpp b/textord/gap_map.cpp index 2f8440e6..42120878 100644 --- a/textord/gap_map.cpp +++ b/textord/gap_map.cpp @@ -1,3 +1,12 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. #include "statistc.h" #include "gap_map.h" diff --git a/textord/gap_map.h b/textord/gap_map.h index 914e8dbd..227db364 100644 --- a/textord/gap_map.h +++ b/textord/gap_map.h @@ -1,3 +1,12 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
#ifndef GAP_MAP_H #define GAP_MAP_H diff --git a/textord/imagefind.cpp b/textord/imagefind.cpp index c119e69f..ad730bcc 100644 --- a/textord/imagefind.cpp +++ b/textord/imagefind.cpp @@ -62,7 +62,8 @@ const int kNoisePadding = 4; // the image regions as a mask image. // The returned pix may be NULL, meaning no images found. // If not NULL, it must be PixDestroyed by the caller. -Pix* ImageFind::FindImages(Pix* pix) { +// If textord_tabfind_show_images, debug images are appended to pixa_debug. +Pix* ImageFind::FindImages(Pix* pix, DebugPixa* pixa_debug) { // Not worth looking at small images. if (pixGetWidth(pix) < kMinImageFindSize || pixGetHeight(pix) < kMinImageFindSize) @@ -70,14 +71,15 @@ Pix* ImageFind::FindImages(Pix* pix) { // Reduce by factor 2. Pix *pixr = pixReduceRankBinaryCascade(pix, 1, 0, 0, 0); - pixDisplayWrite(pixr, textord_tabfind_show_images); + if (textord_tabfind_show_images && pixa_debug != nullptr) + pixa_debug->AddPix(pixr, "CascadeReduced"); // Get the halftone mask directly from Leptonica. // // Leptonica will print an error message and return NULL if we call // pixGenHalftoneMask(pixr, NULL, ...) with too small image, so we // want to bypass that. - if (pixGetWidth(pixr) < kMinImageFindSize || + if (pixGetWidth(pixr) < kMinImageFindSize || pixGetHeight(pixr) < kMinImageFindSize) { pixDestroy(&pixr); return pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1); @@ -93,7 +95,8 @@ Pix* ImageFind::FindImages(Pix* pix) { // Expand back up again. Pix *pixht = pixExpandReplicate(pixht2, 2); - pixDisplayWrite(pixht, textord_tabfind_show_images); + if (textord_tabfind_show_images && pixa_debug != nullptr) + pixa_debug->AddPix(pixht, "HalftoneReplicated"); pixDestroy(&pixht2); // Fill to capture pixels near the mask edges that were missed @@ -104,14 +107,16 @@ Pix* ImageFind::FindImages(Pix* pix) { // Eliminate lines and bars that may be joined to images. 
Pix* pixfinemask = pixReduceRankBinaryCascade(pixht, 1, 1, 3, 3); pixDilateBrick(pixfinemask, pixfinemask, 5, 5); - pixDisplayWrite(pixfinemask, textord_tabfind_show_images); + if (textord_tabfind_show_images && pixa_debug != nullptr) + pixa_debug->AddPix(pixfinemask, "FineMask"); Pix* pixreduced = pixReduceRankBinaryCascade(pixht, 1, 1, 1, 1); Pix* pixreduced2 = pixReduceRankBinaryCascade(pixreduced, 3, 3, 3, 0); pixDestroy(&pixreduced); pixDilateBrick(pixreduced2, pixreduced2, 5, 5); Pix* pixcoarsemask = pixExpandReplicate(pixreduced2, 8); pixDestroy(&pixreduced2); - pixDisplayWrite(pixcoarsemask, textord_tabfind_show_images); + if (textord_tabfind_show_images && pixa_debug != nullptr) + pixa_debug->AddPix(pixcoarsemask, "CoarseMask"); // Combine the coarse and fine image masks. pixAnd(pixcoarsemask, pixcoarsemask, pixfinemask); pixDestroy(&pixfinemask); @@ -119,13 +124,13 @@ Pix* ImageFind::FindImages(Pix* pix) { pixDilateBrick(pixcoarsemask, pixcoarsemask, 3, 3); Pix* pixmask = pixExpandReplicate(pixcoarsemask, 16); pixDestroy(&pixcoarsemask); - if (textord_tabfind_show_images) - pixWrite("junkexpandedcoarsemask.png", pixmask, IFF_PNG); + if (textord_tabfind_show_images && pixa_debug != nullptr) + pixa_debug->AddPix(pixmask, "MaskDilated"); // And the image mask with the line and bar remover. pixAnd(pixht, pixht, pixmask); pixDestroy(&pixmask); - if (textord_tabfind_show_images) - pixWrite("junkfinalimagemask.png", pixht, IFF_PNG); + if (textord_tabfind_show_images && pixa_debug != nullptr) + pixa_debug->AddPix(pixht, "FinalMask"); // Make the result image the same size as the input. Pix* result = pixCreate(pixGetWidth(pix), pixGetHeight(pix), 1); pixOr(result, result, pixht); @@ -140,22 +145,25 @@ Pix* ImageFind::FindImages(Pix* pix) { // If not NULL, they must be destroyed by the caller. // Resolution of pix should match the source image (Tesseract::pix_binary_) // so the output coordinate systems match. 
-void ImageFind::ConnCompAndRectangularize(Pix* pix, Boxa** boxa, Pixa** pixa) { +void ImageFind::ConnCompAndRectangularize(Pix* pix, DebugPixa* pixa_debug, + Boxa** boxa, Pixa** pixa) { *boxa = NULL; *pixa = NULL; - if (textord_tabfind_show_images) - pixWrite("junkconncompimage.png", pix, IFF_PNG); + if (textord_tabfind_show_images && pixa_debug != nullptr) + pixa_debug->AddPix(pix, "Conncompimage"); // Find the individual image regions in the mask image. *boxa = pixConnComp(pix, pixa, 8); // Rectangularize the individual images. If a sharp edge in vertical and/or // horizontal occupancy can be found, it indicates a probably rectangular // image with unwanted bits merged on, so clip to the approximate rectangle. - int npixes = pixaGetCount(*pixa); + int npixes = 0; + if (*boxa != nullptr && *pixa != nullptr) npixes = pixaGetCount(*pixa); for (int i = 0; i < npixes; ++i) { int x_start, x_end, y_start, y_end; Pix* img_pix = pixaGetPix(*pixa, i, L_CLONE); - pixDisplayWrite(img_pix, textord_tabfind_show_images); + if (textord_tabfind_show_images && pixa_debug != nullptr) + pixa_debug->AddPix(img_pix, "A component"); if (pixNearlyRectangular(img_pix, kMinRectangularFraction, kMaxRectangularFraction, kMaxRectangularGradient, @@ -1115,7 +1123,7 @@ static bool TestWeakIntersectedPart(const TBOX& im_box, ColPartition* part) { if (part->flow() < BTFT_STRONG_CHAIN) { // A weak partition intersects the box. - TBOX part_box = part->bounding_box(); + const TBOX& part_box = part->bounding_box(); if (im_box.contains(part_box)) { int area = part_box.area(); int intersect_area = IntersectArea(part_box, part_list); @@ -1180,7 +1188,7 @@ static bool ScanForOverlappingText(ColPartitionGrid* part_grid, TBOX* box) { part->flow() == BTFT_STRONG_CHAIN) { // Text intersects the box. 
any_text_in_padded_rect = true; - TBOX part_box = part->bounding_box(); + const TBOX& part_box = part->bounding_box(); if (box->overlap(part_box)) { return true; } @@ -1281,19 +1289,18 @@ static void DeleteSmallImages(ColPartitionGrid* part_grid) { // Since the other blobs in the other partitions will be owned by the block, // ColPartitionGrid::ReTypeBlobs must be called afterwards to fix this // situation and collect the image blobs. -void ImageFind::FindImagePartitions(Pix* image_pix, - const FCOORD& rotation, - const FCOORD& rerotation, - TO_BLOCK* block, - TabFind* tab_grid, - ColPartitionGrid* part_grid, - ColPartition_LIST* big_parts) { +void ImageFind::FindImagePartitions(Pix* image_pix, const FCOORD& rotation, + const FCOORD& rerotation, TO_BLOCK* block, + TabFind* tab_grid, DebugPixa* pixa_debug, + ColPartitionGrid* part_grid, + ColPartition_LIST* big_parts) { int imageheight = pixGetHeight(image_pix); Boxa* boxa; Pixa* pixa; - ConnCompAndRectangularize(image_pix, &boxa, &pixa); + ConnCompAndRectangularize(image_pix, pixa_debug, &boxa, &pixa); // Iterate the connected components in the image regions mask. 
- int nboxes = boxaGetCount(boxa); + int nboxes = 0; + if (boxa != nullptr && pixa != nullptr) nboxes = boxaGetCount(boxa); for (int i = 0; i < nboxes; ++i) { l_int32 x, y, width, height; boxaGetBoxGeometry(boxa, i, &x, &y, &width, &height); @@ -1305,8 +1312,8 @@ void ImageFind::FindImagePartitions(Pix* image_pix, ColPartition_LIST part_list; DivideImageIntoParts(im_box, rotation, rerotation, pix, &rectsearch, &part_list); - if (textord_tabfind_show_images) { - pixWrite("junkimagecomponent.png", pix, IFF_PNG); + if (textord_tabfind_show_images && pixa_debug != nullptr) { + pixa_debug->AddPix(pix, "ImageComponent"); tprintf("Component has %d parts\n", part_list.length()); } pixDestroy(&pix); diff --git a/textord/imagefind.h b/textord/imagefind.h index 1bf408eb..479653a1 100644 --- a/textord/imagefind.h +++ b/textord/imagefind.h @@ -18,9 +18,10 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_TEXTORD_IMAGEFIND_H__ -#define TESSERACT_TEXTORD_IMAGEFIND_H__ +#ifndef TESSERACT_TEXTORD_IMAGEFIND_H_ +#define TESSERACT_TEXTORD_IMAGEFIND_H_ +#include "debugpixa.h" #include "host.h" struct Boxa; @@ -45,7 +46,8 @@ class ImageFind { // the image regions as a mask image. // The returned pix may be NULL, meaning no images found. // If not NULL, it must be PixDestroyed by the caller. - static Pix* FindImages(Pix* pix); + // If textord_tabfind_show_images, debug images are appended to pixa_debug. + static Pix* FindImages(Pix* pix, DebugPixa* pixa_debug); // Generates a Boxa, Pixa pair from the input binary (image mask) pix, // analgous to pixConnComp, except that connected components which are nearly @@ -54,7 +56,8 @@ class ImageFind { // If not NULL, they must be destroyed by the caller. // Resolution of pix should match the source image (Tesseract::pix_binary_) // so the output coordinate systems match. 
- static void ConnCompAndRectangularize(Pix* pix, Boxa** boxa, Pixa** pixa); + static void ConnCompAndRectangularize(Pix* pix, DebugPixa* pixa_debug, + Boxa** boxa, Pixa** pixa); // Returns true if there is a rectangle in the source pix, such that all // pixel rows and column slices outside of it have less than @@ -144,16 +147,13 @@ class ImageFind { // Since the other blobs in the other partitions will be owned by the block, // ColPartitionGrid::ReTypeBlobs must be called afterwards to fix this // situation and collect the image blobs. - static void FindImagePartitions(Pix* image_pix, - const FCOORD& rotation, - const FCOORD& rerotation, - TO_BLOCK* block, - TabFind* tab_grid, + static void FindImagePartitions(Pix* image_pix, const FCOORD& rotation, + const FCOORD& rerotation, TO_BLOCK* block, + TabFind* tab_grid, DebugPixa* pixa_debug, ColPartitionGrid* part_grid, ColPartition_LIST* big_parts); }; } // namespace tesseract. -#endif // TESSERACT_TEXTORD_LINEFIND_H__ - +#endif // TESSERACT_TEXTORD_LINEFIND_H_ diff --git a/textord/linefind.cpp b/textord/linefind.cpp index 874caed4..f57ab8c5 100644 --- a/textord/linefind.cpp +++ b/textord/linefind.cpp @@ -306,10 +306,8 @@ void LineFinder::FindAndRemoveLines(int resolution, bool debug, Pix* pix, pixDestroy(&pix_non_hline); pixDestroy(&pix_intersections); if (pixa_display != NULL) { -#if LIBLEPT_MINOR_VERSION >= 69 || LIBLEPT_MAJOR_VERSION > 1 pixaConvertToPdf(pixa_display, resolution, 1.0f, 0, 0, "LineFinding", "vhlinefinding.pdf"); -#endif pixaDestroy(&pixa_display); } PERF_COUNT_END diff --git a/textord/linefind.h b/textord/linefind.h index abbd7cdd..4569b8f0 100644 --- a/textord/linefind.h +++ b/textord/linefind.h @@ -18,8 +18,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_TEXTORD_LINEFIND_H__ -#define TESSERACT_TEXTORD_LINEFIND_H__ +#ifndef TESSERACT_TEXTORD_LINEFIND_H_ +#define TESSERACT_TEXTORD_LINEFIND_H_ struct Boxa; struct Pix; @@ -146,5 +146,4 @@ class 
LineFinder { } // namespace tesseract. -#endif // TESSERACT_TEXTORD_LINEFIND_H__ - +#endif // TESSERACT_TEXTORD_LINEFIND_H_ diff --git a/textord/makerow.cpp b/textord/makerow.cpp index 1df4855b..a5749ad6 100644 --- a/textord/makerow.cpp +++ b/textord/makerow.cpp @@ -1,8 +1,8 @@ /********************************************************************** * File: makerow.cpp (Formerly makerows.c) * Description: Code to arrange blobs into rows of text. - * Author: Ray Smith - * Created: Mon Sep 21 14:34:48 BST 1992 + * Author: Ray Smith + * Created: Mon Sep 21 14:34:48 BST 1992 * * (C) Copyright 1992, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); @@ -507,8 +507,7 @@ void vigorous_noise_removal(TO_BLOCK* block) { continue; // Looks OK. } // It might be noise so get rid of it. - if (blob->cblob() != NULL) - delete blob->cblob(); + delete blob->cblob(); delete b_it.extract(); } else { prev = blob; @@ -671,7 +670,7 @@ BOOL8 find_best_dropout_row( //find neighbours TO_ROW_IT *row_it, //current position BOOL8 testing_on //correct orientation ) { - inT32 next_index; //of neighbouring row + inT32 next_index; // of neighbouring row inT32 row_offset; //from current row inT32 abs_dist; //absolute distance inT8 row_inc; //increment to row_index @@ -1786,7 +1785,7 @@ static int CountOverlaps(const TBOX& box, int min_height, BLOBNBOX_IT blob_it(blobs); for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) { BLOBNBOX* blob = blob_it.data(); - TBOX blob_box = blob->bounding_box(); + const TBOX &blob_box = blob->bounding_box(); if (blob_box.height() >= min_height && box.major_overlap(blob_box)) { ++overlaps; } diff --git a/textord/oldbasel.cpp b/textord/oldbasel.cpp index c73fe8d5..99e55fdb 100644 --- a/textord/oldbasel.cpp +++ b/textord/oldbasel.cpp @@ -1,8 +1,8 @@ /********************************************************************** * File: oldbasel.cpp (Formerly oldbl.c) * Description: A re-implementation of the old baseline 
algorithm. - * Author: Ray Smith - * Created: Wed Oct 6 09:41:48 BST 1993 + * Author: Ray Smith + * Created: Wed Oct 6 09:41:48 BST 1993 * * (C) Copyright 1993, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); @@ -122,7 +122,7 @@ void Textord::correlate_lines(TO_BLOCK *block, float gradient) { TO_ROW **rows; //array of ptrs int rowcount; /*no of rows to do */ int rowindex; /*no of row */ - //iterator + // iterator TO_ROW_IT row_it = block->get_rows (); rowcount = row_it.length (); @@ -1018,61 +1018,6 @@ int *partcount /*no of partitions */ return bestpart; } - -///*merge_partitions(partids,partcount,blobcount,bestpart) discards funny looking -//partitions and gives all the rest partid 0*/ -// -//merge_partitions(partids,partcount,blobcount,bestpart) -//register char *partids; /*partition numbers*/ -//int partcount; /*no of partitions*/ -//int blobcount; /*no of blobs*/ -//int bestpart; /*best partition*/ -//{ -// int blobindex; /*no along text line*/ -// int runlength; /*run of same partition*/ -// int bestrun; /*biggest runlength*/ -// -// bestrun=0; /*no runs yet*/ -// runlength=1; -// for (blobindex=1;blobindexbestrun) -// bestrun=runlength; /*find biggest run*/ -// runlength=1; /*new run*/ -// } -// else -// { runlength++; -// } -// } -// if (runlength>bestrun) -// bestrun=runlength; -// -// for (blobindex=0;blobindex=blobcount -// || partids[blobindex]!=partids[blobindex+1]) -// /*loner*/ -// && (bestrun>2 || partids[blobindex]!=bestpart)) -// { partids[blobindex]=partcount; /*discard loner*/ -// } -// else if (blobindex+1=blobcount -// || partids[blobindex]!=partids[blobindex+2]) -// && (bestrun>3 || partids[blobindex]!=bestpart)) -// { partids[blobindex]=partcount; /*discard both*/ -// partids[blobindex+1]=partcount; -// } -// } -// } -// for (blobindex=0;blobindex= MINASCRISE && partsizes[partition] > poscount) { @@ -1459,8 +1402,8 @@ int blobcount, /*blobs in blobcoords */ QSPLINE * baseline, /*established */ float 
jumplimit /*min ascender height */ ) { - int blobindex; /*current blob */ - /*height statistics */ + int blobindex; /*current blob */ + /*height statistics */ STATS heightstat (0, MAXHEIGHT); int height; /*height of blob */ int xcentre; /*centre of blob */ diff --git a/textord/pithsync.h b/textord/pithsync.h index f9ba479e..386426be 100644 --- a/textord/pithsync.h +++ b/textord/pithsync.h @@ -1,8 +1,8 @@ /********************************************************************** * File: pithsync.h (Formerly pitsync2.h) * Description: Code to find the optimum fixed pitch segmentation of some blobs. - * Author: Ray Smith - * Created: Thu Nov 19 11:48:05 GMT 1992 + * Author: Ray Smith + * Created: Thu Nov 19 11:48:05 GMT 1992 * * (C) Copyright 1992, Hewlett-Packard Ltd. ** Licensed under the Apache License, Version 2.0 (the "License"); @@ -66,7 +66,7 @@ class FPCUTPT inT16 pitch, //proposed pitch inT16 pitch_error); //allowed tolerance - inT32 position() { //access func + inT32 position() { // access func return xpos; } double cost_function() { diff --git a/textord/pitsync1.h b/textord/pitsync1.h index c2fb9bec..5374b003 100644 --- a/textord/pitsync1.h +++ b/textord/pitsync1.h @@ -1,8 +1,8 @@ /********************************************************************** * File: pitsync1.h (Formerly pitsync.h) * Description: Code to find the optimum fixed pitch segmentation of some blobs. - * Author: Ray Smith - * Created: Thu Nov 19 11:48:05 GMT 1992 + * Author: Ray Smith + * Created: Thu Nov 19 11:48:05 GMT 1992 * * (C) Copyright 1992, Hewlett-Packard Ltd. 
** Licensed under the Apache License, Version 2.0 (the "License"); @@ -46,7 +46,7 @@ class FPSEGPT:public ELIST_LINK FPSEGPT_LIST *prev_list); //previous segment FPSEGPT(FPCUTPT *cutpt); //build from new type - inT32 position() { //access func + inT32 position() { // access func return xpos; } double cost_function() { diff --git a/textord/scanedg.cpp b/textord/scanedg.cpp index dbb3b662..04308436 100644 --- a/textord/scanedg.cpp +++ b/textord/scanedg.cpp @@ -335,7 +335,7 @@ void join_edges(CRACKEDGE *edge1, // edges to join if (edge1->pos.x() + edge1->stepx != edge2->pos.x() || edge1->pos.y() + edge1->stepy != edge2->pos.y()) { CRACKEDGE *tempedge = edge1; - edge1 = edge2; // swap araound + edge1 = edge2; // swap around edge2 = tempedge; } diff --git a/textord/strokewidth.cpp b/textord/strokewidth.cpp index 5d0fdc51..059aa9b8 100644 --- a/textord/strokewidth.cpp +++ b/textord/strokewidth.cpp @@ -393,7 +393,7 @@ void StrokeWidth::GradeBlobsIntoPartitions( } static void PrintBoxWidths(BLOBNBOX* neighbour) { - TBOX nbox = neighbour->bounding_box(); + const TBOX& nbox = neighbour->bounding_box(); tprintf("Box (%d,%d)->(%d,%d): h-width=%.1f, v-width=%.1f p-width=%1.f\n", nbox.left(), nbox.bottom(), nbox.right(), nbox.top(), neighbour->horz_stroke_width(), neighbour->vert_stroke_width(), @@ -1939,7 +1939,7 @@ ScrollView* StrokeWidth::DisplayGoodBlobs(const char* window_name, gsearch.StartFullSearch(); BLOBNBOX* bbox; while ((bbox = gsearch.NextFullSearch()) != NULL) { - TBOX box = bbox->bounding_box(); + const TBOX& box = bbox->bounding_box(); int left_x = box.left(); int right_x = box.right(); int top_y = box.top(); diff --git a/textord/strokewidth.h b/textord/strokewidth.h index 71ec95da..497ffaeb 100644 --- a/textord/strokewidth.h +++ b/textord/strokewidth.h @@ -17,8 +17,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_TEXTORD_STROKEWIDTH_H__ -#define TESSERACT_TEXTORD_STROKEWIDTH_H__ +#ifndef 
TESSERACT_TEXTORD_STROKEWIDTH_H_ +#define TESSERACT_TEXTORD_STROKEWIDTH_H_ #include "blobbox.h" // BlobNeighourDir. #include "blobgrid.h" // Base class. @@ -352,4 +352,4 @@ class StrokeWidth : public BlobGrid { } // namespace tesseract. -#endif // TESSERACT_TEXTORD_STROKEWIDTH_H__ +#endif // TESSERACT_TEXTORD_STROKEWIDTH_H_ diff --git a/textord/tabfind.cpp b/textord/tabfind.cpp index dc7a072b..fff7fa53 100644 --- a/textord/tabfind.cpp +++ b/textord/tabfind.cpp @@ -229,7 +229,7 @@ void TabFind::GutterWidthAndNeighbourGap(int tab_x, int mean_height, bbox->flow() == BTFT_TEXT_ON_IMAGE, 0.0, *gutter_width, box.top(), box.bottom()); if (gutter_bbox != NULL) { - TBOX gutter_box = gutter_bbox->bounding_box(); + const TBOX& gutter_box = gutter_bbox->bounding_box(); *gutter_width = left ? tab_x - gutter_box.right() : gutter_box.left() - tab_x; } @@ -261,7 +261,7 @@ void TabFind::GutterWidthAndNeighbourGap(int tab_x, int mean_height, int neighbour_edge = left ? RightEdgeForBox(box, true, false) : LeftEdgeForBox(box, true, false); if (neighbour != NULL) { - TBOX n_box = neighbour->bounding_box(); + const TBOX& n_box = neighbour->bounding_box(); if (debug) { tprintf("Found neighbour:"); n_box.print(); @@ -440,13 +440,8 @@ bool TabFind::FindTabVectors(TabVector_LIST* hlines, #ifndef GRAPHICS_DISABLED if (textord_tabfind_show_finaltabs) { tab_win = MakeWindow(640, 50, "FinalTabs"); - if (textord_debug_images) { - tab_win->Image(AlignedBlob::textord_debug_pix().string(), - image_origin_.x(), image_origin_.y()); - } else { - DisplayBoxes(tab_win); - DisplayTabs("FinalTabs", tab_win); - } + DisplayBoxes(tab_win); + DisplayTabs("FinalTabs", tab_win); tab_win = DisplayTabVectors(tab_win); } #endif // GRAPHICS_DISABLED @@ -1277,32 +1272,6 @@ bool TabFind::Deskew(TabVector_LIST* hlines, BLOBNBOX_LIST* image_blobs, RotateBlobList(*deskew, &block->blobs); RotateBlobList(*deskew, &block->small_blobs); RotateBlobList(*deskew, &block->noise_blobs); - if (textord_debug_images) { - // Rotate 
the debug pix and arrange for it to be drawn at the correct - // pixel offset. - Pix* pix_grey = pixRead(AlignedBlob::textord_debug_pix().string()); - int width = pixGetWidth(pix_grey); - int height = pixGetHeight(pix_grey); - float angle = atan2(deskew->y(), deskew->x()); - // Positive angle is clockwise to pixRotate. - Pix* pix_rot = pixRotate(pix_grey, -angle, L_ROTATE_AREA_MAP, - L_BRING_IN_WHITE, width, height); - // The image must be translated by the rotation of its center, since it - // has just been rotated about its center. - ICOORD center_offset(width / 2, height / 2); - ICOORD new_center_offset(center_offset); - new_center_offset.rotate(*deskew); - image_origin_ += new_center_offset - center_offset; - // The image grew as it was rotated, so offset the (top/left) origin - // by half the change in size. y is opposite to x because it is drawn - // at ist top/left, not bottom/left. - ICOORD corner_offset((width - pixGetWidth(pix_rot)) / 2, - (pixGetHeight(pix_rot) - height) / 2); - image_origin_ += corner_offset; - pixWrite(AlignedBlob::textord_debug_pix().string(), pix_rot, IFF_PNG); - pixDestroy(&pix_grey); - pixDestroy(&pix_rot); - } // Rotate the horizontal vectors. The vertical vectors don't need // rotating as they can just be refitted. diff --git a/textord/tabfind.h b/textord/tabfind.h index 841c18a9..0e1f753a 100644 --- a/textord/tabfind.h +++ b/textord/tabfind.h @@ -17,8 +17,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_TEXTORD_TABFIND_H__ -#define TESSERACT_TEXTORD_TABFIND_H__ +#ifndef TESSERACT_TEXTORD_TABFIND_H_ +#define TESSERACT_TEXTORD_TABFIND_H_ #include "alignedblob.h" #include "tesscallback.h" @@ -382,4 +382,4 @@ class TabFind : public AlignedBlob { } // namespace tesseract. 
-#endif // TESSERACT_TEXTORD_TABFIND_H__ +#endif // TESSERACT_TEXTORD_TABFIND_H_ diff --git a/textord/tablefind.cpp b/textord/tablefind.cpp index 425bdbc2..e2fa6cc6 100644 --- a/textord/tablefind.cpp +++ b/textord/tablefind.cpp @@ -143,7 +143,6 @@ const double kMaxXProjectionGapFactor = 2.0; const double kStrokeWidthFractionalTolerance = 0.25; const double kStrokeWidthConstantTolerance = 2.0; -BOOL_VAR(textord_dump_table_images, false, "Paint table detection output"); BOOL_VAR(textord_show_tables, false, "Show table regions"); BOOL_VAR(textord_tablefind_show_mark, false, "Debug table marking steps in detail"); @@ -371,9 +370,6 @@ void TableFinder::LocateTables(ColPartitionGrid* grid, #endif // GRAPHICS_DISABLED } - if (textord_dump_table_images) - WriteToPix(reskew); - // Merge all colpartitions in table regions to make them a single // colpartition and revert types of isolated table cells not // assigned to any table to their original types. @@ -550,7 +546,7 @@ void TableFinder::GroupColumnBlocks(ColSegment_LIST* new_blocks, // iterate through the source list for (src_it.mark_cycle_pt(); !src_it.cycled_list(); src_it.forward()) { ColSegment* src_seg = src_it.data(); - TBOX src_box = src_seg->bounding_box(); + const TBOX& src_box = src_seg->bounding_box(); bool match_found = false; // iterate through the destination list to find a matching column block for (dest_it.mark_cycle_pt(); !dest_it.cycled_list(); dest_it.forward()) { @@ -1342,7 +1338,7 @@ void TableFinder::GetTableRegions(ColSegment_LIST* table_columns, // create a bool array to hold projection on y-axis bool* table_region = new bool[page_height]; while ((part = gsearch.NextFullSearch()) != NULL) { - TBOX part_box = part->bounding_box(); + const TBOX& part_box = part->bounding_box(); // reset the projection array for (int i = 0; i < page_height; i++) { table_region[i] = false; @@ -1974,7 +1970,7 @@ void TableFinder::DisplayColPartitionConnections( ColPartition* upper_part = part->nearest_neighbor_above(); 
if (upper_part) { - TBOX upper_box = upper_part->bounding_box(); + const TBOX& upper_box = upper_part->bounding_box(); int mid_x = (left_x + right_x) / 2; int mid_y = (top_y + bottom_y) / 2; int other_x = (upper_box.left() + upper_box.right()) / 2; @@ -1985,7 +1981,7 @@ void TableFinder::DisplayColPartitionConnections( } ColPartition* lower_part = part->nearest_neighbor_below(); if (lower_part) { - TBOX lower_box = lower_part->bounding_box(); + const TBOX& lower_box = lower_part->bounding_box(); int mid_x = (left_x + right_x) / 2; int mid_y = (top_y + bottom_y) / 2; int other_x = (lower_box.left() + lower_box.right()) / 2; @@ -1999,80 +1995,6 @@ void TableFinder::DisplayColPartitionConnections( #endif } - -// Write debug image and text file. -// Note: This method is only for debug purpose during development and -// would not be part of checked in code -void TableFinder::WriteToPix(const FCOORD& reskew) { - // Input file must be named test1.tif - PIX* pix = pixRead("test1.tif"); - if (!pix) { - tprintf("Input file test1.tif not found.\n"); - return; - } - int img_height = pixGetHeight(pix); - int img_width = pixGetWidth(pix); - // Maximum number of text or table partitions - int num_boxes = 10; - BOXA* text_box_array = boxaCreate(num_boxes); - BOXA* table_box_array = boxaCreate(num_boxes); - GridSearch - gsearch(&clean_part_grid_); - gsearch.StartFullSearch(); - ColPartition* part; - // load colpartitions into text_box_array and table_box_array - while ((part = gsearch.NextFullSearch()) != NULL) { - TBOX box = part->bounding_box(); - box.rotate_large(reskew); - BOX* lept_box = boxCreate(box.left(), img_height - box.top(), - box.right() - box.left(), - box.top() - box.bottom()); - if (part->type() == PT_TABLE) - boxaAddBox(table_box_array, lept_box, L_INSERT); - else - boxaAddBox(text_box_array, lept_box, L_INSERT); - } - // draw colpartitions on the output image - PIX* out = pixDrawBoxa(pix, text_box_array, 3, 0xff000000); - out = pixDrawBoxa(out, table_box_array, 
3, 0x0000ff00); - - BOXA* table_array = boxaCreate(num_boxes); - // text file containing detected table bounding boxes - FILE* fptr = fopen("tess-table.txt", "wb"); - GridSearch - table_search(&table_grid_); - table_search.StartFullSearch(); - ColSegment* table; - // load table boxes to table_array and write them to text file as well - while ((table = table_search.NextFullSearch()) != NULL) { - TBOX box = table->bounding_box(); - box.rotate_large(reskew); - // Since deskewing introduces negative coordinates, reskewing - // might not completely recover from that since both steps enlarge - // the actual box. Hence a box that undergoes deskewing/reskewing - // may go out of image boundaries. Crop a table box if needed to - // contain it inside the image dimensions. - box = box.intersection(TBOX(0, 0, img_width - 1, img_height - 1)); - BOX* lept_box = boxCreate(box.left(), img_height - box.top(), - box.right() - box.left(), - box.top() - box.bottom()); - boxaAddBox(table_array, lept_box, L_INSERT); - fprintf(fptr, "%d %d %d %d TABLE\n", box.left(), - img_height - box.top(), box.right(), img_height - box.bottom()); - } - fclose(fptr); - // paint table boxes on the debug image - out = pixDrawBoxa(out, table_array, 5, 0x7fff0000); - - pixWrite("out.png", out, IFF_PNG); - // memory cleanup - boxaDestroy(&text_box_array); - boxaDestroy(&table_box_array); - boxaDestroy(&table_array); - pixDestroy(&pix); - pixDestroy(&out); -} - // Merge all colpartitions in table regions to make them a single // colpartition and revert types of isolated table cells not // assigned to any table to their original types. 
@@ -2098,7 +2020,7 @@ void TableFinder::MakeTableBlocks(ColPartitionGrid* grid, table_search.StartFullSearch(); ColSegment* table; while ((table = table_search.NextFullSearch()) != NULL) { - TBOX table_box = table->bounding_box(); + const TBOX& table_box = table->bounding_box(); // Start a rect search on table_box GridSearch rectsearch(grid); diff --git a/textord/tablefind.h b/textord/tablefind.h index 3612e48d..49590a4f 100644 --- a/textord/tablefind.h +++ b/textord/tablefind.h @@ -17,8 +17,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_TEXTORD_TABLEFIND_H__ -#define TESSERACT_TEXTORD_TABLEFIND_H__ +#ifndef TESSERACT_TEXTORD_TABLEFIND_H_ +#define TESSERACT_TEXTORD_TABLEFIND_H_ #include "colpartitiongrid.h" #include "elst.h" @@ -389,11 +389,6 @@ class TableFinder { void DisplayColSegmentGrid(ScrollView* win, ColSegmentGrid* grid, ScrollView::Color color); - // Write ColParitions and Tables to a PIX image - // Note: This method is only for debug purpose during development and - // would not be part of checked in code - void WriteToPix(const FCOORD& reskew); - // Merge all colpartitions in table regions to make them a single // colpartition and revert types of isolated table cells not // assigned to any table to their original types. @@ -432,4 +427,4 @@ class TableFinder { } // namespace tesseract. -#endif // TESSERACT_TEXTORD_TABLEFIND_H__ +#endif // TESSERACT_TEXTORD_TABLEFIND_H_ diff --git a/textord/tabvector.cpp b/textord/tabvector.cpp index c8d508f0..f3e99fa3 100644 --- a/textord/tabvector.cpp +++ b/textord/tabvector.cpp @@ -435,7 +435,7 @@ bool TabVector::SimilarTo(const ICOORD& vertical, vsearch.StartVerticalSearch(left, right, top_y); BLOBNBOX* blob; while ((blob = vsearch.NextVerticalSearch(true)) != NULL) { - TBOX box = blob->bounding_box(); + const TBOX& box = blob->bounding_box(); if (box.top() > bottom_y) return true; // Nothing found. 
if (box.bottom() < top_y) @@ -523,12 +523,12 @@ const char* kAlignmentNames[] = { // Print basic information about this tab vector. void TabVector::Print(const char* prefix) { - tprintf("%s %s (%d,%d)->(%d,%d) w=%d s=%d, sort key=%d, boxes=%d," - " partners=%d\n", - prefix, kAlignmentNames[alignment_], - startpt_.x(), startpt_.y(), endpt_.x(), endpt_.y(), - mean_width_, percent_score_, sort_key_, - boxes_.length(), partners_.length()); + tprintf( + "%s %s (%d,%d)->(%d,%d) w=%d s=%d, sort key=%d, boxes=%d," + " partners=%d\n", + prefix, kAlignmentNames[alignment_], startpt_.x(), startpt_.y(), + endpt_.x(), endpt_.y(), mean_width_, percent_score_, sort_key_, + boxes_.length(), partners_.length()); } // Print basic information about this tab vector and every box in it. @@ -806,7 +806,7 @@ bool TabVector::Fit(ICOORD vertical, bool force_parallel) { // Fit a line to all the boxes in the list. for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { BLOBNBOX* bbox = it.data(); - TBOX box = bbox->bounding_box(); + const TBOX& box = bbox->bounding_box(); int x1 = IsRightTab() ? box.right() : box.left(); ICOORD boxpt(x1, box.bottom()); linepoints.Add(boxpt); @@ -831,7 +831,7 @@ bool TabVector::Fit(ICOORD vertical, bool force_parallel) { int width_count = 0; for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) { BLOBNBOX* bbox = it.data(); - TBOX box = bbox->bounding_box(); + const TBOX& box = bbox->bounding_box(); mean_width_ += box.width(); ++width_count; int x1 = IsRightTab() ? 
box.right() : box.left(); diff --git a/textord/tabvector.h b/textord/tabvector.h index dbeff106..e8f05864 100644 --- a/textord/tabvector.h +++ b/textord/tabvector.h @@ -17,8 +17,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_TEXTORD_TABVECTOR_H__ -#define TESSERACT_TEXTORD_TABVECTOR_H__ +#ifndef TESSERACT_TEXTORD_TABVECTOR_H_ +#define TESSERACT_TEXTORD_TABVECTOR_H_ #include "blobgrid.h" #include "clst.h" @@ -427,4 +427,4 @@ class TabVector : public ELIST2_LINK { } // namespace tesseract. -#endif // TESSERACT_TEXTORD_TABVECTOR_H__ +#endif // TESSERACT_TEXTORD_TABVECTOR_H_ diff --git a/textord/textlineprojection.cpp b/textord/textlineprojection.cpp index 6018e5fd..8220b95e 100644 --- a/textord/textlineprojection.cpp +++ b/textord/textlineprojection.cpp @@ -119,6 +119,7 @@ void TextlineProjection::MoveNonTextlineBlobs( // Create a window and display the projection in it. void TextlineProjection::DisplayProjection() const { +#ifndef GRAPHICS_DISABLED int width = pixGetWidth(pix_); int height = pixGetHeight(pix_); Pix* pixc = pixCreate(width, height, 32); @@ -139,16 +140,12 @@ void TextlineProjection::DisplayProjection() const { col_data[x] = result; } } -#if 0 - // TODO(rays) uncomment when scrollview can display non-binary images. ScrollView* win = new ScrollView("Projection", 0, 0, width, height, width, height); win->Image(pixc, 0, 0); win->Update(); -#else - pixWrite("projection.png", pixc, IFF_PNG); -#endif pixDestroy(&pixc); +#endif // GRAPHICS_DISABLED } // Compute the distance of the box from the partition using curved projection @@ -760,7 +757,7 @@ void TextlineProjection::TruncateToImageBounds(TPOINT* pt) const { pt->y = ClipToRange(pt->y, 0, pixGetHeight(pix_) - 1); } #ifdef _MSC_VER -#pragma optimize( "", on ) +#pragma optimize("", on) #endif // _MSC_VER // Transform tesseract image coordinates to coordinates used in the projection. 
diff --git a/textord/textord.cpp b/textord/textord.cpp index 1f7e8a88..94ef49c7 100644 --- a/textord/textord.cpp +++ b/textord/textord.cpp @@ -33,7 +33,8 @@ namespace tesseract { Textord::Textord(CCStruct* ccstruct) - : ccstruct_(ccstruct), use_cjk_fp_model_(false), + : ccstruct_(ccstruct), + use_cjk_fp_model_(false), // makerow.cpp /////////////////////////////////////////// BOOL_MEMBER(textord_single_height_mode, false, "Script has no xheight, so use a single mode", @@ -46,24 +47,20 @@ Textord::Textord(CCStruct* ccstruct) "old_to_method.", ccstruct_->params()), BOOL_MEMBER(tosp_only_use_prop_rows, true, - "Block stats to use fixed pitch rows?", - ccstruct_->params()), + "Block stats to use fixed pitch rows?", ccstruct_->params()), BOOL_MEMBER(tosp_force_wordbreak_on_punct, false, "Force word breaks on punct to break long lines in non-space " "delimited langs", ccstruct_->params()), - BOOL_MEMBER(tosp_use_pre_chopping, false, - "Space stats use prechopping?", + BOOL_MEMBER(tosp_use_pre_chopping, false, "Space stats use prechopping?", ccstruct_->params()), BOOL_MEMBER(tosp_old_to_bug_fix, false, "Fix suspected bug in old code", ccstruct_->params()), - BOOL_MEMBER(tosp_block_use_cert_spaces, true, - "Only stat OBVIOUS spaces", + BOOL_MEMBER(tosp_block_use_cert_spaces, true, "Only stat OBVIOUS spaces", ccstruct_->params()), BOOL_MEMBER(tosp_row_use_cert_spaces, true, "Only stat OBVIOUS spaces", ccstruct_->params()), - BOOL_MEMBER(tosp_narrow_blobs_not_cert, true, - "Only stat OBVIOUS spaces", + BOOL_MEMBER(tosp_narrow_blobs_not_cert, true, "Only stat OBVIOUS spaces", ccstruct_->params()), BOOL_MEMBER(tosp_row_use_cert_spaces1, true, "Only stat OBVIOUS spaces", ccstruct_->params()), @@ -78,30 +75,24 @@ Textord::Textord(CCStruct* ccstruct) "Don't restrict kn->sp fuzzy limit to tables", ccstruct_->params()), BOOL_MEMBER(tosp_stats_use_xht_gaps, true, - "Use within xht gap for wd breaks", - ccstruct_->params()), + "Use within xht gap for wd breaks", ccstruct_->params()), 
BOOL_MEMBER(tosp_use_xht_gaps, true, "Use within xht gap for wd breaks", ccstruct_->params()), BOOL_MEMBER(tosp_only_use_xht_gaps, false, - "Only use within xht gap for wd breaks", - ccstruct_->params()), + "Only use within xht gap for wd breaks", ccstruct_->params()), BOOL_MEMBER(tosp_rule_9_test_punct, false, - "Don't chng kn to space next to punct", - ccstruct_->params()), + "Don't chng kn to space next to punct", ccstruct_->params()), BOOL_MEMBER(tosp_flip_fuzz_kn_to_sp, true, "Default flip", ccstruct_->params()), BOOL_MEMBER(tosp_flip_fuzz_sp_to_kn, true, "Default flip", ccstruct_->params()), BOOL_MEMBER(tosp_improve_thresh, false, "Enable improvement heuristic", ccstruct_->params()), - INT_MEMBER(tosp_debug_level, 0, "Debug data", - ccstruct_->params()), + INT_MEMBER(tosp_debug_level, 0, "Debug data", ccstruct_->params()), INT_MEMBER(tosp_enough_space_samples_for_median, 3, - "or should we use mean", - ccstruct_->params()), + "or should we use mean", ccstruct_->params()), INT_MEMBER(tosp_redo_kern_limit, 10, - "No.samples reqd to reestimate for row", - ccstruct_->params()), + "No.samples reqd to reestimate for row", ccstruct_->params()), INT_MEMBER(tosp_few_samples, 40, "No.gaps reqd with 1 large gap to treat as a table", ccstruct_->params()), @@ -114,30 +105,24 @@ Textord::Textord(CCStruct* ccstruct) "Factor for defining space threshold in terms of space and " "kern sizes", ccstruct_->params()), - double_MEMBER(tosp_threshold_bias1, 0, - "how far between kern and space?", + double_MEMBER(tosp_threshold_bias1, 0, "how far between kern and space?", ccstruct_->params()), - double_MEMBER(tosp_threshold_bias2, 0, - "how far between kern and space?", + double_MEMBER(tosp_threshold_bias2, 0, "how far between kern and space?", ccstruct_->params()), double_MEMBER(tosp_narrow_fraction, 0.3, "Fract of xheight for narrow", ccstruct_->params()), double_MEMBER(tosp_narrow_aspect_ratio, 0.48, - "narrow if w/h less than this", - ccstruct_->params()), + "narrow if w/h less 
than this", ccstruct_->params()), double_MEMBER(tosp_wide_fraction, 0.52, "Fract of xheight for wide", ccstruct_->params()), double_MEMBER(tosp_wide_aspect_ratio, 0.0, "wide if w/h less than this", ccstruct_->params()), double_MEMBER(tosp_fuzzy_space_factor, 0.6, - "Fract of xheight for fuzz sp", - ccstruct_->params()), + "Fract of xheight for fuzz sp", ccstruct_->params()), double_MEMBER(tosp_fuzzy_space_factor1, 0.5, - "Fract of xheight for fuzz sp", - ccstruct_->params()), + "Fract of xheight for fuzz sp", ccstruct_->params()), double_MEMBER(tosp_fuzzy_space_factor2, 0.72, - "Fract of xheight for fuzz sp", - ccstruct_->params()), + "Fract of xheight for fuzz sp", ccstruct_->params()), double_MEMBER(tosp_gap_factor, 0.83, "gap ratio to flip sp->kern", ccstruct_->params()), double_MEMBER(tosp_kern_gap_factor1, 2.0, "gap ratio to flip kern->sp", @@ -156,14 +141,11 @@ Textord::Textord(CCStruct* ccstruct) "Fract of kerns reqd for isolated row stats", ccstruct_->params()), double_MEMBER(tosp_table_kn_sp_ratio, 2.25, - "Min difference of kn & sp in table", - ccstruct_->params()), + "Min difference of kn & sp in table", ccstruct_->params()), double_MEMBER(tosp_table_xht_sp_ratio, 0.33, - "Expect spaces bigger than this", - ccstruct_->params()), + "Expect spaces bigger than this", ccstruct_->params()), double_MEMBER(tosp_table_fuzzy_kn_sp_ratio, 3.0, - "Fuzzy if less than this", - ccstruct_->params()), + "Fuzzy if less than this", ccstruct_->params()), double_MEMBER(tosp_fuzzy_kn_fraction, 0.5, "New fuzzy kn alg", ccstruct_->params()), double_MEMBER(tosp_fuzzy_sp_fraction, 0.5, "New fuzzy sp alg", @@ -172,20 +154,16 @@ Textord::Textord(CCStruct* ccstruct) "Don't trust spaces less than this time kn", ccstruct_->params()), double_MEMBER(tosp_init_guess_kn_mult, 2.2, - "Thresh guess - mult kn by this", - ccstruct_->params()), + "Thresh guess - mult kn by this", ccstruct_->params()), double_MEMBER(tosp_init_guess_xht_mult, 0.28, - "Thresh guess - mult xht by this", - 
ccstruct_->params()), + "Thresh guess - mult xht by this", ccstruct_->params()), double_MEMBER(tosp_max_sane_kn_thresh, 5.0, - "Multiplier on kn to limit thresh", - ccstruct_->params()), + "Multiplier on kn to limit thresh", ccstruct_->params()), double_MEMBER(tosp_flip_caution, 0.0, "Don't autoflip kn to sp when large separation", ccstruct_->params()), double_MEMBER(tosp_large_kerning, 0.19, - "Limit use of xht gap with large kns", - ccstruct_->params()), + "Limit use of xht gap with large kns", ccstruct_->params()), double_MEMBER(tosp_dont_fool_with_small_kerns, -1, "Limit use of xht gap with odd small kns", ccstruct_->params()), @@ -193,11 +171,9 @@ Textord::Textord(CCStruct* ccstruct) "Don't reduce box if the top left is non blank", ccstruct_->params()), double_MEMBER(tosp_silly_kn_sp_gap, 0.2, - "Don't let sp minus kn get too small", - ccstruct_->params()), + "Don't let sp minus kn get too small", ccstruct_->params()), double_MEMBER(tosp_pass_wide_fuzz_sp_to_context, 0.75, - "How wide fuzzies need context", - ccstruct_->params()), + "How wide fuzzies need context", ccstruct_->params()), // tordmain.cpp /////////////////////////////////////////// BOOL_MEMBER(textord_no_rejects, false, "Don't remove noise blobs", ccstruct_->params()), @@ -206,34 +182,27 @@ Textord::Textord(CCStruct* ccstruct) BOOL_MEMBER(textord_show_boxes, false, "Display unsorted blobs", ccstruct_->params()), INT_MEMBER(textord_max_noise_size, 7, "Pixel size of noise", - ccstruct_->params()), + ccstruct_->params()), INT_MEMBER(textord_baseline_debug, 0, "Baseline debug level", - ccstruct_->params()), + ccstruct_->params()), double_MEMBER(textord_blob_size_bigile, 95, "Percentile for large blobs", ccstruct_->params()), double_MEMBER(textord_noise_area_ratio, 0.7, - "Fraction of bounding box for noise", - ccstruct_->params()), + "Fraction of bounding box for noise", ccstruct_->params()), double_MEMBER(textord_blob_size_smallile, 20, - "Percentile for small blobs", - ccstruct_->params()), + 
"Percentile for small blobs", ccstruct_->params()), double_MEMBER(textord_initialx_ile, 0.75, - "Ile of sizes for xheight guess", - ccstruct_->params()), + "Ile of sizes for xheight guess", ccstruct_->params()), double_MEMBER(textord_initialasc_ile, 0.90, - "Ile of sizes for xheight guess", - ccstruct_->params()), - INT_MEMBER(textord_noise_sizefraction, 10, - "Fraction of size for maxima", + "Ile of sizes for xheight guess", ccstruct_->params()), + INT_MEMBER(textord_noise_sizefraction, 10, "Fraction of size for maxima", ccstruct_->params()), double_MEMBER(textord_noise_sizelimit, 0.5, - "Fraction of x for big t count", - ccstruct_->params()), + "Fraction of x for big t count", ccstruct_->params()), INT_MEMBER(textord_noise_translimit, 16, "Transitions for normal blob", ccstruct_->params()), double_MEMBER(textord_noise_normratio, 2.0, - "Dot to norm ratio for deletion", - ccstruct_->params()), + "Dot to norm ratio for deletion", ccstruct_->params()), BOOL_MEMBER(textord_noise_rejwords, true, "Reject noise-like words", ccstruct_->params()), BOOL_MEMBER(textord_noise_rejrows, true, "Reject noise-like rows", @@ -242,24 +211,20 @@ Textord::Textord(CCStruct* ccstruct) "xh fract height error for norm blobs", ccstruct_->params()), double_MEMBER(textord_noise_sxfract, 0.4, - "xh fract width error for norm blobs", - ccstruct_->params()), - double_MEMBER(textord_noise_hfract, 1.0/64, + "xh fract width error for norm blobs", ccstruct_->params()), + double_MEMBER(textord_noise_hfract, 1.0 / 64, "Height fraction to discard outlines as speckle noise", ccstruct_->params()), INT_MEMBER(textord_noise_sncount, 1, "super norm blobs to save row", ccstruct_->params()), double_MEMBER(textord_noise_rowratio, 6.0, - "Dot to norm ratio for deletion", - ccstruct_->params()), + "Dot to norm ratio for deletion", ccstruct_->params()), BOOL_MEMBER(textord_noise_debug, false, "Debug row garbage detector", ccstruct_->params()), double_MEMBER(textord_blshift_maxshift, 0.00, "Max baseline shift", 
ccstruct_->params()), double_MEMBER(textord_blshift_xfraction, 9.99, - "Min size of baseline shift", - ccstruct_->params()) { -} + "Min size of baseline shift", ccstruct_->params()) {} Textord::~Textord() { } @@ -324,10 +289,9 @@ void Textord::TextordPage(PageSegMode pageseg_mode, const FCOORD& reskew, BaselineDetect baseline_detector(textord_baseline_debug, reskew, to_blocks); baseline_detector.ComputeStraightBaselines(use_box_bottoms); - baseline_detector.ComputeBaselineSplinesAndXheights(page_tr_, true, - textord_heavy_nr, - textord_show_final_rows, - this); + baseline_detector.ComputeBaselineSplinesAndXheights( + page_tr_, pageseg_mode != PSM_RAW_LINE, textord_heavy_nr, + textord_show_final_rows, this); // Now make the words in the lines. if (PSM_WORD_FIND_ENABLED(pageseg_mode)) { // SINGLE_LINE uses the old word maker on the single line. diff --git a/textord/textord.h b/textord/textord.h index b34ecd2c..6c9b12f7 100644 --- a/textord/textord.h +++ b/textord/textord.h @@ -18,8 +18,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_TEXTORD_TEXTORD_H__ -#define TESSERACT_TEXTORD_TEXTORD_H__ +#ifndef TESSERACT_TEXTORD_TEXTORD_H_ +#define TESSERACT_TEXTORD_TEXTORD_H_ #include "ccstruct.h" #include "bbgrid.h" @@ -400,4 +400,4 @@ class Textord { }; } // namespace tesseract. -#endif // TESSERACT_TEXTORD_TEXTORD_H__ +#endif // TESSERACT_TEXTORD_TEXTORD_H_ diff --git a/textord/topitch.cpp b/textord/topitch.cpp index ae9999f7..cfde683b 100644 --- a/textord/topitch.cpp +++ b/textord/topitch.cpp @@ -1,8 +1,8 @@ /********************************************************************** * File: topitch.cpp (Formerly to_pitch.c) * Description: Code to determine fixed pitchness and the pitch if fixed. - * Author: Ray Smith - * Created: Tue Aug 24 16:57:29 BST 1993 + * Author: Ray Smith + * Created: Tue Aug 24 16:57:29 BST 1993 * * (C) Copyright 1993, Hewlett-Packard Ltd. 
** Licensed under the Apache License, Version 2.0 (the "License"); @@ -1084,7 +1084,7 @@ BOOL8 count_pitch_stats( //find lines return FALSE; prev_valid = FALSE; prev_centre = 0; - prev_right = 0; //stop compiler warning + prev_right = 0; // stop compiler warning joined_box = blob_it.data ()->bounding_box (); do { blob_it.forward (); @@ -1285,8 +1285,6 @@ float tune_row_pitch2( //find fp cells return initial_pitch; } sum_proj = new STATS[textord_pitch_range * 2 + 1]; - if (sum_proj == NULL) - return initial_pitch; for (pitch_delta = -textord_pitch_range; pitch_delta <= textord_pitch_range; pitch_delta++) diff --git a/textord/tordmain.cpp b/textord/tordmain.cpp index f09a186d..0eaf843e 100644 --- a/textord/tordmain.cpp +++ b/textord/tordmain.cpp @@ -251,6 +251,7 @@ void Textord::filter_blobs(ICOORD page_tr, // top right &block->noise_blobs, &block->small_blobs, &block->large_blobs); + if (block->line_size == 0) block->line_size = 1; block->line_spacing = block->line_size * (tesseract::CCStruct::kDescenderFraction + tesseract::CCStruct::kXHeightFraction + @@ -360,7 +361,7 @@ void Textord::cleanup_nontext_block(BLOCK* block) { // Non-text blocks must contain at least one row. 
ROW_IT row_it(block->row_list()); if (row_it.empty()) { - TBOX box = block->bounding_box(); + const TBOX& box = block->bounding_box(); float height = box.height(); inT32 xstarts[2] = {box.left(), box.right()}; double coeffs[3] = {0.0, 0.0, static_cast(box.bottom())}; @@ -769,6 +770,7 @@ void Textord::TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs, PointerVector word_ptrs; for (int g = 0; g < groups.size(); ++g) { const BlockGroup* group = groups[g]; + if (group->bounding_box.null_box()) continue; WordGrid word_grid(group->min_xheight, group->bounding_box.botleft(), group->bounding_box.topright()); for (int b = 0; b < group->blocks.size(); ++b) { diff --git a/textord/tospace.cpp b/textord/tospace.cpp index bec346ef..43584369 100644 --- a/textord/tospace.cpp +++ b/textord/tospace.cpp @@ -1,3 +1,12 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
/********************************************************************** * tospace.cpp * @@ -419,9 +428,8 @@ void Textord::row_spacing_stats( if (suspected_table && (row->space_size < tosp_table_kn_sp_ratio * row->kern_size)) { if (tosp_debug_level > 5) - tprintf ("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\n", - block_idx, row_idx, - row->kern_size, row->space_threshold, row->space_size); + tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f.\n", block_idx, + row_idx, row->kern_size, row->space_threshold, row->space_size); row->space_threshold = (inT32) (tosp_table_kn_sp_ratio * row->kern_size); row->space_size = MAX (row->space_threshold + 1, row->xheight); @@ -441,10 +449,9 @@ void Textord::row_spacing_stats( MAX (tosp_min_sane_kn_sp * MAX (row->kern_size, 2.5), row->xheight / 2); if (tosp_debug_level > 5) - tprintf - ("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n", - block_idx, row_idx, row->kern_size, row->space_threshold, - row->space_size, sane_space); + tprintf("B:%d R:%d -- DON'T BELIEVE SPACE %3.2f %d %3.2f -> %3.2f.\n", + block_idx, row_idx, row->kern_size, row->space_threshold, + row->space_size, sane_space); row->space_size = sane_space; row->space_threshold = inT32 (floor ((row->space_size + row->kern_size) / @@ -455,10 +462,9 @@ void Textord::row_spacing_stats( MAX (row->kern_size, 2.5))); if (row->space_threshold > sane_threshold) { if (tosp_debug_level > 5) - tprintf ("B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\n", - block_idx, row_idx, - row->kern_size, - row->space_threshold, row->space_size, sane_threshold); + tprintf("B:%d R:%d -- DON'T BELIEVE THRESH %3.2f %d %3.2f->%d.\n", + block_idx, row_idx, row->kern_size, row->space_threshold, + row->space_size, sane_threshold); row->space_threshold = sane_threshold; if (row->space_size <= sane_threshold) row->space_size = row->space_threshold + 1.0f; @@ -498,7 +504,7 @@ void Textord::row_spacing_stats( MIN (inT32 (ceil (tosp_fuzzy_space_factor * row->xheight)), inT32 
(row->space_size)); if (row->min_space <= row->space_threshold) - //Don't be silly + // Don't be silly row->min_space = row->space_threshold + 1; /* Lets try to guess the max certain kern gap by looking at the cluster of @@ -559,7 +565,7 @@ void Textord::row_spacing_stats( row->kern_size)); } if (row->max_nonspace > row->space_threshold) { - //Don't be silly + // Don't be silly row->max_nonspace = row->space_threshold; } @@ -700,8 +706,8 @@ BOOL8 Textord::isolated_row_stats(TO_ROW *row, ((small_gaps_count / (float) total) < tosp_enough_small_gaps) || (total - small_gaps_count < 1)) { if (tosp_debug_level > 5) - tprintf ("B:%d R:%d -- Can't do isolated row stats.\n", - block_idx, row_idx); + tprintf("B:%d R:%d -- Can't do isolated row stats.\n", block_idx, + row_idx); return FALSE; } blob_it.set_to_list (row->blob_list ()); @@ -1130,10 +1136,10 @@ ROW *Textord::make_prop_words( else blanks = 0; if (tosp_debug_level > 5) - tprintf - ("Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n", - word->bounding_box ().left (), word->bounding_box ().bottom (), - repetition_spacing, current_gap, blanks); + tprintf( + "Repch wd at EOL (%d,%d). rep spacing %5.2f; Lgap:%d (%d blanks)\n", + word->bounding_box().left(), word->bounding_box().bottom(), + repetition_spacing, current_gap, blanks); word->set_blanks (blanks); //NO uncertainty word->set_flag (W_FUZZY_SP, FALSE); @@ -1326,9 +1332,10 @@ BOOL8 Textord::make_a_word_break( we may need to set PARTICULAR spaces to fuzzy or not. The values will ONLY be used if the function returns TRUE - ie the word is to be broken. 
*/ - blanks = (uinT8) (current_gap / row->space_size); - if (blanks < 1) - blanks = 1; + int num_blanks = current_gap; + if (row->space_size > 1.0f) + num_blanks = IntCastRounded(current_gap / row->space_size); + blanks = static_cast(ClipToRange(num_blanks, 1, MAX_UINT8)); fuzzy_sp = FALSE; fuzzy_non = FALSE; /* @@ -1686,10 +1693,9 @@ void Textord::mark_gap( blob.bottom () + blob.height () / 2.0f); } if (tosp_debug_level > 5) - tprintf (" (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\n", - blob.left () - current_gap / 2, blob.bottom (), rule, - prev_gap, prev_blob_width, current_gap, - next_blob_width, next_gap); + tprintf(" (%d,%d) Sp<->Kn Rule %d %d %d %d %d %d\n", + blob.left() - current_gap / 2, blob.bottom(), rule, prev_gap, + prev_blob_width, current_gap, next_blob_width, next_gap); } #endif @@ -1727,8 +1733,7 @@ BOOL8 Textord::ignore_big_gap(TO_ROW *row, inT16 right) { inT16 gap = right - left + 1; - if (tosp_ignore_big_gaps > 999) - return FALSE; //Don't ignore + if (tosp_ignore_big_gaps > 999) return FALSE; // Don't ignore if (tosp_ignore_big_gaps > 0) return (gap > tosp_ignore_big_gaps * row->xheight); if (gap > tosp_ignore_very_big_gaps * row->xheight) @@ -1750,7 +1755,6 @@ BOOL8 Textord::ignore_big_gap(TO_ROW *row, return FALSE; } - /********************************************************************** * reduced_box_next * diff --git a/textord/tovars.cpp b/textord/tovars.cpp index 71114358..6b1b8332 100644 --- a/textord/tovars.cpp +++ b/textord/tovars.cpp @@ -1,8 +1,8 @@ /********************************************************************** * File: tovars.cpp (Formerly to_vars.c) * Description: Variables used by textord. - * Author: Ray Smith - * Created: Tue Aug 24 16:55:02 BST 1993 + * Author: Ray Smith + * Created: Tue Aug 24 16:55:02 BST 1993 * * (C) Copyright 1993, Hewlett-Packard Ltd. 
** Licensed under the Apache License, Version 2.0 (the "License"); @@ -49,8 +49,8 @@ EXTERN double_VAR (textord_words_default_minspace, 0.6, EXTERN double_VAR (textord_words_min_minspace, 0.3, "Fraction of xheight"); EXTERN double_VAR (textord_words_default_nonspace, 0.2, "Fraction of xheight"); -EXTERN double_VAR (textord_words_initial_lower, 0.25, -"Max initial cluster size"); +EXTERN double_VAR(textord_words_initial_lower, 0.25, + "Max initial cluster size"); EXTERN double_VAR (textord_words_initial_upper, 0.15, "Min initial cluster spacing"); EXTERN double_VAR (textord_words_minlarge, 0.75, @@ -67,7 +67,7 @@ EXTERN double_VAR (textord_pitch_rowsimilarity, 0.08, "Fraction of xheight for sameness"); EXTERN BOOL_VAR (textord_pitch_scalebigwords, FALSE, "Scale scores on big words"); -EXTERN double_VAR (words_initial_lower, 0.5, "Max initial cluster size"); +EXTERN double_VAR(words_initial_lower, 0.5, "Max initial cluster size"); EXTERN double_VAR (words_initial_upper, 0.15, "Min initial cluster spacing"); EXTERN double_VAR (words_default_prop_nonspace, 0.25, "Fraction of xheight"); EXTERN double_VAR (words_default_fixed_space, 0.75, "Fraction of xheight"); diff --git a/textord/tovars.h b/textord/tovars.h index 99edae1d..46315bb9 100644 --- a/textord/tovars.h +++ b/textord/tovars.h @@ -1,8 +1,8 @@ /********************************************************************** * File: tovars.h (Formerly to_vars.h) * Description: Variables used by textord. - * Author: Ray Smith - * Created: Tue Aug 24 16:55:02 BST 1993 + * Author: Ray Smith + * Created: Tue Aug 24 16:55:02 BST 1993 * * (C) Copyright 1993, Hewlett-Packard Ltd. 
** Licensed under the Apache License, Version 2.0 (the "License"); @@ -51,8 +51,8 @@ extern double_VAR_H (textord_words_default_minspace, 0.6, extern double_VAR_H (textord_words_min_minspace, 0.3, "Fraction of xheight"); extern double_VAR_H (textord_words_default_nonspace, 0.2, "Fraction of xheight"); -extern double_VAR_H (textord_words_initial_lower, 0.25, -"Max initial cluster size"); +extern double_VAR_H(textord_words_initial_lower, 0.25, + "Max initial cluster size"); extern double_VAR_H (textord_words_initial_upper, 0.15, "Min initial cluster spacing"); extern double_VAR_H (textord_words_minlarge, 0.75, @@ -69,7 +69,7 @@ extern double_VAR_H (textord_pitch_rowsimilarity, 0.08, "Fraction of xheight for sameness"); extern BOOL_VAR_H (textord_pitch_scalebigwords, FALSE, "Scale scores on big words"); -extern double_VAR_H (words_initial_lower, 0.5, "Max initial cluster size"); +extern double_VAR_H(words_initial_lower, 0.5, "Max initial cluster size"); extern double_VAR_H (words_initial_upper, 0.15, "Min initial cluster spacing"); extern double_VAR_H (words_default_prop_nonspace, 0.25, diff --git a/textord/workingpartset.h b/textord/workingpartset.h index f47c6667..2cbf53a4 100644 --- a/textord/workingpartset.h +++ b/textord/workingpartset.h @@ -18,8 +18,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_TEXTORD_WORKINGPARSET_H__ -#define TESSERACT_TEXTORD_WORKINGPARSET_H__ +#ifndef TESSERACT_TEXTORD_WORKINGPARSET_H_ +#define TESSERACT_TEXTORD_WORKINGPARSET_H_ #include "blobbox.h" // For TO_BLOCK_LIST and BLOCK_LIST. #include "colpartition.h" // For ColPartition_LIST. @@ -87,5 +87,4 @@ ELISTIZEH(WorkingPartSet) } // namespace tesseract. 
-#endif // TESSERACT_TEXTORD_WORKINGPARSET_H__ - +#endif // TESSERACT_TEXTORD_WORKINGPARSET_H_ diff --git a/training/CMakeLists.txt b/training/CMakeLists.txt index a733e73e..d8f9ad51 100644 --- a/training/CMakeLists.txt +++ b/training/CMakeLists.txt @@ -2,56 +2,62 @@ # tesseract # -if (STATIC OR NOT (WIN32 OR CYGWIN)) +if (NOT CPPAN_BUILD AND NOT (WIN32 OR CYGWIN)) + return() +endif() + +if (CPPAN_BUILD) + set(ICU_FOUND 1) +endif() # experimental -if (MSVC) +if (MSVC AND NOT CPPAN_BUILD) + include(CheckTypeSize) + check_type_size("void *" SIZEOF_VOID_P) -include(CheckTypeSize) -check_type_size("void *" SIZEOF_VOID_P) + if (SIZEOF_VOID_P EQUAL 8) + set(X64 1) + set(ARCH_DIR_NAME 64) + elseif (SIZEOF_VOID_P EQUAL 4) + set(X86 1) + set(ARCH_DIR_NAME 32) + else() + message(FATAL_ERROR "Cannot determine target architecture") + endif() -if (SIZEOF_VOID_P EQUAL 8) - set(X64 1) - set(ARCH_DIR_NAME 64) -elseif (SIZEOF_VOID_P EQUAL 4) - set(X86 1) - set(ARCH_DIR_NAME 32) -else() - message(FATAL_ERROR "Cannot determine target architecture") + set(icu_dir "${CMAKE_CURRENT_BINARY_DIR}/icu") + set(icu_archive "${icu_dir}/icu${ARCH_DIR_NAME}.zip") + + if (X86) + set(icu_hash 45167a240b60e36b59a87eda23490ce4) + else() + set(icu_hash 480c72491576c048de1218c3c5519399) + endif() + + message(STATUS "Downloading latest ICU binaries") + + file(DOWNLOAD + "http://download.icu-project.org/files/icu4c/56.1/icu4c-56_1-Win${ARCH_DIR_NAME}-msvc10.zip" + "${icu_archive}" + SHOW_PROGRESS + INACTIVITY_TIMEOUT 60 # seconds + EXPECTED_HASH MD5=${icu_hash} + ) + execute_process(COMMAND ${CMAKE_COMMAND} -E tar xz "${icu_archive}" + WORKING_DIRECTORY "${icu_dir}" + RESULT_VARIABLE __result + ) + if(NOT __result EQUAL 0) + message(FATAL_ERROR "error ${__result}") + endif() + + set(ICU_ROOT ${icu_dir}/icu) endif() - -set(icu_dir "${CMAKE_CURRENT_BINARY_DIR}/icu") -set(icu_archive "${icu_dir}/icu${ARCH_DIR_NAME}.zip") - -if (X86) - set(icu_hash 45167a240b60e36b59a87eda23490ce4) -else() - set(icu_hash 
480c72491576c048de1218c3c5519399) -endif() - -message(STATUS "Downloading latest ICU binaries") - -file(DOWNLOAD - "http://download.icu-project.org/files/icu4c/56.1/icu4c-56_1-Win${ARCH_DIR_NAME}-msvc10.zip" - "${icu_archive}" - SHOW_PROGRESS - INACTIVITY_TIMEOUT 60 # seconds - EXPECTED_HASH MD5=${icu_hash} -) -execute_process(COMMAND ${CMAKE_COMMAND} -E tar xz "${icu_archive}" - WORKING_DIRECTORY "${icu_dir}" - RESULT_VARIABLE __result -) -if(NOT __result EQUAL 0) - message(FATAL_ERROR "error ${__result}") -endif() - -set(ICU_ROOT ${icu_dir}/icu) - -endif(MSVC) # experimental +if (NOT CPPAN_BUILD) find_package(ICU COMPONENTS uc i18n) +endif() ######################################## # LIBRARY tessopt @@ -67,14 +73,12 @@ project_group (tessopt "Training Tools") set(common_training_src commandlineflags.cpp - commontraining.cpp -) -set(common_training_hdr commandlineflags.h + commontraining.cpp commontraining.h ) -add_library (common_training ${common_training_src} ${common_training_hdr}) -target_link_libraries (common_training tesseract tessopt) +add_library (common_training ${common_training_src}) +target_link_libraries (common_training libtesseract tessopt) project_group (common_training "Training Tools") @@ -83,7 +87,7 @@ project_group (common_training "Training Tools") ######################################## add_executable (ambiguous_words ambiguous_words.cpp) -target_link_libraries (ambiguous_words tesseract) +target_link_libraries (ambiguous_words libtesseract) project_group (ambiguous_words "Training Tools") @@ -101,7 +105,7 @@ project_group (classifier_tester "Training Tools") ######################################## add_executable (combine_tessdata combine_tessdata.cpp) -target_link_libraries (combine_tessdata tesseract) +target_link_libraries (combine_tessdata libtesseract) project_group (combine_tessdata "Training Tools") @@ -119,7 +123,7 @@ project_group (cntraining "Training Tools") ######################################## add_executable (dawg2wordlist 
dawg2wordlist.cpp) -target_link_libraries (dawg2wordlist tesseract) +target_link_libraries (dawg2wordlist libtesseract) project_group (dawg2wordlist "Training Tools") @@ -146,7 +150,7 @@ project_group (shapeclustering "Training Tools") ######################################## add_executable (unicharset_extractor unicharset_extractor.cpp) -target_link_libraries (unicharset_extractor tesseract tessopt) +target_link_libraries (unicharset_extractor libtesseract tessopt) project_group (unicharset_extractor "Training Tools") @@ -155,29 +159,64 @@ project_group (unicharset_extractor "Training Tools") ######################################## add_executable (wordlist2dawg wordlist2dawg.cpp) -target_link_libraries (wordlist2dawg tesseract) +target_link_libraries (wordlist2dawg libtesseract) project_group (wordlist2dawg "Training Tools") +if (ICU_FOUND) + +if (NOT CPPAN_BUILD) +include_directories(${ICU_INCLUDE_DIRS}) +endif() + +######################################## +# LIBRARY unicharset_training +######################################## + +set(unicharset_training_src + fileio.cpp + fileio.h + icuerrorcode.h + lstmtester.cpp + lstmtester.h + normstrngs.cpp + normstrngs.h + unicharset_training_utils.cpp + unicharset_training_utils.h +) +add_library (unicharset_training ${unicharset_training_src}) +if (NOT CPPAN_BUILD) +target_link_libraries (unicharset_training common_training ${ICU_LIBRARIES}) +else() +target_link_libraries (unicharset_training common_training pvt.cppan.demo.unicode.icu.i18n) +endif() +project_group (unicharset_training "Training Tools") + + +######################################## +# EXECUTABLE lstmeval +######################################## + +add_executable (lstmeval lstmeval.cpp) +target_link_libraries (lstmeval unicharset_training) +project_group (lstmeval "Training Tools") + + +######################################## +# EXECUTABLE lstmtraining +######################################## + +add_executable (lstmtraining lstmtraining.cpp) 
+target_link_libraries (lstmtraining unicharset_training) +project_group (lstmtraining "Training Tools") + + ######################################## # EXECUTABLE set_unicharset_properties ######################################## -if (ICU_FOUND) - -include_directories(${ICU_INCLUDE_DIRS}) - -add_executable (set_unicharset_properties - set_unicharset_properties.cpp - unicharset_training_utils.cpp - unicharset_training_utils.h - fileio.cpp - fileio.h - normstrngs.cpp - normstrngs.h - icuerrorcode.h -) -target_link_libraries (set_unicharset_properties common_training ${ICU_LIBRARIES}) +add_executable (set_unicharset_properties set_unicharset_properties.cpp) +target_link_libraries (set_unicharset_properties unicharset_training) project_group (set_unicharset_properties "Training Tools") @@ -185,6 +224,10 @@ project_group (set_unicharset_properties "Training Tools") # EXECUTABLE text2image ######################################## +if (NOT CPPAN_BUILD) +find_package(PkgConfig) +endif() + if (PKG_CONFIG_FOUND) pkg_check_modules(Pango REQUIRED pango) @@ -199,8 +242,6 @@ set(text2image_src boxchar.h degradeimage.cpp degradeimage.h - fileio.cpp - fileio.h ligature_table.cpp ligature_table.h normstrngs.cpp @@ -221,8 +262,7 @@ endif() add_executable (text2image ${text2image_src}) target_include_directories (text2image BEFORE PRIVATE ${Cairo_INCLUDE_DIRS} ${Pango_INCLUDE_DIRS}) target_compile_definitions (text2image PRIVATE -DPANGO_ENABLE_ENGINE) -target_link_libraries (text2image tesseract common_training - ${ICU_LIBRARIES} +target_link_libraries (text2image libtesseract unicharset_training ${Pango_LIBRARIES} ${Cairo_LIBRARIES} ${PangoCairo_LIBRARIES} @@ -233,6 +273,5 @@ project_group (text2image "Training Tools") endif(PKG_CONFIG_FOUND) endif(ICU_FOUND) -endif(STATIC OR NOT (WIN32 OR CYGWIN)) ############################################################################### diff --git a/training/Makefile.am b/training/Makefile.am index fe3d85bc..defc550a 100644 --- 
a/training/Makefile.am +++ b/training/Makefile.am @@ -3,6 +3,7 @@ AM_CPPFLAGS += \ -DUSE_STD_NAMESPACE -DPANGO_ENABLE_ENGINE\ -I$(top_srcdir)/ccmain -I$(top_srcdir)/api \ -I$(top_srcdir)/ccutil -I$(top_srcdir)/ccstruct \ + -I$(top_srcdir)/lstm -I$(top_srcdir)/arch \ -I$(top_srcdir)/viewer \ -I$(top_srcdir)/textord -I$(top_srcdir)/dict \ -I$(top_srcdir)/classify -I$(top_srcdir)/display \ @@ -10,14 +11,6 @@ AM_CPPFLAGS += \ EXTRA_DIST = language-specific.sh tesstrain.sh tesstrain_utils.sh -if MINGW -# try static build -#AM_LDFLAGS += -all-static -#libic=-lsicuin -licudt -lsicuuc -libicu=-licuin -licuuc -else -libicu=-licui18n -licuuc -endif # TODO: training programs can not be linked to shared library created # with -fvisibility if VISIBILITY @@ -26,7 +19,7 @@ endif noinst_HEADERS = \ boxchar.h commandlineflags.h commontraining.h degradeimage.h \ - fileio.h icuerrorcode.h ligature_table.h normstrngs.h \ + fileio.h icuerrorcode.h ligature_table.h lstmtester.h normstrngs.h \ mergenf.h pango_font_info.h stringrenderer.h \ tessopt.h tlog.h unicharset_training_utils.h util.h @@ -38,14 +31,14 @@ libtesseract_training_la_LIBADD = \ libtesseract_training_la_SOURCES = \ boxchar.cpp commandlineflags.cpp commontraining.cpp degradeimage.cpp \ - fileio.cpp ligature_table.cpp normstrngs.cpp pango_font_info.cpp \ + fileio.cpp ligature_table.cpp lstmtester.cpp normstrngs.cpp pango_font_info.cpp \ stringrenderer.cpp tlog.cpp unicharset_training_utils.cpp libtesseract_tessopt_la_SOURCES = \ tessopt.cpp bin_PROGRAMS = ambiguous_words classifier_tester cntraining combine_tessdata \ - dawg2wordlist mftraining set_unicharset_properties shapeclustering \ + dawg2wordlist lstmeval lstmtraining mftraining set_unicharset_properties shapeclustering \ text2image unicharset_extractor wordlist2dawg ambiguous_words_SOURCES = ambiguous_words.cpp @@ -58,12 +51,13 @@ ambiguous_words_LDADD += \ ../textord/libtesseract_textord.la \ ../classify/libtesseract_classify.la \ ../dict/libtesseract_dict.la \ + 
../arch/libtesseract_avx.la \ + ../arch/libtesseract_sse.la \ + ../lstm/libtesseract_lstm.la \ ../ccstruct/libtesseract_ccstruct.la \ ../cutil/libtesseract_cutil.la \ ../viewer/libtesseract_viewer.la \ ../ccmain/libtesseract_main.la \ - ../cube/libtesseract_cube.la \ - ../neural_networks/runtime/libtesseract_neural.la \ ../wordrec/libtesseract_wordrec.la \ ../ccutil/libtesseract_ccutil.la else @@ -82,12 +76,13 @@ classifier_tester_LDADD += \ ../textord/libtesseract_textord.la \ ../classify/libtesseract_classify.la \ ../dict/libtesseract_dict.la \ + ../arch/libtesseract_avx.la \ + ../arch/libtesseract_sse.la \ + ../lstm/libtesseract_lstm.la \ ../ccstruct/libtesseract_ccstruct.la \ ../cutil/libtesseract_cutil.la \ ../viewer/libtesseract_viewer.la \ ../ccmain/libtesseract_main.la \ - ../cube/libtesseract_cube.la \ - ../neural_networks/runtime/libtesseract_neural.la \ ../wordrec/libtesseract_wordrec.la \ ../ccutil/libtesseract_ccutil.la else @@ -115,12 +110,13 @@ cntraining_LDADD += \ ../textord/libtesseract_textord.la \ ../classify/libtesseract_classify.la \ ../dict/libtesseract_dict.la \ + ../arch/libtesseract_avx.la \ + ../arch/libtesseract_sse.la \ + ../lstm/libtesseract_lstm.la \ ../ccstruct/libtesseract_ccstruct.la \ ../cutil/libtesseract_cutil.la \ ../viewer/libtesseract_viewer.la \ ../ccmain/libtesseract_main.la \ - ../cube/libtesseract_cube.la \ - ../neural_networks/runtime/libtesseract_neural.la \ ../wordrec/libtesseract_wordrec.la \ ../ccutil/libtesseract_ccutil.la else @@ -136,12 +132,13 @@ if USING_MULTIPLELIBS dawg2wordlist_LDADD += \ ../classify/libtesseract_classify.la \ ../dict/libtesseract_dict.la \ + ../arch/libtesseract_avx.la \ + ../arch/libtesseract_sse.la \ + ../lstm/libtesseract_lstm.la \ ../ccstruct/libtesseract_ccstruct.la \ ../cutil/libtesseract_cutil.la \ ../viewer/libtesseract_viewer.la \ ../ccmain/libtesseract_main.la \ - ../cube/libtesseract_cube.la \ - ../neural_networks/runtime/libtesseract_neural.la \ ../wordrec/libtesseract_wordrec.la 
\ ../textord/libtesseract_textord.la \ ../ccutil/libtesseract_ccutil.la @@ -150,22 +147,74 @@ dawg2wordlist_LDADD += \ ../api/libtesseract.la endif +lstmeval_SOURCES = lstmeval.cpp +#lstmeval_LDFLAGS = -static +lstmeval_LDADD = \ + libtesseract_training.la \ + libtesseract_tessopt.la \ + $(ICU_UC_LIBS) +if USING_MULTIPLELIBS +lstmeval_LDADD += \ + ../textord/libtesseract_textord.la \ + ../classify/libtesseract_classify.la \ + ../dict/libtesseract_dict.la \ + ../arch/libtesseract_avx.la \ + ../arch/libtesseract_sse.la \ + ../lstm/libtesseract_lstm.la \ + ../ccstruct/libtesseract_ccstruct.la \ + ../cutil/libtesseract_cutil.la \ + ../viewer/libtesseract_viewer.la \ + ../ccmain/libtesseract_main.la \ + ../wordrec/libtesseract_wordrec.la \ + ../ccutil/libtesseract_ccutil.la +else +lstmeval_LDADD += \ + ../api/libtesseract.la +endif + +lstmtraining_SOURCES = lstmtraining.cpp +#lstmtraining_LDFLAGS = -static +lstmtraining_LDADD = \ + libtesseract_training.la \ + libtesseract_tessopt.la \ + $(ICU_I18N_LIBS) $(ICU_UC_LIBS) +if USING_MULTIPLELIBS +lstmtraining_LDADD += \ + ../textord/libtesseract_textord.la \ + ../classify/libtesseract_classify.la \ + ../dict/libtesseract_dict.la \ + ../arch/libtesseract_avx.la \ + ../arch/libtesseract_sse.la \ + ../lstm/libtesseract_lstm.la \ + ../ccstruct/libtesseract_ccstruct.la \ + ../cutil/libtesseract_cutil.la \ + ../viewer/libtesseract_viewer.la \ + ../ccmain/libtesseract_main.la \ + ../wordrec/libtesseract_wordrec.la \ + ../ccutil/libtesseract_ccutil.la +else +lstmtraining_LDADD += \ + ../api/libtesseract.la +endif + mftraining_SOURCES = mftraining.cpp mergenf.cpp #mftraining_LDFLAGS = -static mftraining_LDADD = \ libtesseract_training.la \ - libtesseract_tessopt.la + libtesseract_tessopt.la \ + $(ICU_UC_LIBS) if USING_MULTIPLELIBS mftraining_LDADD += \ ../textord/libtesseract_textord.la \ ../classify/libtesseract_classify.la \ ../dict/libtesseract_dict.la \ + ../arch/libtesseract_avx.la \ + ../arch/libtesseract_sse.la \ + 
../lstm/libtesseract_lstm.la \ ../ccstruct/libtesseract_ccstruct.la \ ../cutil/libtesseract_cutil.la \ ../viewer/libtesseract_viewer.la \ ../ccmain/libtesseract_main.la \ - ../cube/libtesseract_cube.la \ - ../neural_networks/runtime/libtesseract_neural.la \ ../wordrec/libtesseract_wordrec.la \ ../ccutil/libtesseract_ccutil.la else @@ -174,22 +223,22 @@ mftraining_LDADD += \ endif set_unicharset_properties_SOURCES = set_unicharset_properties.cpp -#set_unicharset_properties_LDFLAGS = $(pkg-config --libs icu-uc) set_unicharset_properties_LDADD = \ libtesseract_training.la \ libtesseract_tessopt.la \ - $(libicu) + $(ICU_I18N_LIBS) $(ICU_UC_LIBS) if USING_MULTIPLELIBS set_unicharset_properties_LDADD += \ ../textord/libtesseract_textord.la \ ../classify/libtesseract_classify.la \ ../dict/libtesseract_dict.la \ ../ccstruct/libtesseract_ccstruct.la \ + ../arch/libtesseract_avx.la \ + ../arch/libtesseract_sse.la \ + ../lstm/libtesseract_lstm.la \ ../cutil/libtesseract_cutil.la \ ../viewer/libtesseract_viewer.la \ ../ccmain/libtesseract_main.la \ - ../cube/libtesseract_cube.la \ - ../neural_networks/runtime/libtesseract_neural.la \ ../wordrec/libtesseract_wordrec.la \ ../ccutil/libtesseract_ccutil.la else @@ -207,12 +256,13 @@ shapeclustering_LDADD += \ ../textord/libtesseract_textord.la \ ../classify/libtesseract_classify.la \ ../dict/libtesseract_dict.la \ + ../arch/libtesseract_avx.la \ + ../arch/libtesseract_sse.la \ + ../lstm/libtesseract_lstm.la \ ../ccstruct/libtesseract_ccstruct.la \ ../cutil/libtesseract_cutil.la \ ../viewer/libtesseract_viewer.la \ ../ccmain/libtesseract_main.la \ - ../cube/libtesseract_cube.la \ - ../neural_networks/runtime/libtesseract_neural.la \ ../wordrec/libtesseract_wordrec.la \ ../ccutil/libtesseract_ccutil.la else @@ -224,25 +274,27 @@ text2image_SOURCES = text2image.cpp #text2image_LDFLAGS = -static text2image_LDADD = \ libtesseract_training.la \ - libtesseract_tessopt.la + libtesseract_tessopt.la \ + $(ICU_I18N_LIBS) $(ICU_UC_LIBS) if 
USING_MULTIPLELIBS text2image_LDADD += \ ../textord/libtesseract_textord.la \ ../classify/libtesseract_classify.la \ ../dict/libtesseract_dict.la \ + ../arch/libtesseract_avx.la \ + ../arch/libtesseract_sse.la \ + ../lstm/libtesseract_lstm.la \ ../ccstruct/libtesseract_ccstruct.la \ ../cutil/libtesseract_cutil.la \ ../viewer/libtesseract_viewer.la \ ../ccmain/libtesseract_main.la \ - ../cube/libtesseract_cube.la \ - ../neural_networks/runtime/libtesseract_neural.la \ ../wordrec/libtesseract_wordrec.la \ ../ccutil/libtesseract_ccutil.la else text2image_LDADD += \ ../api/libtesseract.la endif -text2image_LDADD += $(libicu) -lpango-1.0 -lpangocairo-1.0 \ +text2image_LDADD += $(ICU_UC_LIBS) -lpango-1.0 -lpangocairo-1.0 \ -lgobject-2.0 -lglib-2.0 -lcairo -lpangoft2-1.0 -lfontconfig unicharset_extractor_SOURCES = unicharset_extractor.cpp @@ -266,12 +318,13 @@ if USING_MULTIPLELIBS wordlist2dawg_LDADD += \ ../classify/libtesseract_classify.la \ ../dict/libtesseract_dict.la \ + ../arch/libtesseract_avx.la \ + ../arch/libtesseract_sse.la \ + ../lstm/libtesseract_lstm.la \ ../ccstruct/libtesseract_ccstruct.la \ ../cutil/libtesseract_cutil.la \ ../viewer/libtesseract_viewer.la \ ../ccmain/libtesseract_main.la \ - ../cube/libtesseract_cube.la \ - ../neural_networks/runtime/libtesseract_neural.la \ ../wordrec/libtesseract_wordrec.la \ ../textord/libtesseract_textord.la \ ../ccutil/libtesseract_ccutil.la @@ -309,3 +362,17 @@ shapeclustering_LDFLAGS = $(OPENCL_LDFLAGS) text2image_LDFLAGS = $(OPENCL_LDFLAGS) unicharset_extractor_LDFLAGS = $(OPENCL_LDFLAGS) wordlist2dawg_LDFLAGS = $(OPENCL_LDFLAGS) + +ambiguous_words_LDADD += $(LEPTONICA_LIBS) +classifier_tester_LDADD += $(LEPTONICA_LIBS) +cntraining_LDADD += $(LEPTONICA_LIBS) +combine_tessdata_LDADD += $(LEPTONICA_LIBS) +dawg2wordlist_LDADD += $(LEPTONICA_LIBS) +lstmeval_LDADD += $(LEPTONICA_LIBS) +lstmtraining_LDADD += $(LEPTONICA_LIBS) +mftraining_LDADD += $(LEPTONICA_LIBS) +set_unicharset_properties_LDADD += $(LEPTONICA_LIBS) 
+shapeclustering_LDADD += $(LEPTONICA_LIBS) +text2image_LDADD += $(LEPTONICA_LIBS) +unicharset_extractor_LDFLAGS += $(LEPTONICA_LIBS) +wordlist2dawg_LDADD += $(LEPTONICA_LIBS) diff --git a/training/ambiguous_words.cpp b/training/ambiguous_words.cpp index 23358089..b6bc7cdc 100644 --- a/training/ambiguous_words.cpp +++ b/training/ambiguous_words.cpp @@ -55,11 +55,11 @@ int main(int argc, char** argv) { GenericVector vars_values; vars_vec.push_back("output_ambig_words_file"); vars_values.push_back(output_file_str); - api.Init(tessdata_dir, lang.string(), tesseract::OEM_TESSERACT_ONLY, - NULL, 0, &vars_vec, &vars_values, false); + api.Init(tessdata_dir, lang.string(), tesseract::OEM_TESSERACT_ONLY, nullptr, + 0, &vars_vec, &vars_values, false); tesseract::Dict &dict = api.tesseract()->getDict(); FILE *input_file = fopen(input_file_str, "rb"); - if (input_file == NULL) { + if (input_file == nullptr) { tprintf("Failed to open input wordlist file %s\n", input_file_str); exit(1); } @@ -67,10 +67,10 @@ int main(int argc, char** argv) { // Read word list and call Dict::NoDangerousAmbig() for each word // to record ambiguities in the output file. - while (fgets(str, CHARS_PER_LINE, input_file) != NULL) { + while (fgets(str, CHARS_PER_LINE, input_file) != nullptr) { chomp_string(str); // remove newline WERD_CHOICE word(str, dict.getUnicharset()); - dict.NoDangerousAmbig(&word, NULL, false, NULL); + dict.NoDangerousAmbig(&word, nullptr, false, nullptr); } // Clean up. 
fclose(input_file); diff --git a/training/boxchar.cpp b/training/boxchar.cpp index b99c12a6..16092101 100644 --- a/training/boxchar.cpp +++ b/training/boxchar.cpp @@ -39,7 +39,7 @@ const int kMinNewlineRatio = 5; namespace tesseract { BoxChar::BoxChar(const char* utf8_str, int len) : ch_(utf8_str, len) { - box_ = NULL; + box_ = nullptr; } BoxChar::~BoxChar() { boxDestroy(&box_); } @@ -49,10 +49,11 @@ void BoxChar::AddBox(int x, int y, int width, int height) { } /* static */ -void BoxChar::TranslateBoxes(int xshift, int yshift, vector* boxes) { +void BoxChar::TranslateBoxes(int xshift, int yshift, + std::vector* boxes) { for (int i = 0; i < boxes->size(); ++i) { BOX* box = (*boxes)[i]->box_; - if (box != NULL) { + if (box != nullptr) { box->x += xshift; box->y += yshift; } @@ -62,36 +63,36 @@ void BoxChar::TranslateBoxes(int xshift, int yshift, vector* boxes) { // Prepares for writing the boxes to a file by inserting newlines, spaces, // and re-ordering so the boxes are strictly left-to-right. /* static */ -void BoxChar::PrepareToWrite(vector* boxes) { +void BoxChar::PrepareToWrite(std::vector* boxes) { bool rtl_rules = ContainsMostlyRTL(*boxes); bool vertical_rules = MostlyVertical(*boxes); InsertNewlines(rtl_rules, vertical_rules, boxes); InsertSpaces(rtl_rules, vertical_rules, boxes); for (int i = 0; i < boxes->size(); ++i) { - if ((*boxes)[i]->box_ == NULL) tprintf("Null box at index %d\n", i); + if ((*boxes)[i]->box_ == nullptr) tprintf("Null box at index %d\n", i); } if (rtl_rules) { ReorderRTLText(boxes); } - tprintf("Rtl = %d ,vertical=%d\n", rtl_rules, vertical_rules); } // Inserts newline (tab) characters into the vector at newline positions. 
/* static */ void BoxChar::InsertNewlines(bool rtl_rules, bool vertical_rules, - vector* boxes) { + std::vector* boxes) { int prev_i = -1; int max_shift = 0; for (int i = 0; i < boxes->size(); ++i) { Box* box = (*boxes)[i]->box_; - if (box == NULL) { + if (box == nullptr) { if (prev_i < 0 || prev_i < i - 1 || i + 1 == boxes->size()) { // Erase null boxes at the start of a line and after another null box. do { delete (*boxes)[i]; boxes->erase(boxes->begin() + i); --i; - } while (i >= 0 && i + 1 == boxes->size() && (*boxes)[i]->box_ == NULL); + } while (i >= 0 && i + 1 == boxes->size() && + (*boxes)[i]->box_ == nullptr); } continue; } @@ -139,18 +140,18 @@ void BoxChar::InsertNewlines(bool rtl_rules, bool vertical_rules, } } -// Converts NULL boxes to space characters, with appropriate bounding boxes. +// Converts nullptr boxes to space characters, with appropriate bounding boxes. /* static */ void BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules, - vector* boxes) { + std::vector* boxes) { // After InsertNewlines, any remaining null boxes are not newlines, and are // singletons, so add a box to each remaining null box. for (int i = 1; i + 1 < boxes->size(); ++i) { Box* box = (*boxes)[i]->box_; - if (box == NULL) { + if (box == nullptr) { Box* prev = (*boxes)[i - 1]->box_; Box* next = (*boxes)[i + 1]->box_; - ASSERT_HOST(prev != NULL && next != NULL); + ASSERT_HOST(prev != nullptr && next != nullptr); int top = MIN(prev->y, next->y); int bottom = MAX(prev->y + prev->h, next->y + next->h); int left = prev->x + prev->w; @@ -170,15 +171,15 @@ void BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules, j >= 0 && (*boxes)[j]->ch_ != " " && (*boxes)[j]->ch_ != "\t"; --j) { prev = (*boxes)[j]->box_; - ASSERT_HOST(prev != NULL); + ASSERT_HOST(prev != nullptr); if (prev->x < right) { right = prev->x; } } // Left becomes the max right of all next boxes forward to the first // space or newline. 
- for (int j = i + 2; j < boxes->size() && (*boxes)[j]->box_ != NULL && - (*boxes)[j]->ch_ != "\t"; + for (int j = i + 2; j < boxes->size() && (*boxes)[j]->box_ != nullptr && + (*boxes)[j]->ch_ != "\t"; ++j) { next = (*boxes)[j]->box_; if (next->x + next->w > left) { @@ -198,7 +199,7 @@ void BoxChar::InsertSpaces(bool rtl_rules, bool vertical_rules, // Reorders text in a right-to-left script in left-to-right order. /* static */ -void BoxChar::ReorderRTLText(vector* boxes) { +void BoxChar::ReorderRTLText(std::vector* boxes) { // After adding newlines and spaces, this task is simply a matter of sorting // by left each group of boxes between newlines. BoxCharPtrSort sorter; @@ -206,13 +207,13 @@ void BoxChar::ReorderRTLText(vector* boxes) { for (int start = 0; start < boxes->size(); start = end + 1) { end = start + 1; while (end < boxes->size() && (*boxes)[end]->ch_ != "\t") ++end; - sort(boxes->begin() + start, boxes->begin() + end, sorter); + std::sort(boxes->begin() + start, boxes->begin() + end, sorter); } } // Returns true if the vector contains mostly RTL characters. /* static */ -bool BoxChar::ContainsMostlyRTL(const vector& boxes) { +bool BoxChar::ContainsMostlyRTL(const std::vector& boxes) { int num_rtl = 0, num_ltr = 0; for (int i = 0; i < boxes.size(); ++i) { // Convert the unichar to UTF32 representation @@ -241,10 +242,10 @@ bool BoxChar::ContainsMostlyRTL(const vector& boxes) { // Returns true if the text is mostly laid out vertically. 
/* static */ -bool BoxChar::MostlyVertical(const vector& boxes) { +bool BoxChar::MostlyVertical(const std::vector& boxes) { inT64 total_dx = 0, total_dy = 0; for (int i = 1; i < boxes.size(); ++i) { - if (boxes[i - 1]->box_ != NULL && boxes[i]->box_ != NULL && + if (boxes[i - 1]->box_ != nullptr && boxes[i]->box_ != nullptr && boxes[i - 1]->page_ == boxes[i]->page_) { int dx = boxes[i]->box_->x - boxes[i - 1]->box_->x; int dy = boxes[i]->box_->y - boxes[i - 1]->box_->y; @@ -260,7 +261,7 @@ bool BoxChar::MostlyVertical(const vector& boxes) { // Returns the total length of all the strings in the boxes. /* static */ -int BoxChar::TotalByteLength(const vector& boxes) { +int BoxChar::TotalByteLength(const std::vector& boxes) { int total_length = 0; for (int i = 0; i < boxes.size(); ++i) total_length += boxes[i]->ch_.size(); return total_length; @@ -270,7 +271,8 @@ int BoxChar::TotalByteLength(const vector& boxes) { // The rotation is in radians clockwise about the given center. /* static */ void BoxChar::RotateBoxes(float rotation, int xcenter, int ycenter, - int start_box, int end_box, vector* boxes) { + int start_box, int end_box, + std::vector* boxes) { Boxa* orig = boxaCreate(0); for (int i = start_box; i < end_box; ++i) { BOX* box = (*boxes)[i]->box_; @@ -290,14 +292,21 @@ void BoxChar::RotateBoxes(float rotation, int xcenter, int ycenter, const int kMaxLineLength = 1024; /* static */ void BoxChar::WriteTesseractBoxFile(const string& filename, int height, - const vector& boxes) { + const std::vector& boxes) { + string output = GetTesseractBoxStr(height, boxes); + File::WriteStringToFileOrDie(output, filename); +} + +/* static */ +string BoxChar::GetTesseractBoxStr(int height, + const std::vector& boxes) { string output; char buffer[kMaxLineLength]; for (int i = 0; i < boxes.size(); ++i) { const Box* box = boxes[i]->box_; - if (box == NULL) { + if (box == nullptr) { tprintf("Error: Call PrepareToWrite before WriteTesseractBoxFile!!\n"); - return; + return ""; } int 
nbytes = snprintf(buffer, kMaxLineLength, "%s %d %d %d %d %d\n", @@ -305,6 +314,7 @@ void BoxChar::WriteTesseractBoxFile(const string& filename, int height, box->x + box->w, height - box->y, boxes[i]->page_); output.append(buffer, nbytes); } - File::WriteStringToFileOrDie(output, filename); + return output; } + } // namespace tesseract diff --git a/training/boxchar.h b/training/boxchar.h index 27b568a1..67fc4ae5 100644 --- a/training/boxchar.h +++ b/training/boxchar.h @@ -60,32 +60,33 @@ class BoxChar { // Sort function for sorting by left edge of box. Note that this will not // work properly until after InsertNewlines and InsertSpaces. bool operator<(const BoxChar& other) const { - if (box_ == NULL) return true; - if (other.box_ == NULL) return false; + if (box_ == nullptr) return true; + if (other.box_ == nullptr) return false; return box_->x < other.box_->x; } static void TranslateBoxes(int xshift, int yshift, - vector* boxes); + std::vector* boxes); // Prepares for writing the boxes to a file by inserting newlines, spaces, // and re-ordering so the boxes are strictly left-to-right. - static void PrepareToWrite(vector* boxes); + static void PrepareToWrite(std::vector* boxes); // Inserts newline (tab) characters into the vector at newline positions. static void InsertNewlines(bool rtl_rules, bool vertical_rules, - vector* boxes); - // Converts NULL boxes to space characters, with appropriate bounding boxes. + std::vector* boxes); + // Converts nullptr boxes to space characters, with appropriate bounding + // boxes. static void InsertSpaces(bool rtl_rules, bool vertical_rules, - vector* boxes); + std::vector* boxes); // Reorders text in a right-to-left script in left-to-right order. - static void ReorderRTLText(vector* boxes); + static void ReorderRTLText(std::vector* boxes); // Returns true if the vector contains mostly RTL characters. 
- static bool ContainsMostlyRTL(const vector& boxes); + static bool ContainsMostlyRTL(const std::vector& boxes); // Returns true if the text is mostly laid out vertically. - static bool MostlyVertical(const vector& boxes); + static bool MostlyVertical(const std::vector& boxes); // Returns the total length of all the strings in the boxes. - static int TotalByteLength(const vector& boxes); + static int TotalByteLength(const std::vector& boxes); // Rotate the vector of boxes between start and end by the given rotation. // The rotation is in radians clockwise about the given center. @@ -94,12 +95,16 @@ class BoxChar { int ycenter, int start_box, int end_box, - vector* boxes); + std::vector* boxes); // Create a tesseract box file from the vector of boxes. The image height // is needed to convert to tesseract coordinates. static void WriteTesseractBoxFile(const string& name, int height, - const vector& boxes); + const std::vector& boxes); + // Gets the tesseract box file as a string from the vector of boxes. + // The image height is needed to convert to tesseract coordinates. 
+ static string GetTesseractBoxStr(int height, + const std::vector& boxes); private: string ch_; diff --git a/training/classifier_tester.cpp b/training/classifier_tester.cpp index 48f3781e..37b0b60b 100644 --- a/training/classifier_tester.cpp +++ b/training/classifier_tester.cpp @@ -22,9 +22,6 @@ #endif // USE_STD_NAMESPACE #include "baseapi.h" #include "commontraining.h" -#ifndef NO_CUBE_BUILD -#include "cubeclassifier.h" -#endif // NO_CUBE_BUILD #include "mastertrainer.h" #include "params.h" #include "strngs.h" @@ -34,23 +31,14 @@ STRING_PARAM_FLAG(classifier, "", "Classifier to test"); STRING_PARAM_FLAG(lang, "eng", "Language to test"); STRING_PARAM_FLAG(tessdata_dir, "", "Directory of traineddata files"); DECLARE_INT_PARAM_FLAG(debug_level); -DECLARE_STRING_PARAM_FLAG(T); enum ClassifierName { CN_PRUNER, CN_FULL, -#ifndef NO_CUBE_BUILD - CN_CUBE, - CN_CUBETESS, -#endif // NO_CUBE_BUILD CN_COUNT }; -const char* names[] = {"pruner", "full", -#ifndef NO_CUBE_BUILD - "cube", "cubetess", -#endif // NO_CUBE_BUILD - NULL }; +const char* names[] = {"pruner", "full", nullptr}; static tesseract::ShapeClassifier* InitializeClassifier( const char* classifer_name, const UNICHARSET& unicharset, @@ -66,60 +54,37 @@ static tesseract::ShapeClassifier* InitializeClassifier( } if (classifier == CN_COUNT) { fprintf(stderr, "Invalid classifier name:%s\n", FLAGS_classifier.c_str()); - return NULL; + return nullptr; } // We need to initialize tesseract to test. 
*api = new tesseract::TessBaseAPI; tesseract::OcrEngineMode engine_mode = tesseract::OEM_TESSERACT_ONLY; -#ifndef NO_CUBE_BUILD - if (classifier == CN_CUBE || classifier == CN_CUBETESS) - engine_mode = tesseract::OEM_TESSERACT_CUBE_COMBINED; -#endif // NO_CUBE_BUILD - tesseract::Tesseract* tesseract = NULL; - tesseract::Classify* classify = NULL; + tesseract::Tesseract* tesseract = nullptr; + tesseract::Classify* classify = nullptr; if ( -#ifndef NO_CUBE_BUILD - classifier == CN_CUBE || classifier == CN_CUBETESS || -#endif // NO_CUBE_BUILD classifier == CN_PRUNER || classifier == CN_FULL) { -#ifndef NO_CUBE_BUILD - (*api)->SetVariable("cube_debug_level", "2"); -#endif // NO_CUBE_BUILD if ((*api)->Init(FLAGS_tessdata_dir.c_str(), FLAGS_lang.c_str(), engine_mode) < 0) { fprintf(stderr, "Tesseract initialization failed!\n"); - return NULL; + return nullptr; } tesseract = const_cast((*api)->tesseract()); classify = reinterpret_cast(tesseract); - if (classify->shape_table() == NULL) { + if (classify->shape_table() == nullptr) { fprintf(stderr, "Tesseract must contain a ShapeTable!\n"); - return NULL; + return nullptr; } } - tesseract::ShapeClassifier* shape_classifier = NULL; + tesseract::ShapeClassifier* shape_classifier = nullptr; - if (!FLAGS_T.empty()) { - const char* config_name; - while ((config_name = GetNextFilename(argc, argv)) != NULL) { - tprintf("Reading config file %s ...\n", config_name); - (*api)->ReadConfigFile(config_name); - } - } if (classifier == CN_PRUNER) { shape_classifier = new tesseract::TessClassifier(true, classify); } else if (classifier == CN_FULL) { shape_classifier = new tesseract::TessClassifier(false, classify); -#ifndef NO_CUBE_BUILD - } else if (classifier == CN_CUBE) { - shape_classifier = new tesseract::CubeClassifier(tesseract); - } else if (classifier == CN_CUBETESS) { - shape_classifier = new tesseract::CubeTessClassifier(tesseract); -#endif // NO_CUBE_BUILD } else { fprintf(stderr, "%s tester not yet implemented\n", 
classifer_name); - return NULL; + return nullptr; } tprintf("Testing classifier %s:\n", classifer_name); return shape_classifier; @@ -143,19 +108,17 @@ static tesseract::ShapeClassifier* InitializeClassifier( // Available values of classifier (x above) are: // pruner : Tesseract class pruner only. // full : Tesseract full classifier. -// cube : Cube classifier. (Not possible with an input trainer.) -// cubetess : Tesseract class pruner with rescoring by Cube. (Not possible // with an input trainer.) int main(int argc, char **argv) { ParseArguments(&argc, &argv); STRING file_prefix; - tesseract::MasterTrainer* trainer = tesseract::LoadTrainingData( - argc, argv, false, NULL, &file_prefix); + tesseract::MasterTrainer* trainer = + tesseract::LoadTrainingData(argc, argv, false, nullptr, &file_prefix); tesseract::TessBaseAPI* api; // Decode the classifier string. tesseract::ShapeClassifier* shape_classifier = InitializeClassifier( FLAGS_classifier.c_str(), trainer->unicharset(), argc, argv, &api); - if (shape_classifier == NULL) { + if (shape_classifier == nullptr) { fprintf(stderr, "Classifier init failed!:%s\n", FLAGS_classifier.c_str()); return 1; } @@ -165,9 +128,9 @@ int main(int argc, char **argv) { // We want to test with replicated samples too. trainer->ReplicateAndRandomizeSamplesIfRequired(); - trainer->TestClassifierOnSamples(tesseract:: CT_UNICHAR_TOP1_ERR, + trainer->TestClassifierOnSamples(tesseract::CT_UNICHAR_TOP1_ERR, MAX(3, FLAGS_debug_level), false, - shape_classifier, NULL); + shape_classifier, nullptr); delete shape_classifier; delete api; delete trainer; diff --git a/training/cntraining.cpp b/training/cntraining.cpp index ab19ddb9..2ddcf92e 100644 --- a/training/cntraining.cpp +++ b/training/cntraining.cpp @@ -20,7 +20,6 @@ ** limitations under the License. 
******************************************************************************/ - /*---------------------------------------------------------------------------- Include Files and Type Defines ----------------------------------------------------------------------------*/ @@ -53,10 +52,8 @@ int main ( Private Function Prototypes ----------------------------------------------------------------------------*/ -void WriteNormProtos ( - const char *Directory, - LIST LabeledProtoList, - CLUSTERER *Clusterer); +void WriteNormProtos(const char *Directory, LIST LabeledProtoList, + const FEATURE_DESC_STRUCT *feature_desc); /* PARAMDESC *ConvertToPARAMDESC( @@ -81,7 +78,6 @@ CLUSTERCONFIG CNConfig = elliptical, 0.025, 0.05, 0.8, 1e-3, 0 }; - /*---------------------------------------------------------------------------- Public Code ----------------------------------------------------------------------------*/ @@ -134,15 +130,14 @@ CLUSTERCONFIG CNConfig = * @note Exceptions: none * @note History: Fri Aug 18 08:56:17 1989, DSJ, Created. */ -int main(int argc, char* argv[]) -{ +int main(int argc, char *argv[]) { // Set the global Config parameters before parsing the command line. 
Config = CNConfig; const char *PageName; FILE *TrainingPage; LIST CharList = NIL_LIST; - CLUSTERER *Clusterer = NULL; + CLUSTERER *Clusterer = nullptr; LIST ProtoList = NIL_LIST; LIST NormProtoList = NIL_LIST; LIST pCharList; @@ -152,11 +147,11 @@ int main(int argc, char* argv[]) ParseArguments(&argc, &argv); int num_fonts = 0; - while ((PageName = GetNextFilename(argc, argv)) != NULL) { + while ((PageName = GetNextFilename(argc, argv)) != nullptr) { printf("Reading %s ...\n", PageName); TrainingPage = Efopen(PageName, "rb"); - ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, - 100, NULL, TrainingPage, &CharList); + ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, nullptr, + TrainingPage, &CharList); fclose(TrainingPage); ++num_fonts; } @@ -165,13 +160,18 @@ int main(int argc, char* argv[]) // reduce the min samples: // Config.MinSamples = 0.5 / num_fonts; pCharList = CharList; + // The norm protos will count the source protos, so we keep them here in + // freeable_protos, so they can be freed later. 
+ GenericVector freeable_protos; iterate(pCharList) { //Cluster - if (Clusterer) - FreeClusterer(Clusterer); CharSample = (LABELEDLIST)first_node(pCharList); Clusterer = SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE); + if (Clusterer == nullptr) { // To avoid a SIGSEGV + fprintf(stderr, "Error: NULL clusterer!\n"); + return 1; + } float SavedMinSamples = Config.MinSamples; // To disable the tendency to produce a single cluster for all fonts, // make MagicSamples an impossible to achieve number: @@ -190,21 +190,21 @@ int main(int argc, char* argv[]) } Config.MinSamples = SavedMinSamples; AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label); + freeable_protos.push_back(ProtoList); + FreeClusterer(Clusterer); } FreeTrainingSamples(CharList); - if (Clusterer == NULL) { // To avoid a SIGSEGV - fprintf(stderr, "Error: NULL clusterer!\n"); - return 1; - } - WriteNormProtos(FLAGS_D.c_str(), NormProtoList, Clusterer); + int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE); + WriteNormProtos(FLAGS_D.c_str(), NormProtoList, + FeatureDefs.FeatureDesc[desc_index]); FreeNormProtoList(NormProtoList); - FreeProtoList(&ProtoList); - FreeClusterer(Clusterer); + for (int i = 0; i < freeable_protos.size(); ++i) { + FreeProtoList(&freeable_protos[i]); + } printf ("\n"); return 0; } // main - /*---------------------------------------------------------------------------- Private Code ----------------------------------------------------------------------------*/ @@ -216,32 +216,28 @@ int main(int argc, char* argv[]) * of the samples. * @param Directory directory to place sample files into * @param LabeledProtoList List of labeled protos -* @param Clusterer The CLUSTERER to use +* @param feature_desc Description of the features * @return none * @note Exceptions: none * @note History: Fri Aug 18 16:17:06 1989, DSJ, Created. 
*/ -void WriteNormProtos ( - const char *Directory, - LIST LabeledProtoList, - CLUSTERER *Clusterer) -{ +void WriteNormProtos(const char *Directory, LIST LabeledProtoList, + const FEATURE_DESC_STRUCT *feature_desc) { FILE *File; STRING Filename; LABELEDLIST LabeledProto; int N; Filename = ""; - if (Directory != NULL && Directory[0] != '\0') - { + if (Directory != nullptr && Directory[0] != '\0') { Filename += Directory; Filename += "/"; } Filename += "normproto"; printf ("\nWriting %s ...", Filename.string()); File = Efopen (Filename.string(), "wb"); - fprintf(File,"%0d\n",Clusterer->SampleSize); - WriteParamDesc(File,Clusterer->SampleSize,Clusterer->ParamDesc); + fprintf(File, "%0d\n", feature_desc->NumParams); + WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc); iterate(LabeledProtoList) { LabeledProto = (LABELEDLIST) first_node (LabeledProtoList); @@ -256,7 +252,7 @@ void WriteNormProtos ( exit(1); } fprintf(File, "\n%s %d\n", LabeledProto->Label, N); - WriteProtos(File, Clusterer->SampleSize, LabeledProto->List, true, false); + WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false); } fclose (File); diff --git a/training/combine_tessdata.cpp b/training/combine_tessdata.cpp index 894d56d7..815c71bc 100644 --- a/training/combine_tessdata.cpp +++ b/training/combine_tessdata.cpp @@ -65,6 +65,7 @@ // int main(int argc, char **argv) { int i; + tesseract::TessdataManager tm; if (argc == 2) { printf("Combining tessdata files\n"); STRING lang = argv[1]; @@ -73,8 +74,7 @@ int main(int argc, char **argv) { lang += '.'; STRING output_file = lang; output_file += kTrainedDataSuffix; - if (!tesseract::TessdataManager::CombineDataFiles( - lang.string(), output_file.string())) { + if (!tm.CombineDataFiles(lang.string(), output_file.string())) { printf("Error combining tessdata files into %s\n", output_file.string()); } else { @@ -83,8 +83,7 @@ int main(int argc, char **argv) { } else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 || 
strcmp(argv[1], "-u") == 0)) { // Initialize TessdataManager with the data in the given traineddata file. - tesseract::TessdataManager tm; - tm.Init(argv[2], 0); + tm.Init(argv[2]); printf("Extracting tessdata components from %s\n", argv[2]); if (strcmp(argv[1], "-e") == 0) { for (i = 3; i < argc; ++i) { @@ -107,7 +106,6 @@ int main(int argc, char **argv) { } } } - tm.End(); } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) { // Rename the current traineddata file to a temporary name. const char *new_traineddata_filename = argv[2]; @@ -120,12 +118,10 @@ int main(int argc, char **argv) { } // Initialize TessdataManager with the data in the given traineddata file. - tesseract::TessdataManager tm; - tm.Init(traineddata_filename.string(), 0); + tm.Init(traineddata_filename.string()); // Write the updated traineddata file. tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3); - tm.End(); } else { printf("Usage for combining tessdata components:\n" " %s language_data_path_prefix\n" @@ -143,4 +139,5 @@ int main(int argc, char **argv) { " (e.g. %s -u eng.traineddata tmp/eng.)\n", argv[0], argv[0]); return 1; } + tm.Directory(); } diff --git a/training/commandlineflags.cpp b/training/commandlineflags.cpp index 06bfbe65..6e5cded4 100644 --- a/training/commandlineflags.cpp +++ b/training/commandlineflags.cpp @@ -1,3 +1,12 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
#include "commandlineflags.h" #ifdef USE_STD_NAMESPACE @@ -9,7 +18,7 @@ bool IntFlagExists(const char* flag_name, inT32* value) { GenericVector empty; IntParam *p = ParamUtils::FindParam( full_flag_name.string(), GlobalParams()->int_params, empty); - if (p == NULL) return false; + if (p == nullptr) return false; *value = (inT32)(*p); return true; } @@ -20,7 +29,7 @@ bool DoubleFlagExists(const char* flag_name, double* value) { GenericVector empty; DoubleParam *p = ParamUtils::FindParam( full_flag_name.string(), GlobalParams()->double_params, empty); - if (p == NULL) return false; + if (p == nullptr) return false; *value = static_cast(*p); return true; } @@ -31,7 +40,7 @@ bool BoolFlagExists(const char* flag_name, bool* value) { GenericVector empty; BoolParam *p = ParamUtils::FindParam( full_flag_name.string(), GlobalParams()->bool_params, empty); - if (p == NULL) return false; + if (p == nullptr) return false; *value = (BOOL8)(*p); return true; } @@ -42,8 +51,8 @@ bool StringFlagExists(const char* flag_name, const char** value) { GenericVector empty; StringParam *p = ParamUtils::FindParam( full_flag_name.string(), GlobalParams()->string_params, empty); - *value = (p != NULL) ? p->string() : NULL; - return p != NULL; + *value = (p != nullptr) ? 
p->string() : nullptr; + return p != nullptr; } @@ -53,7 +62,7 @@ void SetIntFlagValue(const char* flag_name, const inT32 new_val) { GenericVector empty; IntParam *p = ParamUtils::FindParam( full_flag_name.string(), GlobalParams()->int_params, empty); - ASSERT_HOST(p != NULL); + ASSERT_HOST(p != nullptr); p->set_value(new_val); } @@ -63,7 +72,7 @@ void SetDoubleFlagValue(const char* flag_name, const double new_val) { GenericVector empty; DoubleParam *p = ParamUtils::FindParam( full_flag_name.string(), GlobalParams()->double_params, empty); - ASSERT_HOST(p != NULL); + ASSERT_HOST(p != nullptr); p->set_value(new_val); } @@ -73,7 +82,7 @@ void SetBoolFlagValue(const char* flag_name, const bool new_val) { GenericVector empty; BoolParam *p = ParamUtils::FindParam( full_flag_name.string(), GlobalParams()->bool_params, empty); - ASSERT_HOST(p != NULL); + ASSERT_HOST(p != nullptr); p->set_value(new_val); } @@ -83,20 +92,20 @@ void SetStringFlagValue(const char* flag_name, const char* new_val) { GenericVector empty; StringParam *p = ParamUtils::FindParam( full_flag_name.string(), GlobalParams()->string_params, empty); - ASSERT_HOST(p != NULL); + ASSERT_HOST(p != nullptr); p->set_value(STRING(new_val)); } bool SafeAtoi(const char* str, int* val) { - char *endptr = NULL; + char* endptr = nullptr; *val = strtol(str, &endptr, 10); - return endptr != NULL && *endptr == '\0'; + return endptr != nullptr && *endptr == '\0'; } bool SafeAtod(const char* str, double* val) { - char *endptr = NULL; + char* endptr = nullptr; *val = strtod(str, &endptr); - return endptr != NULL && *endptr == '\0'; + return endptr != nullptr && *endptr == '\0'; } void PrintCommandLineFlags() { @@ -106,36 +115,36 @@ void PrintCommandLineFlags() { if (!strncmp(GlobalParams()->int_params[i]->name_str(), kFlagNamePrefix, kFlagNamePrefixLen)) { printf(" --%s %s (type:int default:%d)\n", - GlobalParams()->int_params[i]->name_str() + kFlagNamePrefixLen, - GlobalParams()->int_params[i]->info_str(), - 
inT32(*(GlobalParams()->int_params[i]))); + GlobalParams()->int_params[i]->name_str() + kFlagNamePrefixLen, + GlobalParams()->int_params[i]->info_str(), + inT32(*(GlobalParams()->int_params[i]))); } } for (int i = 0; i < GlobalParams()->double_params.size(); ++i) { if (!strncmp(GlobalParams()->double_params[i]->name_str(), kFlagNamePrefix, kFlagNamePrefixLen)) { printf(" --%s %s (type:double default:%g)\n", - GlobalParams()->double_params[i]->name_str() + kFlagNamePrefixLen, - GlobalParams()->double_params[i]->info_str(), - static_cast(*(GlobalParams()->double_params[i]))); + GlobalParams()->double_params[i]->name_str() + kFlagNamePrefixLen, + GlobalParams()->double_params[i]->info_str(), + static_cast(*(GlobalParams()->double_params[i]))); } } for (int i = 0; i < GlobalParams()->bool_params.size(); ++i) { if (!strncmp(GlobalParams()->bool_params[i]->name_str(), kFlagNamePrefix, kFlagNamePrefixLen)) { printf(" --%s %s (type:bool default:%s)\n", - GlobalParams()->bool_params[i]->name_str() + kFlagNamePrefixLen, - GlobalParams()->bool_params[i]->info_str(), - (BOOL8(*(GlobalParams()->bool_params[i])) ? "true" : "false")); + GlobalParams()->bool_params[i]->name_str() + kFlagNamePrefixLen, + GlobalParams()->bool_params[i]->info_str(), + (BOOL8(*(GlobalParams()->bool_params[i])) ? 
"true" : "false")); } } for (int i = 0; i < GlobalParams()->string_params.size(); ++i) { if (!strncmp(GlobalParams()->string_params[i]->name_str(), kFlagNamePrefix, kFlagNamePrefixLen)) { printf(" --%s %s (type:string default:%s)\n", - GlobalParams()->string_params[i]->name_str() + kFlagNamePrefixLen, - GlobalParams()->string_params[i]->info_str(), - GlobalParams()->string_params[i]->string()); + GlobalParams()->string_params[i]->name_str() + kFlagNamePrefixLen, + GlobalParams()->string_params[i]->info_str(), + GlobalParams()->string_params[i]->string()); } } } @@ -172,13 +181,13 @@ void ParseCommandLineFlags(const char* usage, // Find the starting position of the value if it was specified in this // string. const char* equals_position = strchr(current_arg, '='); - const char* rhs = NULL; - if (equals_position != NULL) { + const char* rhs = nullptr; + if (equals_position != nullptr) { rhs = equals_position + 1; } // Extract the flag name. STRING lhs; - if (equals_position == NULL) { + if (equals_position == nullptr) { lhs = current_arg; } else { lhs.assign(current_arg, equals_position - current_arg); @@ -192,7 +201,7 @@ void ParseCommandLineFlags(const char* usage, // inT32 flag inT32 int_val; if (IntFlagExists(lhs.string(), &int_val)) { - if (rhs != NULL) { + if (rhs != nullptr) { if (!strlen(rhs)) { // Bad input of the format --int_flag= tprintf("ERROR: Bad argument: %s\n", (*argv)[i]); @@ -224,7 +233,7 @@ void ParseCommandLineFlags(const char* usage, // double flag double double_val; if (DoubleFlagExists(lhs.string(), &double_val)) { - if (rhs != NULL) { + if (rhs != nullptr) { if (!strlen(rhs)) { // Bad input of the format --double_flag= tprintf("ERROR: Bad argument: %s\n", (*argv)[i]); @@ -257,7 +266,7 @@ void ParseCommandLineFlags(const char* usage, // --flag=false, --flag=true, --flag=0 and --flag=1 bool bool_val; if (BoolFlagExists(lhs.string(), &bool_val)) { - if (rhs == NULL) { + if (rhs == nullptr) { // --flag form bool_val = true; } else { @@ -282,7 
+291,7 @@ void ParseCommandLineFlags(const char* usage, // string flag const char* string_val; if (StringFlagExists(lhs.string(), &string_val)) { - if (rhs != NULL) { + if (rhs != nullptr) { string_val = rhs; } else { // Pick the next argument diff --git a/training/commontraining.cpp b/training/commontraining.cpp index 1c4cc832..72ce5c50 100644 --- a/training/commontraining.cpp +++ b/training/commontraining.cpp @@ -39,7 +39,6 @@ #include using tesseract::CCUtil; -using tesseract::FontInfo; using tesseract::IntFeatureSpace; using tesseract::ParamUtils; using tesseract::ShapeTable; @@ -60,7 +59,6 @@ STRING_PARAM_FLAG(F, "font_properties", "File listing font properties"); STRING_PARAM_FLAG(X, "", "File listing font xheights"); STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from"); STRING_PARAM_FLAG(O, "", "File to write unicharset to"); -STRING_PARAM_FLAG(T, "", "File to load trainer from"); STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to"); STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string"); DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples, @@ -116,15 +114,15 @@ void ParseArguments(int* argc, char ***argv) { namespace tesseract { // Helper loads shape table from the given file. 
ShapeTable* LoadShapeTable(const STRING& file_prefix) { - ShapeTable* shape_table = NULL; + ShapeTable* shape_table = nullptr; STRING shape_table_file = file_prefix; shape_table_file += kShapeTableFileSuffix; - FILE* shape_fp = fopen(shape_table_file.string(), "rb"); - if (shape_fp != NULL) { + TFile shape_fp; + if (shape_fp.Open(shape_table_file.string(), nullptr)) { shape_table = new ShapeTable; - if (!shape_table->DeSerialize(false, shape_fp)) { + if (!shape_table->DeSerialize(false, &shape_fp)) { delete shape_table; - shape_table = NULL; + shape_table = nullptr; tprintf("Error: Failed to read shape table %s\n", shape_table_file.string()); } else { @@ -132,7 +130,6 @@ ShapeTable* LoadShapeTable(const STRING& file_prefix) { tprintf("Read shape table %s of %d shapes\n", shape_table_file.string(), num_shapes); } - fclose(shape_fp); } else { tprintf("Warning: No shape table file present: %s\n", shape_table_file.string()); @@ -145,7 +142,7 @@ void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table) { STRING shape_table_file = file_prefix; shape_table_file += kShapeTableFileSuffix; FILE* fp = fopen(shape_table_file.string(), "wb"); - if (fp != NULL) { + if (fp != nullptr) { if (!shape_table.Serialize(fp)) { fprintf(stderr, "Error writing shape table: %s\n", shape_table_file.string()); @@ -160,7 +157,7 @@ void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table) { /** * Creates a MasterTraininer and loads the training data into it: * Initializes feature_defs and IntegerFX. - * Loads the shape_table if shape_table != NULL. + * Loads the shape_table if shape_table != nullptr. * Loads initial unicharset from -U command-line option. * If FLAGS_T is set, loads the majority of data from there, else: * - Loads font info from -F option. @@ -169,7 +166,7 @@ void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table) { * - Deletes outliers and computes canonical samples. 
* - If FLAGS_output_trainer is set, saves the trainer for future use. * Computes canonical and cloud features. - * If shape_table is not NULL, but failed to load, make a fake flat one, + * If shape_table is not nullptr, but failed to load, make a fake flat one, * as shape clustering was not run. */ MasterTrainer* LoadTrainingData(int argc, const char* const * argv, @@ -183,15 +180,14 @@ MasterTrainer* LoadTrainingData(int argc, const char* const * argv, *file_prefix += FLAGS_D.c_str(); *file_prefix += "/"; } - // If we are shape clustering (NULL shape_table) or we successfully load + // If we are shape clustering (nullptr shape_table) or we successfully load // a shape_table written by a previous shape clustering, then // shape_analysis will be true, meaning that the MasterTrainer will replace // some members of the unicharset with their fragments. bool shape_analysis = false; - if (shape_table != NULL) { + if (shape_table != nullptr) { *shape_table = LoadShapeTable(*file_prefix); - if (*shape_table != NULL) - shape_analysis = true; + if (*shape_table != nullptr) shape_analysis = true; } else { shape_analysis = true; } @@ -201,87 +197,67 @@ MasterTrainer* LoadTrainingData(int argc, const char* const * argv, FLAGS_debug_level); IntFeatureSpace fs; fs.Init(kBoostXYBuckets, kBoostXYBuckets, kBoostDirBuckets); - if (FLAGS_T.empty()) { - trainer->LoadUnicharset(FLAGS_U.c_str()); - // Get basic font information from font_properties. - if (!FLAGS_F.empty()) { - if (!trainer->LoadFontInfo(FLAGS_F.c_str())) { - delete trainer; - return NULL; - } + trainer->LoadUnicharset(FLAGS_U.c_str()); + // Get basic font information from font_properties. 
+ if (!FLAGS_F.empty()) { + if (!trainer->LoadFontInfo(FLAGS_F.c_str())) { + delete trainer; + return nullptr; } - if (!FLAGS_X.empty()) { - if (!trainer->LoadXHeights(FLAGS_X.c_str())) { - delete trainer; - return NULL; - } + } + if (!FLAGS_X.empty()) { + if (!trainer->LoadXHeights(FLAGS_X.c_str())) { + delete trainer; + return nullptr; } - trainer->SetFeatureSpace(fs); - const char* page_name; - // Load training data from .tr files on the command line. - while ((page_name = GetNextFilename(argc, argv)) != NULL) { - tprintf("Reading %s ...\n", page_name); - trainer->ReadTrainingSamples(page_name, feature_defs, false); + } + trainer->SetFeatureSpace(fs); + const char* page_name; + // Load training data from .tr files on the command line. + while ((page_name = GetNextFilename(argc, argv)) != nullptr) { + tprintf("Reading %s ...\n", page_name); + trainer->ReadTrainingSamples(page_name, feature_defs, false); - // If there is a file with [lang].[fontname].exp[num].fontinfo present, - // read font spacing information in to fontinfo_table. - int pagename_len = strlen(page_name); - char *fontinfo_file_name = new char[pagename_len + 7]; - strncpy(fontinfo_file_name, page_name, pagename_len - 2); // remove "tr" - strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo"); // +"fontinfo" - trainer->AddSpacingInfo(fontinfo_file_name); - delete[] fontinfo_file_name; + // If there is a file with [lang].[fontname].exp[num].fontinfo present, + // read font spacing information in to fontinfo_table. + int pagename_len = strlen(page_name); + char* fontinfo_file_name = new char[pagename_len + 7]; + strncpy(fontinfo_file_name, page_name, pagename_len - 2); // remove "tr" + strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo"); // +"fontinfo" + trainer->AddSpacingInfo(fontinfo_file_name); + delete[] fontinfo_file_name; - // Load the images into memory if required by the classifier. - if (FLAGS_load_images) { - STRING image_name = page_name; - // Chop off the tr and replace with tif. 
Extension must be tif! - image_name.truncate_at(image_name.length() - 2); - image_name += "tif"; - trainer->LoadPageImages(image_name.string()); - } + // Load the images into memory if required by the classifier. + if (FLAGS_load_images) { + STRING image_name = page_name; + // Chop off the tr and replace with tif. Extension must be tif! + image_name.truncate_at(image_name.length() - 2); + image_name += "tif"; + trainer->LoadPageImages(image_name.string()); } - trainer->PostLoadCleanup(); - // Write the master trainer if required. - if (!FLAGS_output_trainer.empty()) { - FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb"); - if (fp == NULL) { - tprintf("Can't create saved trainer data!\n"); - } else { - trainer->Serialize(fp); - fclose(fp); - } - } - } else { - bool success = false; - tprintf("Loading master trainer from file:%s\n", - FLAGS_T.c_str()); - FILE* fp = fopen(FLAGS_T.c_str(), "rb"); - if (fp == NULL) { - tprintf("Can't read file %s to initialize master trainer\n", - FLAGS_T.c_str()); + } + trainer->PostLoadCleanup(); + // Write the master trainer if required. + if (!FLAGS_output_trainer.empty()) { + FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb"); + if (fp == nullptr) { + tprintf("Can't create saved trainer data!\n"); } else { - success = trainer->DeSerialize(false, fp); + trainer->Serialize(fp); fclose(fp); } - if (!success) { - tprintf("Deserialize of master trainer failed!\n"); - delete trainer; - return NULL; - } - trainer->SetFeatureSpace(fs); } trainer->PreTrainingSetup(); if (!FLAGS_O.empty() && !trainer->unicharset().save_to_file(FLAGS_O.c_str())) { fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str()); delete trainer; - return NULL; + return nullptr; } - if (shape_table != NULL) { + if (shape_table != nullptr) { // If we previously failed to load a shapetable, then shape clustering // wasn't run so make a flat one now. 
- if (*shape_table == NULL) { + if (*shape_table == nullptr) { *shape_table = new ShapeTable; trainer->SetupFlatShapeTable(*shape_table); tprintf("Flat shape table summary: %s\n", @@ -298,12 +274,12 @@ MasterTrainer* LoadTrainingData(int argc, const char* const * argv, /** * This routine returns the next command line argument. If * there are no remaining command line arguments, it returns - * NULL. This routine should only be called after all option + * nullptr. This routine should only be called after all option * arguments have been parsed and removed with ParseArguments. * * Globals: * - tessoptind defined by tessopt sys call - * @return Next command line argument or NULL. + * @return Next command line argument or nullptr. * @note Exceptions: none * @note History: Fri Aug 18 09:34:12 1989, DSJ, Created. */ @@ -311,28 +287,23 @@ const char *GetNextFilename(int argc, const char* const * argv) { if (tessoptind < argc) return argv[tessoptind++]; else - return NULL; -} /* GetNextFilename */ - - + return nullptr; +} /* GetNextFilename */ /*---------------------------------------------------------------------------*/ /** * This routine searches through a list of labeled lists to find * a list with the specified label. If a matching labeled list - * cannot be found, NULL is returned. + * cannot be found, nullptr is returned. * @param List list to search * @param Label label to search for - * @return Labeled list with the specified Label or NULL. + * @return Labeled list with the specified label or nullptr. * @note Globals: none * @note Exceptions: none * @note History: Fri Aug 18 15:57:41 1989, DSJ, Created. 
*/ -LABELEDLIST FindList ( - LIST List, - char *Label) -{ - LABELEDLIST LabeledList; +LABELEDLIST FindList(LIST List, char* Label) { + LABELEDLIST LabeledList; iterate (List) { @@ -340,9 +311,9 @@ LABELEDLIST FindList ( if (strcmp (LabeledList->Label, Label) == 0) return (LabeledList); } - return (NULL); + return (nullptr); -} /* FindList */ +} /* FindList */ /*---------------------------------------------------------------------------*/ /** @@ -354,10 +325,8 @@ LABELEDLIST FindList ( * @note Exceptions: none * @note History: Fri Aug 18 16:08:46 1989, DSJ, Created. */ -LABELEDLIST NewLabeledList ( - const char *Label) -{ - LABELEDLIST LabeledList; +LABELEDLIST NewLabeledList(const char* Label) { + LABELEDLIST LabeledList; LabeledList = (LABELEDLIST) Emalloc (sizeof (LABELEDLISTNODE)); LabeledList->Label = (char*)Emalloc (strlen (Label)+1); @@ -367,7 +336,7 @@ LABELEDLIST NewLabeledList ( LabeledList->font_sample_count = 0; return (LabeledList); -} /* NewLabeledList */ +} /* NewLabeledList */ /*---------------------------------------------------------------------------*/ // TODO(rays) This is now used only by cntraining. Convert cntraining to use @@ -386,7 +355,7 @@ LABELEDLIST NewLabeledList ( * @return none * @note Globals: none * @note Exceptions: none - * @note History: + * @note History: * - Fri Aug 18 13:11:39 1989, DSJ, Created. * - Tue May 17 1998 simplifications to structure, illiminated * font, and feature specification levels of structure. 
@@ -409,12 +378,12 @@ void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_defs, char_sample->font_sample_count = 0; } - while (fgets(buffer, 2048, file) != NULL) { + while (fgets(buffer, 2048, file) != nullptr) { if (buffer[0] == '\n') continue; sscanf(buffer, "%*s %s", unichar); - if (unicharset != NULL && !unicharset->contains_unichar(unichar)) { + if (unicharset != nullptr && !unicharset->contains_unichar(unichar)) { unicharset->unichar_insert(unichar); if (unicharset->size() > MAX_NUM_CLASSES) { tprintf("Error: Size of unicharset in training is " @@ -423,7 +392,7 @@ void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_defs, } } char_sample = FindList(*training_samples, unichar); - if (char_sample == NULL) { + if (char_sample == nullptr) { char_sample = NewLabeledList(unichar); *training_samples = push(*training_samples, char_sample); } @@ -460,17 +429,17 @@ void FreeTrainingSamples(LIST CharList) { FEATURE_SET FeatureSet; LIST FeatureList; - - iterate(CharList) { /* iterate through all of the fonts */ + LIST nodes = CharList; + iterate(CharList) { /* iterate through all of the fonts */ char_sample = (LABELEDLIST) first_node(CharList); FeatureList = char_sample->List; - iterate(FeatureList) { /* iterate through all of the classes */ + iterate(FeatureList) { /* iterate through all of the classes */ FeatureSet = (FEATURE_SET) first_node(FeatureList); FreeFeatureSet(FeatureSet); } FreeLabeledList(char_sample); } - destroy(CharList); + destroy(nodes); } /* FreeTrainingSamples */ /*---------------------------------------------------------------------------*/ @@ -509,11 +478,11 @@ CLUSTERER *SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, const char* program_feature_type) { uinT16 N; int i, j; - FLOAT32 *Sample = NULL; + FLOAT32* Sample = nullptr; CLUSTERER *Clusterer; inT32 CharID; - LIST FeatureList = NULL; - FEATURE_SET FeatureSet = NULL; + LIST FeatureList = nullptr; + FEATURE_SET FeatureSet = nullptr; int desc_index = 
ShortNameToFeatureType(FeatureDefs, program_feature_type); N = FeatureDefs.FeatureDesc[desc_index]->NumParams; @@ -524,23 +493,22 @@ CLUSTERER *SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, iterate(FeatureList) { FeatureSet = (FEATURE_SET) first_node(FeatureList); for (i = 0; i < FeatureSet->MaxNumFeatures; i++) { - if (Sample == NULL) - Sample = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32)); + if (Sample == nullptr) Sample = (FLOAT32*)Emalloc(N * sizeof(FLOAT32)); for (j = 0; j < N; j++) Sample[j] = FeatureSet->Features[i]->Params[j]; MakeSample (Clusterer, Sample, CharID); } CharID++; } - if ( Sample != NULL ) free( Sample ); - return( Clusterer ); + free(Sample); + return Clusterer; -} /* SetUpForClustering */ +} /* SetUpForClustering */ /*------------------------------------------------------------------------*/ void MergeInsignificantProtos(LIST ProtoList, const char* label, - CLUSTERER *Clusterer, CLUSTERCONFIG *Config) { - PROTOTYPE *Prototype; + CLUSTERER* Clusterer, CLUSTERCONFIG* Config) { + PROTOTYPE* Prototype; bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0; LIST pProtoList = ProtoList; @@ -549,7 +517,7 @@ void MergeInsignificantProtos(LIST ProtoList, const char* label, if (Prototype->Significant || Prototype->Merged) continue; FLOAT32 best_dist = 0.125; - PROTOTYPE* best_match = NULL; + PROTOTYPE* best_match = nullptr; // Find the nearest alive prototype. 
LIST list_it = ProtoList; iterate(list_it) { @@ -564,7 +532,7 @@ void MergeInsignificantProtos(LIST ProtoList, const char* label, } } } - if (best_match != NULL && !best_match->Significant) { + if (best_match != nullptr && !best_match->Significant) { if (debug) tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n", best_match->NumSamples, Prototype->NumSamples, @@ -578,7 +546,7 @@ void MergeInsignificantProtos(LIST ProtoList, const char* label, best_match->Mean, Prototype->Mean); Prototype->NumSamples = 0; Prototype->Merged = 1; - } else if (best_match != NULL) { + } else if (best_match != nullptr) { if (debug) tprintf("Red proto at %g,%g matched a green one at %g,%g\n", Prototype->Mean[0], Prototype->Mean[1], @@ -600,7 +568,7 @@ void MergeInsignificantProtos(LIST ProtoList, const char* label, Prototype->Significant = true; } } -} /* MergeInsignificantProtos */ +} /* MergeInsignificantProtos */ /*-----------------------------------------------------------------------------*/ void CleanUpUnusedData( @@ -611,20 +579,17 @@ void CleanUpUnusedData( iterate(ProtoList) { Prototype = (PROTOTYPE *) first_node (ProtoList); - if(Prototype->Variance.Elliptical != NULL) - { + if (Prototype->Variance.Elliptical != nullptr) { memfree(Prototype->Variance.Elliptical); - Prototype->Variance.Elliptical = NULL; + Prototype->Variance.Elliptical = nullptr; } - if(Prototype->Magnitude.Elliptical != NULL) - { + if (Prototype->Magnitude.Elliptical != nullptr) { memfree(Prototype->Magnitude.Elliptical); - Prototype->Magnitude.Elliptical = NULL; + Prototype->Magnitude.Elliptical = nullptr; } - if(Prototype->Weight.Elliptical != NULL) - { + if (Prototype->Weight.Elliptical != nullptr) { memfree(Prototype->Weight.Elliptical); - Prototype->Weight.Elliptical = NULL; + Prototype->Weight.Elliptical = nullptr; } } } @@ -656,37 +621,34 @@ LIST RemoveInsignificantProtos( NewProto->Significant = Proto->Significant; NewProto->Style = Proto->Style; NewProto->NumSamples = Proto->NumSamples; - 
NewProto->Cluster = NULL; - NewProto->Distrib = NULL; + NewProto->Cluster = nullptr; + NewProto->Distrib = nullptr; for (i=0; i < N; i++) NewProto->Mean[i] = Proto->Mean[i]; - if (Proto->Variance.Elliptical != NULL) - { + if (Proto->Variance.Elliptical != nullptr) { NewProto->Variance.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32)); for (i=0; i < N; i++) NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i]; } else - NewProto->Variance.Elliptical = NULL; + NewProto->Variance.Elliptical = nullptr; //--------------------------------------------- - if (Proto->Magnitude.Elliptical != NULL) - { + if (Proto->Magnitude.Elliptical != nullptr) { NewProto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32)); for (i=0; i < N; i++) NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i]; } else - NewProto->Magnitude.Elliptical = NULL; + NewProto->Magnitude.Elliptical = nullptr; //------------------------------------------------ - if (Proto->Weight.Elliptical != NULL) - { + if (Proto->Weight.Elliptical != nullptr) { NewProto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32)); for (i=0; i < N; i++) NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i]; } else - NewProto->Weight.Elliptical = NULL; + NewProto->Weight.Elliptical = nullptr; NewProto->TotalMagnitude = Proto->TotalMagnitude; NewProto->LogMagnitude = Proto->LogMagnitude; @@ -695,14 +657,11 @@ LIST RemoveInsignificantProtos( } FreeProtoList(&ProtoList); return (NewProtoList); -} /* RemoveInsignificantProtos */ +} /* RemoveInsignificantProtos */ /*----------------------------------------------------------------------------*/ -MERGE_CLASS FindClass ( - LIST List, - const char *Label) -{ - MERGE_CLASS MergeClass; +MERGE_CLASS FindClass(LIST List, const char* Label) { + MERGE_CLASS MergeClass; iterate (List) { @@ -710,15 +669,13 @@ MERGE_CLASS FindClass ( if (strcmp (MergeClass->Label, Label) == 0) return (MergeClass); } - return (NULL); + return (nullptr); -} /* 
FindClass */ +} /* FindClass */ /*---------------------------------------------------------------------------*/ -MERGE_CLASS NewLabeledClass ( - const char *Label) -{ - MERGE_CLASS MergeClass; +MERGE_CLASS NewLabeledClass(const char* Label) { + MERGE_CLASS MergeClass; MergeClass = new MERGE_CLASS_NODE; MergeClass->Label = (char*)Emalloc (strlen (Label)+1); @@ -726,7 +683,7 @@ MERGE_CLASS NewLabeledClass ( MergeClass->Class = NewClass (MAX_NUM_PROTOS, MAX_NUM_CONFIGS); return (MergeClass); -} /* NewLabeledClass */ +} /* NewLabeledClass */ /*-----------------------------------------------------------------------------*/ /** @@ -738,38 +695,37 @@ MERGE_CLASS NewLabeledClass ( * @note Exceptions: none * @note History: Fri Aug 18 17:44:27 1989, DSJ, Created. */ -void FreeLabeledClassList ( - LIST ClassList) -{ - MERGE_CLASS MergeClass; +void FreeLabeledClassList(LIST ClassList) { + MERGE_CLASS MergeClass; - iterate (ClassList) /* iterate through all of the fonts */ + LIST nodes = ClassList; + iterate(ClassList) /* iterate through all of the fonts */ { MergeClass = (MERGE_CLASS) first_node (ClassList); free (MergeClass->Label); FreeClass(MergeClass->Class); delete MergeClass; } - destroy (ClassList); + destroy(nodes); -} /* FreeLabeledClassList */ +} /* FreeLabeledClassList */ /* SetUpForFloat2Int */ CLASS_STRUCT* SetUpForFloat2Int(const UNICHARSET& unicharset, LIST LabeledClassList) { - MERGE_CLASS MergeClass; - CLASS_TYPE Class; - int NumProtos; - int NumConfigs; - int NumWords; - int i, j; - float Values[3]; - PROTO NewProto; - PROTO OldProto; - BIT_VECTOR NewConfig; - BIT_VECTOR OldConfig; + MERGE_CLASS MergeClass; + CLASS_TYPE Class; + int NumProtos; + int NumConfigs; + int NumWords; + int i, j; + float Values[3]; + PROTO NewProto; + PROTO OldProto; + BIT_VECTOR NewConfig; + BIT_VECTOR OldConfig; - // printf("Float2Int ...\n"); + // printf("Float2Int ...\n"); CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()]; iterate(LabeledClassList) @@ -821,9 
+777,9 @@ CLASS_STRUCT* SetUpForFloat2Int(const UNICHARSET& unicharset, void Normalize ( float *Values) { - register float Slope; - register float Intercept; - register float Normalizer; + float Slope; + float Intercept; + float Normalizer; Slope = tan (Values [2] * 2 * PI); Intercept = Values [1] - Slope * Values [0]; @@ -835,20 +791,20 @@ void Normalize ( } // Normalize /*-------------------------------------------------------------------------*/ -void FreeNormProtoList ( - LIST CharList) +void FreeNormProtoList(LIST CharList) { - LABELEDLIST char_sample; + LABELEDLIST char_sample; - iterate (CharList) /* iterate through all of the fonts */ + LIST nodes = CharList; + iterate(CharList) /* iterate through all of the fonts */ { char_sample = (LABELEDLIST) first_node (CharList); FreeLabeledList (char_sample); } - destroy (CharList); + destroy(nodes); -} // FreeNormProtoList +} // FreeNormProtoList /*---------------------------------------------------------------------------*/ void AddToNormProtosList( @@ -869,19 +825,16 @@ void AddToNormProtosList( } /*---------------------------------------------------------------------------*/ -int NumberOfProtos( - LIST ProtoList, - BOOL8 CountSigProtos, - BOOL8 CountInsigProtos) -{ +int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, + BOOL8 CountInsigProtos) { int N = 0; - PROTOTYPE *Proto; + PROTOTYPE* Proto; iterate(ProtoList) { Proto = (PROTOTYPE *) first_node ( ProtoList ); - if (( Proto->Significant && CountSigProtos ) || - ( ! Proto->Significant && CountInsigProtos ) ) + if ((Proto->Significant && CountSigProtos) || + (!Proto->Significant && CountInsigProtos)) N++; } return(N); diff --git a/training/commontraining.h b/training/commontraining.h index 7f38cf51..20332ae1 100644 --- a/training/commontraining.h +++ b/training/commontraining.h @@ -11,8 +11,8 @@ // See the License for the specific language governing permissions and // limitations under the License. 
-#ifndef TESSERACT_TRAINING_COMMONTRAINING_H__ -#define TESSERACT_TRAINING_COMMONTRAINING_H__ +#ifndef TESSERACT_TRAINING_COMMONTRAINING_H_ +#define TESSERACT_TRAINING_COMMONTRAINING_H_ #include "cluster.h" #include "commandlineflags.h" @@ -70,7 +70,7 @@ void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table); // Creates a MasterTraininer and loads the training data into it: // Initializes feature_defs and IntegerFX. -// Loads the shape_table if shape_table != NULL. +// Loads the shape_table if shape_table != nullptr. // Loads initial unicharset from -U command-line option. // If FLAGS_input_trainer is set, loads the majority of data from there, else: // Loads font info from -F option. @@ -79,7 +79,7 @@ void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table); // Deletes outliers and computes canonical samples. // If FLAGS_output_trainer is set, saves the trainer for future use. // Computes canonical and cloud features. -// If shape_table is not NULL, but failed to load, make a fake flat one, +// If shape_table is not nullptr, but failed to load, make a fake flat one, // as shape clustering was not run. 
MasterTrainer* LoadTrainingData(int argc, const char* const * argv, bool replication, @@ -167,4 +167,4 @@ int NumberOfProtos( void allocNormProtos(); -#endif // TESSERACT_TRAINING_COMMONTRAINING_H__ +#endif // TESSERACT_TRAINING_COMMONTRAINING_H_ diff --git a/training/dawg2wordlist.cpp b/training/dawg2wordlist.cpp index 82698295..c1d769a3 100644 --- a/training/dawg2wordlist.cpp +++ b/training/dawg2wordlist.cpp @@ -19,6 +19,7 @@ #include "dawg.h" #include "host.h" +#include "serialis.h" #include "tesscallback.h" #include "trie.h" #include "unicharset.h" @@ -28,17 +29,20 @@ const int kDictDebugLevel = 1; tesseract::Dawg *LoadSquishedDawg(const UNICHARSET &unicharset, const char *filename) { const int kDictDebugLevel = 1; - FILE *dawg_file = fopen(filename, "rb"); - if (dawg_file == NULL) { + tesseract::TFile dawg_file; + if (!dawg_file.Open(filename, nullptr)) { tprintf("Could not open %s for reading.\n", filename); - return NULL; + return nullptr; } tprintf("Loading word list from %s\n", filename); - tesseract::Dawg *retval = new tesseract::SquishedDawg( - dawg_file, tesseract::DAWG_TYPE_WORD, "eng", SYSTEM_DAWG_PERM, - kDictDebugLevel); + tesseract::SquishedDawg *retval = new tesseract::SquishedDawg( + tesseract::DAWG_TYPE_WORD, "eng", SYSTEM_DAWG_PERM, kDictDebugLevel); + if (!retval->Load(&dawg_file)) { + tprintf("Could not read %s\n", filename); + delete retval; + return nullptr; + } tprintf("Word list loaded.\n"); - fclose(dawg_file); return retval; } @@ -55,7 +59,7 @@ int WriteDawgAsWordlist(const UNICHARSET &unicharset, const tesseract::Dawg *dawg, const char *outfile_name) { FILE *out = fopen(outfile_name, "wb"); - if (out == NULL) { + if (out == nullptr) { tprintf("Could not open %s for writing.\n", outfile_name); return 1; } @@ -83,7 +87,7 @@ int main(int argc, char *argv[]) { return 1; } tesseract::Dawg *dict = LoadSquishedDawg(unicharset, dawg_file); - if (dict == NULL) { + if (dict == nullptr) { tprintf("Error loading dictionary from %s.\n", dawg_file); 
return 1; } diff --git a/training/degradeimage.cpp b/training/degradeimage.cpp index f9c3cfb0..76f6cf09 100644 --- a/training/degradeimage.cpp +++ b/training/degradeimage.cpp @@ -22,10 +22,36 @@ #include #include "allheaders.h" // from leptonica +#include "genericvector.h" #include "helpers.h" // For TRand. +#include "rect.h" namespace tesseract { +// A randomized perspective distortion can be applied to synthetic input. +// The perspective distortion comes from leptonica, which uses 2 sets of 4 +// corners to determine the distortion. There are random values for each of +// the x numbers x0..x3 and y0..y3, except for x2 and x3 which are instead +// defined in terms of a single shear value. This reduces the degrees of +// freedom enough to make the distortion more realistic than it would otherwise +// be if all 8 coordinates could move independently. +// One additional factor is used for the color of the pixels that don't exist +// in the source image. +// Name for each of the randomizing factors. +enum FactorNames { + FN_INCOLOR, + FN_Y0, + FN_Y1, + FN_Y2, + FN_Y3, + FN_X0, + FN_X1, + FN_SHEAR, + // x2 = x1 - shear + // x3 = x0 + shear + FN_NUM_FACTORS +}; + // Rotation is +/- kRotationRange radians. const float kRotationRange = 0.02f; // Number of grey levels to shift by for each exposure step. @@ -38,8 +64,8 @@ const int kMinRampSize = 1000; // Degrade the pix as if by a print/copy/scan cycle with exposure > 0 // corresponding to darkening on the copier and <0 lighter and 0 not copied. // Exposures in [-2,2] are most useful, with -3 and 3 being extreme. -// If rotation is NULL, rotation is skipped. If *rotation is non-zero, the pix -// is rotated by *rotation else it is randomly rotated and *rotation is +// If rotation is nullptr, rotation is skipped. If *rotation is non-zero, the +// pix is rotated by *rotation else it is randomly rotated and *rotation is // modified. 
// // HOW IT WORKS: @@ -82,11 +108,11 @@ Pix* DegradeImage(Pix* input, int exposure, TRand* randomizer, pix = pixBlockconv(input, 1, 1); pixDestroy(&input); // A small random rotation helps to make the edges jaggy in a realistic way. - if (rotation != NULL) { + if (rotation != nullptr) { float radians_clockwise = 0.0f; if (*rotation) { radians_clockwise = *rotation; - } else if (randomizer != NULL) { + } else if (randomizer != nullptr) { radians_clockwise = randomizer->SignedRand(kRotationRange); } @@ -128,7 +154,7 @@ Pix* DegradeImage(Pix* input, int exposure, TRand* randomizer, for (int y = 0; y < height; ++y) { for (int x = 0; x < width; ++x) { int pixel = GET_DATA_BYTE(data, x); - if (randomizer != NULL) + if (randomizer != nullptr) pixel += randomizer->IntRand() % (kSaltnPepper*2 + 1) - kSaltnPepper; if (height + width > kMinRampSize) pixel -= (2*x + y) * 32 / (height + width); @@ -144,4 +170,141 @@ Pix* DegradeImage(Pix* input, int exposure, TRand* randomizer, return input; } +// Creates and returns a Pix distorted by various means according to the bool +// flags. If boxes is not nullptr, the boxes are resized/positioned according to +// any spatial distortion and also by the integer reduction factor box_scale +// so they will match what the network will output. +// Returns nullptr on error. The returned Pix must be pixDestroyed. +Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert, + bool white_noise, bool smooth_noise, bool blur, + int box_reduction, TRand* randomizer, + GenericVector* boxes) { + Pix* distorted = pixCopy(nullptr, const_cast(pix)); + // Things to do to synthetic training data. + if (invert && randomizer->SignedRand(1.0) < 0) + pixInvert(distorted, distorted); + if ((white_noise || smooth_noise) && randomizer->SignedRand(1.0) > 0.0) { + // TODO(rays) Cook noise in a more thread-safe manner than rand(). + // Attempt to make the sequences reproducible. 
+ srand(randomizer->IntRand()); + Pix* pixn = pixAddGaussianNoise(distorted, 8.0); + pixDestroy(&distorted); + if (smooth_noise) { + distorted = pixBlockconv(pixn, 1, 1); + pixDestroy(&pixn); + } else { + distorted = pixn; + } + } + if (blur && randomizer->SignedRand(1.0) > 0.0) { + Pix* blurred = pixBlockconv(distorted, 1, 1); + pixDestroy(&distorted); + distorted = blurred; + } + if (perspective) + GeneratePerspectiveDistortion(0, 0, randomizer, &distorted, boxes); + if (boxes != nullptr) { + for (int b = 0; b < boxes->size(); ++b) { + (*boxes)[b].scale(1.0f / box_reduction); + if ((*boxes)[b].width() <= 0) + (*boxes)[b].set_right((*boxes)[b].left() + 1); + } + } + return distorted; +} + +// Distorts anything that has a non-null pointer with the same pseudo-random +// perspective distortion. Width and height only need to be set if there +// is no pix. If there is a pix, then they will be taken from there. +void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer, + Pix** pix, GenericVector* boxes) { + if (pix != nullptr && *pix != nullptr) { + width = pixGetWidth(*pix); + height = pixGetHeight(*pix); + } + float* im_coeffs = nullptr; + float* box_coeffs = nullptr; + l_int32 incolor = + ProjectiveCoeffs(width, height, randomizer, &im_coeffs, &box_coeffs); + if (pix != nullptr && *pix != nullptr) { + // Transform the image. + Pix* transformed = pixProjective(*pix, im_coeffs, incolor); + if (transformed == nullptr) { + tprintf("Projective transformation failed!!\n"); + return; + } + pixDestroy(pix); + *pix = transformed; + } + if (boxes != nullptr) { + // Transform the boxes. 
+ for (int b = 0; b < boxes->size(); ++b) { + int x1, y1, x2, y2; + const TBOX& box = (*boxes)[b]; + projectiveXformSampledPt(box_coeffs, box.left(), height - box.top(), &x1, + &y1); + projectiveXformSampledPt(box_coeffs, box.right(), height - box.bottom(), + &x2, &y2); + TBOX new_box1(x1, height - y2, x2, height - y1); + projectiveXformSampledPt(box_coeffs, box.left(), height - box.bottom(), + &x1, &y1); + projectiveXformSampledPt(box_coeffs, box.right(), height - box.top(), &x2, + &y2); + TBOX new_box2(x1, height - y1, x2, height - y2); + (*boxes)[b] = new_box1.bounding_union(new_box2); + } + } + free(im_coeffs); + free(box_coeffs); +} + +// Computes the coefficients of a randomized projective transformation. +// The image transform requires backward transformation coefficient, and the +// box transform the forward coefficients. +// Returns the incolor arg to pixProjective. +int ProjectiveCoeffs(int width, int height, TRand* randomizer, + float** im_coeffs, float** box_coeffs) { + // Setup "from" points. + Pta* src_pts = ptaCreate(4); + ptaAddPt(src_pts, 0.0f, 0.0f); + ptaAddPt(src_pts, width, 0.0f); + ptaAddPt(src_pts, width, height); + ptaAddPt(src_pts, 0.0f, height); + // Extract factors from pseudo-random sequence. + float factors[FN_NUM_FACTORS]; + float shear = 0.0f; // Shear is signed. + for (int i = 0; i < FN_NUM_FACTORS; ++i) { + // Everything is squared to make wild values rarer. + if (i == FN_SHEAR) { + // Shear is signed. + shear = randomizer->SignedRand(0.5 / 3.0); + shear = shear >= 0.0 ? shear * shear : -shear * shear; + // Keep the sheared points within the original rectangle. + if (shear < -factors[FN_X0]) shear = -factors[FN_X0]; + if (shear > factors[FN_X1]) shear = factors[FN_X1]; + factors[i] = shear; + } else if (i != FN_INCOLOR) { + factors[i] = fabs(randomizer->SignedRand(1.0)); + if (i <= FN_Y3) + factors[i] *= 5.0 / 8.0; + else + factors[i] *= 0.5; + factors[i] *= factors[i]; + } + } + // Setup "to" points. 
+ Pta* dest_pts = ptaCreate(4); + ptaAddPt(dest_pts, factors[FN_X0] * width, factors[FN_Y0] * height); + ptaAddPt(dest_pts, (1.0f - factors[FN_X1]) * width, factors[FN_Y1] * height); + ptaAddPt(dest_pts, (1.0f - factors[FN_X1] + shear) * width, + (1 - factors[FN_Y2]) * height); + ptaAddPt(dest_pts, (factors[FN_X0] + shear) * width, + (1 - factors[FN_Y3]) * height); + getProjectiveXformCoeffs(dest_pts, src_pts, im_coeffs); + getProjectiveXformCoeffs(src_pts, dest_pts, box_coeffs); + ptaDestroy(&src_pts); + ptaDestroy(&dest_pts); + return factors[FN_INCOLOR] > 0.5f ? L_BRING_IN_WHITE : L_BRING_IN_BLACK; +} + } // namespace tesseract diff --git a/training/degradeimage.h b/training/degradeimage.h index 2add6282..85e35f0a 100644 --- a/training/degradeimage.h +++ b/training/degradeimage.h @@ -20,20 +20,42 @@ #ifndef TESSERACT_TRAINING_DEGRADEIMAGE_H_ #define TESSERACT_TRAINING_DEGRADEIMAGE_H_ -struct Pix; +#include "allheaders.h" +#include "genericvector.h" +#include "helpers.h" // For TRand. +#include "rect.h" namespace tesseract { -class TRand; - // Degrade the pix as if by a print/copy/scan cycle with exposure > 0 // corresponding to darkening on the copier and <0 lighter and 0 not copied. -// If rotation is not NULL, the clockwise rotation in radians is saved there. +// If rotation is not nullptr, the clockwise rotation in radians is saved there. // The input pix must be 8 bit grey. (Binary with values 0 and 255 is OK.) // The input image is destroyed and a different image returned. struct Pix* DegradeImage(struct Pix* input, int exposure, TRand* randomizer, float* rotation); +// Creates and returns a Pix distorted by various means according to the bool +// flags. If boxes is not nullptr, the boxes are resized/positioned according to +// any spatial distortion and also by the integer reduction factor box_scale +// so they will match what the network will output. +// Returns nullptr on error. The returned Pix must be pixDestroyed. 
+Pix* PrepareDistortedPix(const Pix* pix, bool perspective, bool invert, + bool white_noise, bool smooth_noise, bool blur, + int box_reduction, TRand* randomizer, + GenericVector* boxes); +// Distorts anything that has a non-null pointer with the same pseudo-random +// perspective distortion. Width and height only need to be set if there +// is no pix. If there is a pix, then they will be taken from there. +void GeneratePerspectiveDistortion(int width, int height, TRand* randomizer, + Pix** pix, GenericVector* boxes); +// Computes the coefficients of a randomized projective transformation. +// The image transform requires backward transformation coefficient, and the +// box transform the forward coefficients. +// Returns the incolor arg to pixProjective. +int ProjectiveCoeffs(int width, int height, TRand* randomizer, + float** im_coeffs, float** box_coeffs); + } // namespace tesseract #endif // TESSERACT_TRAINING_DEGRADEIMAGE_H_ diff --git a/training/fileio.cpp b/training/fileio.cpp index e3e43bd0..8e1324ae 100644 --- a/training/fileio.cpp +++ b/training/fileio.cpp @@ -15,7 +15,6 @@ * **********************************************************************/ #ifdef _WIN32 -#include #ifndef unlink #include #endif @@ -43,7 +42,7 @@ FILE* File::Open(const string& filename, const string& mode) { FILE* File::OpenOrDie(const string& filename, const string& mode) { FILE* stream = fopen(filename.c_str(), mode.c_str()); - if (stream == NULL) { + if (stream == nullptr) { tprintf("Unable to open '%s' in mode '%s'\n", filename.c_str(), mode.c_str()); } @@ -53,7 +52,7 @@ FILE* File::OpenOrDie(const string& filename, void File::WriteStringToFileOrDie(const string& str, const string& filename) { FILE* stream = fopen(filename.c_str(), "wb"); - if (stream == NULL) { + if (stream == nullptr) { tprintf("Unable to open '%s' for writing\n", filename.c_str()); return; } @@ -63,7 +62,7 @@ void File::WriteStringToFileOrDie(const string& str, bool File::Readable(const string& filename) { 
FILE* stream = fopen(filename.c_str(), "rb"); - if (stream == NULL) { + if (stream == nullptr) { return false; } fclose(stream); @@ -72,8 +71,7 @@ bool File::Readable(const string& filename) { bool File::ReadFileToString(const string& filename, string* out) { FILE* stream = File::Open(filename.c_str(), "rb"); - if (stream == NULL) - return false; + if (stream == nullptr) return false; InputBuffer in(stream); *out = ""; in.Read(out); @@ -81,8 +79,9 @@ bool File::ReadFileToString(const string& filename, string* out) { } string File::JoinPath(const string& prefix, const string& suffix) { - return (!prefix.size() || prefix[prefix.size() - 1] == '/') ? - prefix + suffix : prefix + "/" + suffix; + return (prefix.empty() || prefix[prefix.size() - 1] == '/') + ? prefix + suffix + : prefix + "/" + suffix; } bool File::Delete(const char* pathname) { @@ -113,8 +112,8 @@ bool File::DeleteMatchingFiles(const char* pattern) { glob_t pglob; char **paths; bool all_deleted = true; - if (glob(pattern, 0, NULL, &pglob) == 0) { - for (paths = pglob.gl_pathv; *paths != NULL; paths++) { + if (glob(pattern, 0, nullptr, &pglob) == 0) { + for (paths = pglob.gl_pathv; *paths != nullptr; paths++) { all_deleted &= File::Delete(*paths); } globfree(&pglob); @@ -141,7 +140,7 @@ InputBuffer::InputBuffer(FILE* stream, size_t) } InputBuffer::~InputBuffer() { - if (stream_ != NULL) { + if (stream_ != nullptr) { fclose(stream_); } } @@ -162,7 +161,7 @@ bool InputBuffer::Read(string* out) { bool InputBuffer::CloseFile() { int ret = fclose(stream_); - stream_ = NULL; + stream_ = nullptr; return ret == 0; } @@ -179,7 +178,7 @@ OutputBuffer::OutputBuffer(FILE* stream, size_t) } OutputBuffer::~OutputBuffer() { - if (stream_ != NULL) { + if (stream_ != nullptr) { fclose(stream_); } } @@ -190,7 +189,7 @@ void OutputBuffer::WriteString(const string& str) { bool OutputBuffer::CloseFile() { int ret = fclose(stream_); - stream_ = NULL; + stream_ = nullptr; return ret == 0; } diff --git 
a/training/language-specific.sh b/training/language-specific.sh index a62f1e3c..24901d6c 100755 --- a/training/language-specific.sh +++ b/training/language-specific.sh @@ -1,4 +1,14 @@ # +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# http://www.apache.org/licenses/LICENSE-2.0 +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# # Set some language specific variables. Works in conjunction with # tesstrain.sh # @@ -868,6 +878,9 @@ set_lang_specific_parameters() { AMBIGS_FILTER_DENOMINATOR="100000" LEADING="32" MEAN_COUNT="40" # Default for latin script. + # Language to mix with the language for maximum accuracy. Defaults to eng. + # If no language is good, set to the base language. + MIX_LANG="eng" case ${lang} in # Latin languages. @@ -959,11 +972,13 @@ set_lang_specific_parameters() { WORD_DAWG_SIZE=1000000 test -z "$FONTS" && FONTS=( "${EARLY_LATIN_FONTS[@]}" );; - # Cyrillic script-based languages. + # Cyrillic script-based languages. It is bad to mix Latin with Cyrillic. 
rus ) test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" ) + MIX_LANG="rus" NUMBER_DAWG_FACTOR=0.05 WORD_DAWG_SIZE=1000000 ;; aze_cyrl | bel | bul | kaz | mkd | srp | tgk | ukr | uzb_cyrl ) + MIX_LANG="${lang}" test -z "$FONTS" && FONTS=( "${RUSSIAN_FONTS[@]}" ) ;; # Special code for performing Cyrillic language-id that is trained on @@ -1115,7 +1130,7 @@ set_lang_specific_parameters() { TRAINING_DATA_ARGUMENTS=" --infrequent_ratio=100" ;; kur ) test -z "$FONTS" && FONTS=( "${KURDISH_FONTS[@]}" ) ;; - *) err "Error: ${lang} is not a valid language code" + *) err_exit "Error: ${lang} is not a valid language code" esac if [[ ${FLAGS_mean_count} -gt 0 ]]; then TRAINING_DATA_ARGUMENTS+=" --mean_count=${FLAGS_mean_count}" diff --git a/training/ligature_table.cpp b/training/ligature_table.cpp index fabed602..e4397855 100644 --- a/training/ligature_table.cpp +++ b/training/ligature_table.cpp @@ -46,11 +46,11 @@ const int kMinLigature = 0xfb00; const int kMaxLigature = 0xfb17; // Don't put the wide Hebrew letters in. /* static */ -SmartPtr LigatureTable::instance_; +std::unique_ptr LigatureTable::instance_; /* static */ LigatureTable* LigatureTable::Get() { - if (instance_ == NULL) { + if (instance_ == nullptr) { instance_.reset(new LigatureTable()); instance_->Init(); } @@ -93,7 +93,7 @@ void LigatureTable::Init() { } } // Add custom extra ligatures. 
- for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) { + for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) { norm_to_lig_table_[UNICHARSET::kCustomLigatures[i][0]] = UNICHARSET::kCustomLigatures[i][1]; int norm_length = strlen(UNICHARSET::kCustomLigatures[i][0]); @@ -138,8 +138,8 @@ string LigatureTable::RemoveCustomLigatures(const string& str) const { len = it.get_utf8(tmp); tmp[len] = '\0'; norm_ind = -1; - for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL && norm_ind < 0; - ++i) { + for (int i = 0; + UNICHARSET::kCustomLigatures[i][0] != nullptr && norm_ind < 0; ++i) { if (!strcmp(tmp, UNICHARSET::kCustomLigatures[i][1])) { norm_ind = i; } diff --git a/training/ligature_table.h b/training/ligature_table.h index ecae7943..f3516259 100644 --- a/training/ligature_table.h +++ b/training/ligature_table.h @@ -22,9 +22,10 @@ #ifndef TRAININGDATA_LIGATURE_TABLE_H_ #define TRAININGDATA_LIGATURE_TABLE_H_ +#include #include +#include -#include "hashfn.h" #include "util.h" namespace tesseract { @@ -32,7 +33,7 @@ namespace tesseract { class PangoFontInfo; // defined in pango_font_info.h // Map to substitute strings for ligatures. -typedef hash_map LigHash; +typedef std::unordered_map LigHash; class LigatureTable { public: @@ -61,7 +62,7 @@ class LigatureTable { // corresponding ligature characters. void Init(); - static SmartPtr instance_; + static std::unique_ptr instance_; LigHash norm_to_lig_table_; LigHash lig_to_norm_table_; int min_lig_length_; diff --git a/training/lstmeval.cpp b/training/lstmeval.cpp new file mode 100644 index 00000000..aa990e23 --- /dev/null +++ b/training/lstmeval.cpp @@ -0,0 +1,58 @@ +/////////////////////////////////////////////////////////////////////// +// File: lstmeval.cpp +// Description: Evaluation program for LSTM-based networks. +// Author: Ray Smith +// Created: Wed Nov 23 12:20:06 PST 2016 +// +// (C) Copyright 2016, Google Inc. 
+// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifndef USE_STD_NAMESPACE +#include "base/commandlineflags.h" +#endif +#include "commontraining.h" +#include "genericvector.h" +#include "lstmtester.h" +#include "strngs.h" +#include "tprintf.h" + +STRING_PARAM_FLAG(model, "", "Name of model file (training or recognition)"); +STRING_PARAM_FLAG(eval_listfile, "", + "File listing sample files in lstmf training format."); +INT_PARAM_FLAG(max_image_MB, 2000, "Max memory to use for images."); + +int main(int argc, char **argv) { + ParseArguments(&argc, &argv); + if (FLAGS_model.empty()) { + tprintf("Must provide a --model!\n"); + return 1; + } + if (FLAGS_eval_listfile.empty()) { + tprintf("Must provide a --eval_listfile!\n"); + return 1; + } + GenericVector model_data; + if (!tesseract::LoadDataFromFile(FLAGS_model.c_str(), &model_data)) { + tprintf("Failed to load model from: %s\n", FLAGS_eval_listfile.c_str()); + return 1; + } + tesseract::LSTMTester tester(static_cast(FLAGS_max_image_MB) * + 1048576); + if (!tester.LoadAllEvalData(FLAGS_eval_listfile.c_str())) { + tprintf("Failed to load eval data from: %s\n", FLAGS_eval_listfile.c_str()); + return 1; + } + double errs = 0.0; + STRING result = tester.RunEvalSync(0, &errs, model_data, 0); + tprintf("%s\n", result.string()); + return 0; +} /* main */ diff --git a/training/lstmtester.cpp b/training/lstmtester.cpp new file mode 100644 index 
00000000..df37ebd7 --- /dev/null +++ b/training/lstmtester.cpp @@ -0,0 +1,146 @@ +/////////////////////////////////////////////////////////////////////// +// File: lstmtester.cpp +// Description: Top-level line evaluation class for LSTM-based networks. +// Author: Ray Smith +// Created: Wed Nov 23 11:18:06 PST 2016 +// +// (C) Copyright 2016, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#include "lstmtester.h" +#include "genericvector.h" + +namespace tesseract { + +LSTMTester::LSTMTester(inT64 max_memory) + : test_data_(max_memory), total_pages_(0), async_running_(false) {} + +// Loads a set of lstmf files that were created using the lstm.train config to +// tesseract into memory ready for testing. Returns false if nothing was +// loaded. The arg is a filename of a file that lists the filenames. +bool LSTMTester::LoadAllEvalData(const STRING& filenames_file) { + GenericVector filenames; + if (!LoadFileLinesToStrings(filenames_file, &filenames)) { + tprintf("Failed to load list of eval filenames from %s\n", + filenames_file.string()); + return false; + } + return LoadAllEvalData(filenames); +} + +// Loads a set of lstmf files that were created using the lstm.train config to +// tesseract into memory ready for testing. Returns false if nothing was +// loaded. 
+bool LSTMTester::LoadAllEvalData(const GenericVector& filenames) { + test_data_.Clear(); + bool result = + test_data_.LoadDocuments(filenames, "eng", CS_SEQUENTIAL, nullptr); + total_pages_ = test_data_.TotalPages(); + return result; +} + +// Runs an evaluation asynchronously on the stored data and returns a string +// describing the results of the previous test. +STRING LSTMTester::RunEvalAsync(int iteration, const double* training_errors, + const GenericVector& model_data, + int training_stage) { + STRING result; + if (total_pages_ == 0) { + result.add_str_int("No test data at iteration", iteration); + return result; + } + if (!LockIfNotRunning()) { + result.add_str_int("Previous test incomplete, skipping test at iteration", + iteration); + return result; + } + // Save the args. + STRING prev_result = test_result_; + test_result_ = ""; + if (training_errors != nullptr) { + test_iteration_ = iteration; + test_training_errors_ = training_errors; + test_model_data_ = model_data; + test_training_stage_ = training_stage; + SVSync::StartThread(&LSTMTester::ThreadFunc, this); + } else { + UnlockRunning(); + } + return prev_result; +} + +// Runs an evaluation synchronously on the stored data and returns a string +// describing the results. 
+STRING LSTMTester::RunEvalSync(int iteration, const double* training_errors, + const GenericVector& model_data, + int training_stage) { + LSTMTrainer trainer; + if (!trainer.ReadTrainingDump(model_data, &trainer)) { + return "Deserialize failed"; + } + int eval_iteration = 0; + double char_error = 0.0; + double word_error = 0.0; + int error_count = 0; + while (error_count < total_pages_) { + const ImageData* trainingdata = test_data_.GetPageBySerial(eval_iteration); + trainer.SetIteration(++eval_iteration); + NetworkIO fwd_outputs, targets; + if (trainer.PrepareForBackward(trainingdata, &fwd_outputs, &targets) != + UNENCODABLE) { + char_error += trainer.NewSingleError(tesseract::ET_CHAR_ERROR); + word_error += trainer.NewSingleError(tesseract::ET_WORD_RECERR); + ++error_count; + } + } + char_error *= 100.0 / total_pages_; + word_error *= 100.0 / total_pages_; + STRING result; + result.add_str_int("At iteration ", iteration); + result.add_str_int(", stage ", training_stage); + result.add_str_double(", Eval Char error rate=", char_error); + result.add_str_double(", Word error rate=", word_error); + return result; +} + +// Static helper thread function for RunEvalAsync, with a specific signature +// required by SVSync::StartThread. Actually a member function pretending to +// be static, its arg is a this pointer that it will cast back to LSTMTester* +// to call RunEvalSync using the stored args that RunEvalAsync saves in *this. +// LockIfNotRunning must have returned true before calling ThreadFunc, and +// it will call UnlockRunning to release the lock after RunEvalSync completes. 
+/* static */ +void* LSTMTester::ThreadFunc(void* lstmtester_void) { + LSTMTester* lstmtester = reinterpret_cast(lstmtester_void); + lstmtester->test_result_ = lstmtester->RunEvalSync( + lstmtester->test_iteration_, lstmtester->test_training_errors_, + lstmtester->test_model_data_, lstmtester->test_training_stage_); + lstmtester->UnlockRunning(); + return lstmtester_void; +} + +// Returns true if there is currently nothing running, and takes the lock +// if there is nothing running. +bool LSTMTester::LockIfNotRunning() { + SVAutoLock lock(&running_mutex_); + if (async_running_) return false; + async_running_ = true; + return true; +} + +// Releases the running lock. +void LSTMTester::UnlockRunning() { + SVAutoLock lock(&running_mutex_); + async_running_ = false; +} + +} // namespace tesseract diff --git a/training/lstmtester.h b/training/lstmtester.h new file mode 100644 index 00000000..3b4cb05e --- /dev/null +++ b/training/lstmtester.h @@ -0,0 +1,94 @@ +/////////////////////////////////////////////////////////////////////// +// File: lstmtester.h +// Description: Top-level line evaluation class for LSTM-based networks. +// Author: Ray Smith +// Created: Wed Nov 23 11:05:06 PST 2016 +// +// (C) Copyright 2016, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+/////////////////////////////////////////////////////////////////////// + +#ifndef TESSERACT_TRAINING_LSTMTESTER_H_ +#define TESSERACT_TRAINING_LSTMTESTER_H_ + +#include "genericvector.h" +#include "lstmtrainer.h" +#include "strngs.h" +#include "svutil.h" + +namespace tesseract { + +class LSTMTester { + public: + LSTMTester(inT64 max_memory); + + // Loads a set of lstmf files that were created using the lstm.train config to + // tesseract into memory ready for testing. Returns false if nothing was + // loaded. The arg is a filename of a file that lists the filenames, with one + // name per line. Conveniently, tesstrain.sh generates such a file, along + // with the files themselves. + bool LoadAllEvalData(const STRING& filenames_file); + // Loads a set of lstmf files that were created using the lstm.train config to + // tesseract into memory ready for testing. Returns false if nothing was + // loaded. + bool LoadAllEvalData(const GenericVector& filenames); + + // Runs an evaluation asynchronously on the stored eval data and returns a + // string describing the results of the previous test. Args match TestCallback + // declared in lstmtrainer.h: + // iteration: Current learning iteration number. + // training_errors: If not null, is an array of size ET_COUNT, indexed by + // the ErrorTypes enum and indicates the current errors measured by the + // trainer, and this is a serious request to run an evaluation. If null, + // then the caller is just polling for the results of the previous eval. + // model_data: is the model to evaluate, which should be a serialized + // LSTMTrainer. + // training_stage: an arbitrary number on the progress of training. + STRING RunEvalAsync(int iteration, const double* training_errors, + const GenericVector& model_data, + int training_stage); + // Runs an evaluation synchronously on the stored eval data and returns a + // string describing the results. Args as RunEvalAsync. 
+ STRING RunEvalSync(int iteration, const double* training_errors, + const GenericVector& model_data, int training_stage); + + private: + // Static helper thread function for RunEvalAsync, with a specific signature + // required by SVSync::StartThread. Actually a member function pretending to + // be static, its arg is a this pointer that it will cast back to LSTMTester* + // to call RunEvalSync using the stored args that RunEvalAsync saves in *this. + // LockIfNotRunning must have returned true before calling ThreadFunc, and + // it will call UnlockRunning to release the lock after RunEvalSync completes. + static void* ThreadFunc(void* lstmtester_void); + // Returns true if there is currently nothing running, and takes the lock + // if there is nothing running. + bool LockIfNotRunning(); + // Releases the running lock. + void UnlockRunning(); + + // The data to test with. + DocumentCache test_data_; + int total_pages_; + // Flag that indicates an asynchronous test is currently running. + // Protected by running_mutex_. + bool async_running_; + SVMutex running_mutex_; + // Stored copies of the args for use while running asynchronously. + int test_iteration_; + const double* test_training_errors_; + GenericVector test_model_data_; + int test_training_stage_; + STRING test_result_; +}; + +} // namespace tesseract + +#endif // TESSERACT_TRAINING_LSTMTESTER_H_ diff --git a/training/lstmtraining.cpp b/training/lstmtraining.cpp new file mode 100644 index 00000000..c0252996 --- /dev/null +++ b/training/lstmtraining.cpp @@ -0,0 +1,211 @@ +/////////////////////////////////////////////////////////////////////// +// File: lstmtraining.cpp +// Description: Training program for LSTM-based networks. +// Author: Ray Smith +// Created: Fri May 03 11:05:06 PST 2013 +// +// (C) Copyright 2013, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +/////////////////////////////////////////////////////////////////////// + +#ifndef USE_STD_NAMESPACE +#include "base/commandlineflags.h" +#endif +#include "commontraining.h" +#include "lstmtester.h" +#include "lstmtrainer.h" +#include "params.h" +#include "strngs.h" +#include "tprintf.h" +#include "unicharset_training_utils.h" + +INT_PARAM_FLAG(debug_interval, 0, "How often to display the alignment."); +STRING_PARAM_FLAG(net_spec, "", "Network specification"); +INT_PARAM_FLAG(train_mode, 80, "Controls gross training behavior."); +INT_PARAM_FLAG(net_mode, 192, "Controls network behavior."); +INT_PARAM_FLAG(perfect_sample_delay, 4, + "How many imperfect samples between perfect ones."); +DOUBLE_PARAM_FLAG(target_error_rate, 0.01, "Final error rate in percent."); +DOUBLE_PARAM_FLAG(weight_range, 0.1, "Range of initial random weights."); +DOUBLE_PARAM_FLAG(learning_rate, 1.0e-4, "Weight factor for new deltas."); +DOUBLE_PARAM_FLAG(momentum, 0.9, "Decay factor for repeating deltas."); +INT_PARAM_FLAG(max_image_MB, 6000, "Max memory to use for images."); +STRING_PARAM_FLAG(continue_from, "", "Existing model to extend"); +STRING_PARAM_FLAG(model_output, "lstmtrain", "Basename for output models"); +STRING_PARAM_FLAG(script_dir, "", + "Required to set unicharset properties or" + " use unicharset compression."); +STRING_PARAM_FLAG(train_listfile, "", + "File listing training files in lstmf training format."); +STRING_PARAM_FLAG(eval_listfile, "", + "File listing eval files in lstmf training format."); +BOOL_PARAM_FLAG(stop_training, false, + "Just convert the 
training model to a runtime model."); +INT_PARAM_FLAG(append_index, -1, "Index in continue_from Network at which to" + " attach the new network defined by net_spec"); +BOOL_PARAM_FLAG(debug_network, false, + "Get info on distribution of weight values"); +INT_PARAM_FLAG(max_iterations, 0, "If set, exit after this many iterations"); +DECLARE_STRING_PARAM_FLAG(U); + +// Number of training images to train between calls to MaintainCheckpoints. +const int kNumPagesPerBatch = 100; + +// Apart from command-line flags, input is a collection of lstmf files, that +// were previously created using tesseract with the lstm.train config file. +// The program iterates over the inputs, feeding the data to the network, +// until the error rate reaches a specified target or max_iterations is reached. +int main(int argc, char **argv) { + ParseArguments(&argc, &argv); + // Purify the model name in case it is based on the network string. + if (FLAGS_model_output.empty()) { + tprintf("Must provide a --model_output!\n"); + return 1; + } + STRING model_output = FLAGS_model_output.c_str(); + for (int i = 0; i < model_output.length(); ++i) { + if (model_output[i] == '[' || model_output[i] == ']') + model_output[i] = '-'; + if (model_output[i] == '(' || model_output[i] == ')') + model_output[i] = '_'; + } + // Setup the trainer. + STRING checkpoint_file = FLAGS_model_output.c_str(); + checkpoint_file += "_checkpoint"; + STRING checkpoint_bak = checkpoint_file + ".bak"; + tesseract::LSTMTrainer trainer( + nullptr, nullptr, nullptr, nullptr, FLAGS_model_output.c_str(), + checkpoint_file.c_str(), FLAGS_debug_interval, + static_cast(FLAGS_max_image_MB) * 1048576); + + // Reading something from an existing model doesn't require many flags, + // so do it now and exit. 
+ if (FLAGS_stop_training || FLAGS_debug_network) { + if (!trainer.TryLoadingCheckpoint(FLAGS_continue_from.c_str())) { + tprintf("Failed to read continue from: %s\n", + FLAGS_continue_from.c_str()); + return 1; + } + if (FLAGS_debug_network) { + trainer.DebugNetwork(); + } else { + if (FLAGS_train_mode & tesseract::TF_INT_MODE) + trainer.ConvertToInt(); + GenericVector recognizer_data; + trainer.SaveRecognitionDump(&recognizer_data); + if (!tesseract::SaveDataToFile(recognizer_data, + FLAGS_model_output.c_str())) { + tprintf("Failed to write recognition model : %s\n", + FLAGS_model_output.c_str()); + } + } + return 0; + } + + // Get the list of files to process. + if (FLAGS_train_listfile.empty()) { + tprintf("Must supply a list of training filenames! --train_listfile\n"); + return 1; + } + GenericVector filenames; + if (!tesseract::LoadFileLinesToStrings(FLAGS_train_listfile.c_str(), + &filenames)) { + tprintf("Failed to load list of training filenames from %s\n", + FLAGS_train_listfile.c_str()); + return 1; + } + + UNICHARSET unicharset; + // Checkpoints always take priority if they are available. + if (trainer.TryLoadingCheckpoint(checkpoint_file.string()) || + trainer.TryLoadingCheckpoint(checkpoint_bak.string())) { + tprintf("Successfully restored trainer from %s\n", + checkpoint_file.string()); + } else { + if (!FLAGS_continue_from.empty()) { + // Load a past model file to improve upon. + if (!trainer.TryLoadingCheckpoint(FLAGS_continue_from.c_str())) { + tprintf("Failed to continue from: %s\n", FLAGS_continue_from.c_str()); + return 1; + } + tprintf("Continuing from %s\n", FLAGS_continue_from.c_str()); + trainer.InitIterations(); + } + if (FLAGS_continue_from.empty() || FLAGS_append_index >= 0) { + // We need a unicharset to start from scratch or append. + string unicharset_str; + // Character coding to be used by the classifier. 
+ if (!unicharset.load_from_file(FLAGS_U.c_str())) { + tprintf("Error: must provide a -U unicharset!\n"); + return 1; + } + tesseract::SetupBasicProperties(true, &unicharset); + if (FLAGS_append_index >= 0) { + tprintf("Appending a new network to an old one!!"); + if (FLAGS_continue_from.empty()) { + tprintf("Must set --continue_from for appending!\n"); + return 1; + } + } + // We are initializing from scratch. + trainer.InitCharSet(unicharset, FLAGS_script_dir.c_str(), + FLAGS_train_mode); + if (!trainer.InitNetwork(FLAGS_net_spec.c_str(), FLAGS_append_index, + FLAGS_net_mode, FLAGS_weight_range, + FLAGS_learning_rate, FLAGS_momentum)) { + tprintf("Failed to create network from spec: %s\n", + FLAGS_net_spec.c_str()); + return 1; + } + trainer.set_perfect_delay(FLAGS_perfect_sample_delay); + } + } + if (!trainer.LoadAllTrainingData(filenames)) { + tprintf("Load of images failed!!\n"); + return 1; + } + + bool best_dumped = true; + char* best_model_dump = nullptr; + size_t best_model_size = 0; + STRING best_model_name; + tesseract::LSTMTester tester(static_cast(FLAGS_max_image_MB) * + 1048576); + tesseract::TestCallback tester_callback = nullptr; + if (!FLAGS_eval_listfile.empty()) { + if (!tester.LoadAllEvalData(FLAGS_eval_listfile.c_str())) { + tprintf("Failed to load eval data from: %s\n", + FLAGS_eval_listfile.c_str()); + return 1; + } + tester_callback = + NewPermanentTessCallback(&tester, &tesseract::LSTMTester::RunEvalAsync); + } + do { + // Train a few. 
+ int iteration = trainer.training_iteration(); + for (int target_iteration = iteration + kNumPagesPerBatch; + iteration < target_iteration; + iteration = trainer.training_iteration()) { + trainer.TrainOnLine(&trainer, false); + } + STRING log_str; + trainer.MaintainCheckpoints(tester_callback, &log_str); + tprintf("%s\n", log_str.string()); + } while (trainer.best_error_rate() > FLAGS_target_error_rate && + (trainer.training_iteration() < FLAGS_max_iterations || + FLAGS_max_iterations == 0)); + delete tester_callback; + tprintf("Finished! Error rate = %g\n", trainer.best_error_rate()); + return 0; +} /* main */ + + diff --git a/training/merge_unicharsets.cpp b/training/merge_unicharsets.cpp new file mode 100644 index 00000000..60adf198 --- /dev/null +++ b/training/merge_unicharsets.cpp @@ -0,0 +1,52 @@ +/////////////////////////////////////////////////////////////////////// +// File: merge_unicharsets.cpp +// Description: Simple tool to merge two or more unicharsets. +// Author: Ray Smith +// Created: Wed Sep 30 16:09:01 PDT 2015 +// +// (C) Copyright 2015, Google Inc. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +/////////////////////////////////////////////////////////////////////// + +#include +#include "unicharset.h" + +int main(int argc, char** argv) { + // Print usage + if (argc < 4) { + printf("Usage: %s unicharset-in-1 ... 
unicharset-in-n unicharset-out\n", + argv[0]); + exit(1); + } + + UNICHARSET input_unicharset, result_unicharset; + for (int arg = 1; arg < argc - 1; ++arg) { + // Load the input unicharset + if (input_unicharset.load_from_file(argv[arg])) { + printf("Loaded unicharset of size %d from file %s\n", + input_unicharset.size(), argv[arg]); + result_unicharset.AppendOtherUnicharset(input_unicharset); + } else { + printf("Failed to load unicharset from file %s!!\n", argv[arg]); + exit(1); + } + } + + // Save the combined unicharset. + if (result_unicharset.save_to_file(argv[argc - 1])) { + printf("Wrote unicharset file %s.\n", argv[argc - 1]); + } else { + printf("Cannot save unicharset file %s.\n", argv[argc - 1]); + exit(1); + } + return 0; +} diff --git a/training/mergenf.cpp b/training/mergenf.cpp index 1f7c9a41..4c8e7037 100644 --- a/training/mergenf.cpp +++ b/training/mergenf.cpp @@ -33,7 +33,6 @@ #include #include - /*-------------------once in subfeat---------------------------------*/ double_VAR(training_angle_match_scale, 1.0, "Angle Match Scale ..."); @@ -199,9 +198,9 @@ int FindClosestExistingProto(CLASS_TYPE Class, int NumMerged[], * * @param New new proto to be filled in * @param Old old proto to be converted - * + * * Globals: none - * + * * Exceptions: none * History: Mon Nov 26 09:45:39 1990, DSJ, Created. */ diff --git a/training/mergenf.h b/training/mergenf.h index d0920a63..384d60ca 100644 --- a/training/mergenf.h +++ b/training/mergenf.h @@ -15,6 +15,10 @@ ** See the License for the specific language governing permissions and ** limitations under the License. 
******************************************************************************/ + +#ifndef TESSERACT_TRAINING_MERGENF_H_ +#define TESSERACT_TRAINING_MERGENF_H_ + /**---------------------------------------------------------------------------- Include Files and Type Defines ----------------------------------------------------------------------------**/ @@ -95,3 +99,5 @@ BOOL8 PointInside ( FRECT *Rectangle, FLOAT32 X, FLOAT32 Y); + +#endif // TESSERACT_TRAINING_MERGENF_H_ diff --git a/training/mftraining.cpp b/training/mftraining.cpp index 60314a1c..804b6397 100644 --- a/training/mftraining.cpp +++ b/training/mftraining.cpp @@ -64,9 +64,6 @@ #include "tprintf.h" #include "unicity_table.h" -using tesseract::Classify; -using tesseract::FontInfo; -using tesseract::FontSpacingInfo; using tesseract::IndexMapBiDi; using tesseract::MasterTrainer; using tesseract::Shape; @@ -151,7 +148,7 @@ static LIST ClusterOneConfig(int shape_id, const char* class_label, clusterer->SampleSize); FreeClusterer(clusterer); MERGE_CLASS merge_class = FindClass(mf_classes, class_label); - if (merge_class == NULL) { + if (merge_class == nullptr) { merge_class = NewLabeledClass(class_label); mf_classes = push(mf_classes, merge_class); } @@ -241,15 +238,14 @@ static void SetupConfigMap(ShapeTable* shape_table, IndexMapBiDi* config_map) { int main (int argc, char **argv) { ParseArguments(&argc, &argv); - ShapeTable* shape_table = NULL; + ShapeTable* shape_table = nullptr; STRING file_prefix; // Load the training data. MasterTrainer* trainer = tesseract::LoadTrainingData(argc, argv, false, &shape_table, &file_prefix); - if (trainer == NULL) - return 1; // Failed. + if (trainer == nullptr) return 1; // Failed. // Setup an index mapping from the shapes in the shape table to the classes // that will be trained. 
In keeping with the original design, each shape @@ -305,6 +301,9 @@ int main (int argc, char **argv) { *shape_table, float_classes, inttemp_file.string(), pffmtable_file.string()); + for (int c = 0; c < unicharset->size(); ++c) { + FreeClassFields(&float_classes[c]); + } delete [] float_classes; FreeLabeledClassList(mf_classes); delete trainer; diff --git a/training/normstrngs.cpp b/training/normstrngs.cpp index acffeee1..fa2fbb92 100644 --- a/training/normstrngs.cpp +++ b/training/normstrngs.cpp @@ -48,7 +48,7 @@ void UTF32ToUTF8(const GenericVector& str32, STRING* utf8_str) { for (int i = 0; i < str32.length(); ++i) { UNICHAR uni_ch(str32[i]); char *utf8 = uni_ch.utf8_str(); - if (utf8 != NULL) { + if (utf8 != nullptr) { (*utf8_str) += utf8; delete[] utf8; } @@ -113,12 +113,12 @@ bool is_double_quote(const char32 ch) { return false; } -STRING NormalizeUTF8String(const char* str8) { +STRING NormalizeUTF8String(bool decompose, const char* str8) { GenericVector str32, out_str32, norm_str; UTF8ToUTF32(str8, &str32); for (int i = 0; i < str32.length(); ++i) { norm_str.clear(); - NormalizeChar32(str32[i], &norm_str); + NormalizeChar32(str32[i], decompose, &norm_str); for (int j = 0; j < norm_str.length(); ++j) { out_str32.push_back(norm_str[j]); } @@ -128,10 +128,11 @@ STRING NormalizeUTF8String(const char* str8) { return out_str8; } -void NormalizeChar32(char32 ch, GenericVector* str) { +void NormalizeChar32(char32 ch, bool decompose, GenericVector* str) { IcuErrorCode error_code; const icu::Normalizer2* nfkc = icu::Normalizer2::getInstance( - NULL, "nfkc", UNORM2_COMPOSE, error_code); + nullptr, "nfkc", decompose ? 
UNORM2_DECOMPOSE : UNORM2_COMPOSE, + error_code); error_code.assertSuccess(); error_code.reset(); diff --git a/training/normstrngs.h b/training/normstrngs.h index 71e7b8da..6fca3193 100644 --- a/training/normstrngs.h +++ b/training/normstrngs.h @@ -39,11 +39,16 @@ void UTF32ToUTF8(const GenericVector& str32, STRING* utf8_str); // assumption of this function is that the input is already as fully composed // as it can be, but may require some compatibility normalizations or just // OCR evaluation related normalizations. -void NormalizeChar32(char32 ch, GenericVector* str); +void NormalizeChar32(char32 ch, bool decompose, GenericVector* str); // Normalize a UTF8 string. Same as above, but for UTF8-encoded strings, that // can contain multiple UTF32 code points. -STRING NormalizeUTF8String(const char* str8); +STRING NormalizeUTF8String(bool decompose, const char* str8); +// Default behavior is to compose, until it is proven that decomposed benefits +// at least one language. +inline STRING NormalizeUTF8String(const char* str8) { + return NormalizeUTF8String(false, str8); +} // Apply just the OCR-specific normalizations and return the normalized char. char32 OCRNormalize(char32 ch); diff --git a/training/pango_font_info.cpp b/training/pango_font_info.cpp index c2b508c0..0481c2b7 100644 --- a/training/pango_font_info.cpp +++ b/training/pango_font_info.cpp @@ -60,15 +60,6 @@ STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp", "Overrides fontconfig default temporary dir"); -BOOL_PARAM_FLAG(fontconfig_refresh_cache, false, - "Does a one-time deletion of cache files from the " - "fontconfig_tmpdir before initializing fontconfig."); -BOOL_PARAM_FLAG(fontconfig_refresh_config_file, true, - "Does a one-time reset of the fontconfig config file to point" - " to fonts_dir before initializing fontconfig. Set to true" - " if fontconfig_refresh_cache is true. 
Set it to false to use" - " multiple instances in separate processes without having to" - " rescan the fonts_dir, using a previously setup font cache"); #ifndef USE_STD_NAMESPACE #include "ocr/trainingdata/typesetting/legacy_fonts.h" @@ -91,14 +82,16 @@ namespace tesseract { // in pixels. const int kDefaultResolution = 300; -bool PangoFontInfo::fontconfig_initialized_ = false; +string PangoFontInfo::fonts_dir_; +string PangoFontInfo::cache_dir_; -PangoFontInfo::PangoFontInfo() : desc_(NULL), resolution_(kDefaultResolution) { +PangoFontInfo::PangoFontInfo() + : desc_(nullptr), resolution_(kDefaultResolution) { Clear(); } PangoFontInfo::PangoFontInfo(const string& desc) - : desc_(NULL), resolution_(kDefaultResolution) { + : desc_(nullptr), resolution_(kDefaultResolution) { if (!ParseFontDescriptionName(desc)) { tprintf("ERROR: Could not parse %s\n", desc.c_str()); Clear(); @@ -115,10 +108,12 @@ void PangoFontInfo::Clear() { font_type_ = UNKNOWN; if (desc_) { pango_font_description_free(desc_); - desc_ = NULL; + desc_ = nullptr; } } +PangoFontInfo::~PangoFontInfo() { pango_font_description_free(desc_); } + string PangoFontInfo::DescriptionName() const { if (!desc_) return ""; char* desc_str = pango_font_description_to_string(desc_); @@ -127,59 +122,63 @@ string PangoFontInfo::DescriptionName() const { return desc_name; } -// Initializes Fontconfig for use by writing a fake fonts.conf file into the -// FLAGS_fontconfigs_tmpdir directory, that points to the supplied -// fonts_dir, and then overrides the FONTCONFIG_PATH environment variable -// to point to this fonts.conf file. If force_clear, the cache is refreshed -// even if it has already been initialized. 
-void PangoFontInfo::InitFontConfig(bool force_clear, const string& fonts_dir) { - if ((fontconfig_initialized_ && !force_clear) || fonts_dir.empty()) { - fontconfig_initialized_ = true; - return; +// If not already initialized, initializes FontConfig by setting its +// environment variable and creating a fonts.conf file that points to the +// FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir. +/* static */ +void PangoFontInfo::SoftInitFontConfig() { + if (fonts_dir_.empty()) { + HardInitFontConfig(FLAGS_fonts_dir.c_str(), + FLAGS_fontconfig_tmpdir.c_str()); } - if (FLAGS_fontconfig_refresh_cache || force_clear) { - File::DeleteMatchingFiles(File::JoinPath( - FLAGS_fontconfig_tmpdir.c_str(), "*cache-?").c_str()); - } - if (FLAGS_fontconfig_refresh_config_file || FLAGS_fontconfig_refresh_cache || - force_clear) { - const int MAX_FONTCONF_FILESIZE = 1024; - char fonts_conf_template[MAX_FONTCONF_FILESIZE]; - snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE, - "\n" - "\n" - "\n" - "%s\n" - "%s\n" - "\n" - "", fonts_dir.c_str(), - FLAGS_fontconfig_tmpdir.c_str()); - string fonts_conf_file = File::JoinPath(FLAGS_fontconfig_tmpdir.c_str(), - "fonts.conf"); - File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file); +} + +// Re-initializes font config, whether or not already initialized. +// If already initialized, any existing cache is deleted, just to be sure. 
+/* static */ +void PangoFontInfo::HardInitFontConfig(const string& fonts_dir, + const string& cache_dir) { + if (!cache_dir_.empty()) { + File::DeleteMatchingFiles( + File::JoinPath(cache_dir_.c_str(), "*cache-?").c_str()); } + const int MAX_FONTCONF_FILESIZE = 1024; + char fonts_conf_template[MAX_FONTCONF_FILESIZE]; + cache_dir_ = cache_dir; + fonts_dir_ = fonts_dir; + snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE, + "\n" + "\n" + "\n" + "%s\n" + "%s\n" + "\n" + "", + fonts_dir.c_str(), cache_dir_.c_str()); + string fonts_conf_file = File::JoinPath(cache_dir_.c_str(), "fonts.conf"); + File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file); #ifdef _WIN32 std::string env("FONTCONFIG_PATH="); - env.append(FLAGS_fontconfig_tmpdir.c_str()); + env.append(cache_dir_.c_str()); putenv(env.c_str()); putenv("LANG=en_US.utf8"); #else - setenv("FONTCONFIG_PATH", FLAGS_fontconfig_tmpdir.c_str(), true); + setenv("FONTCONFIG_PATH", cache_dir_.c_str(), true); // Fix the locale so that the reported font names are consistent. setenv("LANG", "en_US.utf8", true); #endif // _WIN32 - if (!fontconfig_initialized_ || force_clear) { - if (FcInitReinitialize() != FcTrue) { - tprintf("FcInitiReinitialize failed!!\n"); - } + + if (FcInitReinitialize() != FcTrue) { + tprintf("FcInitiReinitialize failed!!\n"); } - fontconfig_initialized_ = true; FontUtils::ReInit(); + // Clear Pango's font cache too. + pango_cairo_font_map_set_default(nullptr); } static void ListFontFamilies(PangoFontFamily*** families, int* n_families) { - PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir.c_str()); + PangoFontInfo::SoftInitFontConfig(); PangoFontMap* font_map = pango_cairo_font_map_get_default(); DISABLE_HEAP_LEAK_CHECK; pango_font_map_list_families(font_map, families, n_families); @@ -238,7 +237,7 @@ bool PangoFontInfo::ParseFontDescription(const PangoFontDescription *desc) { // We don't have a way to detect whether a font is of type Fraktur. 
The fonts // we currently use all have "Fraktur" in their family name, so we do a // fragile but functional check for that here. - is_fraktur_ = (strcasestr(family, "Fraktur") != NULL); + is_fraktur_ = (strcasestr(family, "Fraktur") != nullptr); return true; } @@ -253,12 +252,12 @@ bool PangoFontInfo::ParseFontDescriptionName(const string& name) { // in the font map. Note that if the font is wholly missing, this could // correspond to a completely different font family and face. PangoFont* PangoFontInfo::ToPangoFont() const { - InitFontConfig(false, FLAGS_fonts_dir.c_str()); + SoftInitFontConfig(); PangoFontMap* font_map = pango_cairo_font_map_get_default(); PangoContext* context = pango_context_new(); pango_cairo_context_set_resolution(context, resolution_); pango_context_set_font_map(context, font_map); - PangoFont* font = NULL; + PangoFont* font = nullptr; { DISABLE_HEAP_LEAK_CHECK; font = pango_font_map_load_font(font_map, context, desc_); @@ -269,7 +268,7 @@ PangoFont* PangoFontInfo::ToPangoFont() const { bool PangoFontInfo::CoversUTF8Text(const char* utf8_text, int byte_length) const { PangoFont* font = ToPangoFont(); - PangoCoverage* coverage = pango_font_get_coverage(font, NULL); + PangoCoverage* coverage = pango_font_get_coverage(font, nullptr); for (UNICHAR::const_iterator it = UNICHAR::begin(utf8_text, byte_length); it != UNICHAR::end(utf8_text, byte_length); ++it) { @@ -310,7 +309,7 @@ static char* my_strnmove(char* dest, const char* src, size_t n) { int PangoFontInfo::DropUncoveredChars(string* utf8_text) const { PangoFont* font = ToPangoFont(); - PangoCoverage* coverage = pango_font_get_coverage(font, NULL); + PangoCoverage* coverage = pango_font_get_coverage(font, nullptr); int num_dropped_chars = 0; // Maintain two iterators that point into the string. 
For space efficiency, we // will repeatedly copy one covered UTF8 character from one to the other, and @@ -373,8 +372,8 @@ bool PangoFontInfo::GetSpacingProperties(const string& utf8_char, // Find the ink glyph extents for the glyph PangoRectangle ink_rect, logical_rect; pango_font_get_glyph_extents(font, glyph_index, &ink_rect, &logical_rect); - pango_extents_to_pixels(&ink_rect, NULL); - pango_extents_to_pixels(&logical_rect, NULL); + pango_extents_to_pixels(&ink_rect, nullptr); + pango_extents_to_pixels(&logical_rect, nullptr); int bearing = total_advance + PANGO_LBEARING(ink_rect); if (it == it_begin || bearing < min_bearing) { @@ -388,12 +387,12 @@ bool PangoFontInfo::GetSpacingProperties(const string& utf8_char, } bool PangoFontInfo::CanRenderString(const char* utf8_word, int len) const { - vector graphemes; + std::vector graphemes; return CanRenderString(utf8_word, len, &graphemes); } bool PangoFontInfo::CanRenderString(const char* utf8_word, int len, - vector* graphemes) const { + std::vector* graphemes) const { if (graphemes) graphemes->clear(); // We check for font coverage of the text first, as otherwise Pango could // (undesirably) fall back to another font that does have the required @@ -423,7 +422,7 @@ bool PangoFontInfo::CanRenderString(const char* utf8_word, int len, pango_font_description_free(desc); } pango_layout_set_text(layout, utf8_word, len); - PangoLayoutIter* run_iter = NULL; + PangoLayoutIter* run_iter = nullptr; { // Fontconfig caches some information here that is not freed before exit. 
DISABLE_HEAP_LEAK_CHECK; run_iter = pango_layout_get_iter(layout); @@ -431,16 +430,21 @@ bool PangoFontInfo::CanRenderString(const char* utf8_word, int len, do { PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter); if (!run) { - tlog(2, "Found end of line NULL run marker\n"); + tlog(2, "Found end of line nullptr run marker\n"); continue; } PangoGlyph dotted_circle_glyph; PangoFont* font = run->item->analysis.font; - PangoGlyphString * glyphs = pango_glyph_string_new(); +#ifdef _WIN32 // Fixme! Leaks memory and breaks unittests. + PangoGlyphString* glyphs = pango_glyph_string_new(); char s[] = "\xc2\xa7"; pango_shape(s, sizeof(s), &(run->item->analysis), glyphs); dotted_circle_glyph = glyphs->glyphs[0].glyph; +#else + dotted_circle_glyph = pango_fc_font_get_glyph( + reinterpret_cast(font), kDottedCircleGlyph); +#endif if (TLOG_IS_ON(2)) { PangoFontDescription* desc = pango_font_describe(font); @@ -504,7 +508,7 @@ bool PangoFontInfo::CanRenderString(const char* utf8_word, int len, // ------------------------ FontUtils ------------------------------------ -vector FontUtils::available_fonts_; // cache list +std::vector FontUtils::available_fonts_; // cache list // Returns whether the specified font description is available in the fonts // directory. @@ -519,22 +523,21 @@ vector FontUtils::available_fonts_; // cache list bool FontUtils::IsAvailableFont(const char* input_query_desc, string* best_match) { string query_desc(input_query_desc); - if (PANGO_VERSION <= 12005) { - // Strip commas and any ' Medium' substring in the name. - query_desc.erase(std::remove(query_desc.begin(), query_desc.end(), ','), - query_desc.end()); - const string kMediumStr = " Medium"; - std::size_t found = query_desc.find(kMediumStr); - if (found != std::string::npos) { - query_desc.erase(found, kMediumStr.length()); - } +#if (PANGO_VERSION <= 12005) + // Strip commas and any ' Medium' substring in the name. 
+ query_desc.erase(std::remove(query_desc.begin(), query_desc.end(), ','), + query_desc.end()); + const string kMediumStr = " Medium"; + std::size_t found = query_desc.find(kMediumStr); + if (found != std::string::npos) { + query_desc.erase(found, kMediumStr.length()); } - +#endif PangoFontDescription *desc = pango_font_description_from_string( query_desc.c_str()); - PangoFont* selected_font = NULL; + PangoFont* selected_font = nullptr; { - PangoFontInfo::InitFontConfig(false, FLAGS_fonts_dir.c_str()); + PangoFontInfo::SoftInitFontConfig(); PangoFontMap* font_map = pango_cairo_font_map_get_default(); PangoContext* context = pango_context_new(); pango_context_set_font_map(context, font_map); @@ -544,7 +547,7 @@ bool FontUtils::IsAvailableFont(const char* input_query_desc, } g_object_unref(context); } - if (selected_font == NULL) { + if (selected_font == nullptr) { pango_font_description_free(desc); return false; } @@ -558,7 +561,7 @@ bool FontUtils::IsAvailableFont(const char* input_query_desc, char* selected_desc_str = pango_font_description_to_string(selected_desc); tlog(2, "query_desc: '%s' Selected: '%s'\n", query_desc.c_str(), selected_desc_str); - if (!equal && best_match != NULL) { + if (!equal && best_match != nullptr) { *best_match = selected_desc_str; // Clip the ending ' 0' if there is one. It seems that, if there is no // point size on the end of the fontname, then Pango always appends ' 0'. 
@@ -576,10 +579,10 @@ bool FontUtils::IsAvailableFont(const char* input_query_desc, } static bool ShouldIgnoreFontFamilyName(const char* query) { - static const char* kIgnoredFamilyNames[] - = { "Sans", "Serif", "Monospace", NULL }; + static const char* kIgnoredFamilyNames[] = {"Sans", "Serif", "Monospace", + nullptr}; const char** list = kIgnoredFamilyNames; - for (; *list != NULL; ++list) { + for (; *list != nullptr; ++list) { if (!strcmp(*list, query)) return true; } @@ -588,8 +591,8 @@ static bool ShouldIgnoreFontFamilyName(const char* query) { // Outputs description names of available fonts. /* static */ -const vector& FontUtils::ListAvailableFonts() { - if (available_fonts_.size()) { +const std::vector& FontUtils::ListAvailableFonts() { + if (!available_fonts_.empty()) { return available_fonts_; } #ifndef USE_STD_NAMESPACE @@ -598,7 +601,7 @@ const vector& FontUtils::ListAvailableFonts() { tprintf("Using list of legacy fonts only\n"); const int kNumFontLists = 4; for (int i = 0; i < kNumFontLists; ++i) { - for (int j = 0; kFontlists[i][j] != NULL; ++j) { + for (int j = 0; kFontlists[i][j] != nullptr; ++j) { available_fonts_.push_back(kFontlists[i][j]); } } @@ -617,7 +620,7 @@ const vector& FontUtils::ListAvailableFonts() { } int n_faces; - PangoFontFace** faces = NULL; + PangoFontFace** faces = nullptr; pango_font_family_list_faces(families[i], &faces, &n_faces); for (int j = 0; j < n_faces; ++j) { PangoFontDescription* desc = pango_font_face_describe(faces[j]); @@ -631,13 +634,13 @@ const vector& FontUtils::ListAvailableFonts() { g_free(faces); } g_free(families); - sort(available_fonts_.begin(), available_fonts_.end()); + std::sort(available_fonts_.begin(), available_fonts_.end()); return available_fonts_; } static void CharCoverageMapToBitmap(PangoCoverage* coverage, - vector* unichar_bitmap) { + std::vector* unichar_bitmap) { const int kMinUnicodeValue = 33; const int kMaxUnicodeValue = 0x10FFFF; unichar_bitmap->resize(kMaxUnicodeValue + 1, false); @@ 
-651,30 +654,30 @@ static void CharCoverageMapToBitmap(PangoCoverage* coverage, } /* static */ -void FontUtils::GetAllRenderableCharacters(vector* unichar_bitmap) { - const vector& all_fonts = ListAvailableFonts(); +void FontUtils::GetAllRenderableCharacters(std::vector* unichar_bitmap) { + const std::vector& all_fonts = ListAvailableFonts(); return GetAllRenderableCharacters(all_fonts, unichar_bitmap); } /* static */ void FontUtils::GetAllRenderableCharacters(const string& font_name, - vector* unichar_bitmap) { + std::vector* unichar_bitmap) { PangoFontInfo font_info(font_name); - PangoCoverage* coverage = pango_font_get_coverage( - font_info.ToPangoFont(), NULL); + PangoCoverage* coverage = + pango_font_get_coverage(font_info.ToPangoFont(), nullptr); CharCoverageMapToBitmap(coverage, unichar_bitmap); } /* static */ -void FontUtils::GetAllRenderableCharacters(const vector& fonts, - vector* unichar_bitmap) { +void FontUtils::GetAllRenderableCharacters(const std::vector& fonts, + std::vector* unichar_bitmap) { // Form the union of coverage maps from the fonts PangoCoverage* all_coverage = pango_coverage_new(); tlog(1, "Processing %d fonts\n", fonts.size()); for (int i = 0; i < fonts.size(); ++i) { PangoFontInfo font_info(fonts[i]); - PangoCoverage* coverage = pango_font_get_coverage( - font_info.ToPangoFont(), NULL); + PangoCoverage* coverage = + pango_font_get_coverage(font_info.ToPangoFont(), nullptr); // Mark off characters that any font can render. 
pango_coverage_max(all_coverage, coverage); } @@ -686,16 +689,15 @@ void FontUtils::GetAllRenderableCharacters(const vector& fonts, // Utilities written to be backward compatible with StringRender /* static */ -int FontUtils::FontScore(const unordered_map& ch_map, - const string& fontname, - int* raw_score, - vector* ch_flags) { +int FontUtils::FontScore(const std::unordered_map& ch_map, + const string& fontname, int* raw_score, + std::vector* ch_flags) { PangoFontInfo font_info; if (!font_info.ParseFontDescriptionName(fontname)) { tprintf("ERROR: Could not parse %s\n", fontname.c_str()); } PangoFont* font = font_info.ToPangoFont(); - PangoCoverage* coverage = pango_font_get_coverage(font, NULL); + PangoCoverage* coverage = pango_font_get_coverage(font, nullptr); if (ch_flags) { ch_flags->clear(); @@ -703,7 +705,7 @@ int FontUtils::FontScore(const unordered_map& ch_map, } *raw_score = 0; int ok_chars = 0; - for (unordered_map::const_iterator it = ch_map.begin(); + for (std::unordered_map::const_iterator it = ch_map.begin(); it != ch_map.end(); ++it) { bool covered = (IsWhitespace(it->first) || (pango_coverage_get(coverage, it->first) @@ -721,22 +723,23 @@ int FontUtils::FontScore(const unordered_map& ch_map, /* static */ -string FontUtils::BestFonts(const unordered_map& ch_map, - vector > >* fonts) { +string FontUtils::BestFonts( + const std::unordered_map& ch_map, + std::vector > >* fonts) { const double kMinOKFraction = 0.99; // Weighted fraction of characters that must be renderable in a font to make // it OK even if the raw count is not good. 
const double kMinWeightedFraction = 0.99995; fonts->clear(); - vector > font_flags; - vector font_scores; - vector raw_scores; + std::vector > font_flags; + std::vector font_scores; + std::vector raw_scores; int most_ok_chars = 0; int best_raw_score = 0; - const vector& font_names = FontUtils::ListAvailableFonts(); + const std::vector& font_names = FontUtils::ListAvailableFonts(); for (int i = 0; i < font_names.size(); ++i) { - vector ch_flags; + std::vector ch_flags; int raw_score = 0; int ok_chars = FontScore(ch_map, font_names[i], &raw_score, &ch_flags); most_ok_chars = MAX(ok_chars, most_ok_chars); @@ -765,7 +768,7 @@ string FontUtils::BestFonts(const unordered_map& ch_map, int raw_score = raw_scores[i]; if ((score >= least_good_enough && raw_score >= least_raw_enough) || score >= override_enough) { - fonts->push_back(make_pair(font_names[i].c_str(), font_flags[i])); + fonts->push_back(std::make_pair(font_names[i].c_str(), font_flags[i])); tlog(1, "OK font %s = %.4f%%, raw = %d = %.2f%%\n", font_names[i].c_str(), 100.0 * score / most_ok_chars, @@ -784,20 +787,20 @@ string FontUtils::BestFonts(const unordered_map& ch_map, /* static */ bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len, - string* font_name, vector* graphemes) { + string* font_name, std::vector* graphemes) { return SelectFont(utf8_word, utf8_len, ListAvailableFonts(), font_name, graphemes); } /* static */ bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len, - const vector& all_fonts, - string* font_name, vector* graphemes) { + const std::vector& all_fonts, + string* font_name, std::vector* graphemes) { if (font_name) font_name->clear(); if (graphemes) graphemes->clear(); for (int i = 0; i < all_fonts.size(); ++i) { PangoFontInfo font; - vector found_graphemes; + std::vector found_graphemes; ASSERT_HOST_MSG(font.ParseFontDescriptionName(all_fonts[i]), "Could not parse font desc name %s\n", all_fonts[i].c_str()); diff --git a/training/pango_font_info.h 
b/training/pango_font_info.h index 421139a1..af6ee985 100644 --- a/training/pango_font_info.h +++ b/training/pango_font_info.h @@ -21,13 +21,19 @@ #define TESSERACT_TRAINING_PANGO_FONT_INFO_H_ #include +#include #include #include -#include "hashfn.h" +#include "commandlineflags.h" #include "host.h" -#include "util.h" #include "pango/pango-font.h" +#include "pango/pango.h" +#include "pango/pangocairo.h" +#include "util.h" + +DECLARE_STRING_PARAM_FLAG(fonts_dir); +DECLARE_STRING_PARAM_FLAG(fontconfig_tmpdir); typedef signed int char32; @@ -44,6 +50,7 @@ class PangoFontInfo { DECORATIVE, }; PangoFontInfo(); + ~PangoFontInfo(); // Initialize from parsing a font description name, defined as a string of the // format: // "FamilyName [FaceName] [PointSize]" @@ -73,7 +80,7 @@ class PangoFontInfo { // If true, returns individual graphemes. Any whitespace characters in the // original string are also included in the list. bool CanRenderString(const char* utf8_word, int len, - vector* graphemes) const; + std::vector* graphemes) const; bool CanRenderString(const char* utf8_word, int len) const; // Retrieves the x_bearing and x_advance for the given utf8 character in the @@ -83,25 +90,29 @@ class PangoFontInfo { bool GetSpacingProperties(const string& utf8_char, int* x_bearing, int* x_advance) const; - // Initializes FontConfig by setting its environment variable and creating - // a fonts.conf file that points to the given fonts_dir. Once initialized, - // it is not re-initialized unless force_clear is true. - static void InitFontConfig(bool force_clear, const string& fonts_dir); + // If not already initialized, initializes FontConfig by setting its + // environment variable and creating a fonts.conf file that points to the + // FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir. + static void SoftInitFontConfig(); + // Re-initializes font config, whether or not already initialized. + // If already initialized, any existing cache is deleted, just to be sure. 
+ static void HardInitFontConfig(const string& fonts_dir, + const string& cache_dir); // Accessors string DescriptionName() const; // Font Family name eg. "Arial" const string& family_name() const { return family_name_; } // Size in points (1/72"), rounded to the nearest integer. - int font_size() const { return font_size_; } - bool is_bold() const { return is_bold_; } - bool is_italic() const { return is_italic_; } - bool is_smallcaps() const { return is_smallcaps_; } - bool is_monospace() const { return is_monospace_; } - bool is_fraktur() const { return is_fraktur_; } + int font_size() const { return font_size_; } + bool is_bold() const { return is_bold_; } + bool is_italic() const { return is_italic_; } + bool is_smallcaps() const { return is_smallcaps_; } + bool is_monospace() const { return is_monospace_; } + bool is_fraktur() const { return is_fraktur_; } FontTypeEnum font_type() const { return font_type_; } - int resolution() const { return resolution_; } + int resolution() const { return resolution_; } void set_resolution(const int resolution) { resolution_ = resolution; } @@ -130,8 +141,14 @@ class PangoFontInfo { int resolution_; // Fontconfig operates through an environment variable, so it intrinsically // cannot be thread-friendly, but you can serialize multiple independent - // font configurations by calling InitFontConfig(true, path). - static bool fontconfig_initialized_; + // font configurations by calling HardInitFontConfig(fonts_dir, cache_dir). + // These hold the last initialized values set by HardInitFontConfig or + // the first call to SoftInitFontConfig. + // Directory to be scanned for font files. + static string fonts_dir_; + // Directory to store the cache of font information. 
(Can be the same as + // fonts_dir_) + static string cache_dir_; private: PangoFontInfo(const PangoFontInfo&); @@ -145,36 +162,36 @@ class FontUtils { // Returns true if the font of the given description name is available in the // target directory specified by --fonts_dir static bool IsAvailableFont(const char* font_desc) { - return IsAvailableFont(font_desc, NULL); + return IsAvailableFont(font_desc, nullptr); } // Returns true if the font of the given description name is available in the // target directory specified by --fonts_dir. If false is returned, and - // best_match is not NULL, the closest matching font is returned there. + // best_match is not nullptr, the closest matching font is returned there. static bool IsAvailableFont(const char* font_desc, string* best_match); // Outputs description names of available fonts. - static const vector& ListAvailableFonts(); + static const std::vector& ListAvailableFonts(); // Picks font among available fonts that covers and can render the given word, // and returns the font description name and the decomposition of the word to // graphemes. Returns false if no suitable font was found. static bool SelectFont(const char* utf8_word, const int utf8_len, - string* font_name, vector* graphemes); + string* font_name, std::vector* graphemes); // Picks font among all_fonts that covers and can render the given word, // and returns the font description name and the decomposition of the word to // graphemes. Returns false if no suitable font was found. static bool SelectFont(const char* utf8_word, const int utf8_len, - const vector& all_fonts, - string* font_name, vector* graphemes); + const std::vector& all_fonts, + string* font_name, std::vector* graphemes); // Returns a bitmask where the value of true at index 'n' implies that unicode // value 'n' is renderable by at least one available font. 
- static void GetAllRenderableCharacters(vector* unichar_bitmap); + static void GetAllRenderableCharacters(std::vector* unichar_bitmap); // Variant of the above function that inspects only the provided font names. - static void GetAllRenderableCharacters(const vector& font_names, - vector* unichar_bitmap); + static void GetAllRenderableCharacters(const std::vector& font_names, + std::vector* unichar_bitmap); static void GetAllRenderableCharacters(const string& font_name, - vector* unichar_bitmap); + std::vector* unichar_bitmap); // NOTE: The following utilities were written to be backward compatible with // StringRender. @@ -185,23 +202,24 @@ class FontUtils { // In the flags vector, each flag is set according to whether the // corresponding character (in order of iterating ch_map) can be rendered. // The return string is a list of the acceptable fonts that were used. - static string BestFonts(const unordered_map& ch_map, - vector > >* font_flag); + static string BestFonts( + const std::unordered_map& ch_map, + std::vector > >* font_flag); // FontScore returns the weighted renderability score of the given // hash map character table in the given font. The unweighted score // is also returned in raw_score. // The values in the bool vector ch_flags correspond to whether the // corresponding character (in order of iterating ch_map) can be rendered. - static int FontScore(const unordered_map& ch_map, + static int FontScore(const std::unordered_map& ch_map, const string& fontname, int* raw_score, - vector* ch_flags); + std::vector* ch_flags); // PangoFontInfo is reinitialized, so clear the static list of fonts. 
static void ReInit(); private: - static vector available_fonts_; // cache list + static std::vector available_fonts_; // cache list }; } // namespace tesseract diff --git a/training/set_unicharset_properties.cpp b/training/set_unicharset_properties.cpp index 00844ecb..691c6dcf 100644 --- a/training/set_unicharset_properties.cpp +++ b/training/set_unicharset_properties.cpp @@ -1,3 +1,13 @@ +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + // This program reads a unicharset file, puts the result in a UNICHARSET // object, fills it with properties about the unichars it contains and writes // the result back to a file. 
diff --git a/training/shapeclustering.cpp b/training/shapeclustering.cpp index bf34b88c..25c4863a 100644 --- a/training/shapeclustering.cpp +++ b/training/shapeclustering.cpp @@ -48,8 +48,8 @@ int main(int argc, char **argv) { ParseArguments(&argc, &argv); STRING file_prefix; - tesseract::MasterTrainer* trainer = tesseract::LoadTrainingData( - argc, argv, false, NULL, &file_prefix); + tesseract::MasterTrainer* trainer = + tesseract::LoadTrainingData(argc, argv, false, nullptr, &file_prefix); if (!trainer) return 1; diff --git a/training/stringrenderer.cpp b/training/stringrenderer.cpp index fbff55fb..fab7b88b 100644 --- a/training/stringrenderer.cpp +++ b/training/stringrenderer.cpp @@ -52,7 +52,7 @@ static const int kDefaultOutputResolution = 300; // Word joiner (U+2060) inserted after letters in ngram mode, as per // recommendation in http://unicode.org/reports/tr14/ to avoid line-breaks at // hyphens and other non-alpha characters. -static const char* kWordJoinerUTF8 = "\xE2\x81\xA0"; //u8"\u2060"; +static const char* kWordJoinerUTF8 = "\xE2\x81\xA0"; // u8"\u2060"; static const char32 kWordJoiner = 0x2060; static bool IsCombiner(int ch) { @@ -79,7 +79,7 @@ Pix* CairoARGB32ToPixFormat(cairo_surface_t *surface) { if (cairo_image_surface_get_format(surface) != CAIRO_FORMAT_ARGB32) { printf("Unexpected surface format %d\n", cairo_image_surface_get_format(surface)); - return NULL; + return nullptr; } const int width = cairo_image_surface_get_width(surface); const int height = cairo_image_surface_get_height(surface); @@ -108,26 +108,26 @@ StringRenderer::StringRenderer(const string& font_desc, int page_width, underline_start_prob_(0), underline_continuation_prob_(0), underline_style_(PANGO_UNDERLINE_SINGLE), + features_(nullptr), drop_uncovered_chars_(true), strip_unrenderable_words_(false), add_ligatures_(false), output_word_boxes_(false), - surface_(NULL), - cr_(NULL), - layout_(NULL), + surface_(nullptr), + cr_(nullptr), + layout_(nullptr), start_box_(0), 
page_(0), box_padding_(0), total_chars_(0), font_index_(0), - features_(NULL), last_offset_(0) { pen_color_[0] = 0.0; pen_color_[1] = 0.0; pen_color_[2] = 0.0; set_font(font_desc); set_resolution(kDefaultOutputResolution); - page_boxes_ = NULL; + page_boxes_ = nullptr; } bool StringRenderer::set_font(const string& desc) { @@ -209,8 +209,7 @@ void StringRenderer::SetLayoutProperties() { #if (PANGO_VERSION_MAJOR == 1 && PANGO_VERSION_MINOR >= 38) if (add_ligatures_) { set_features("liga, clig, dlig, hlig"); - PangoAttribute* feature_attr = - pango_attr_font_features_new(features_); + PangoAttribute* feature_attr = pango_attr_font_features_new(features_); pango_attr_list_change(attr_list, feature_attr); } #endif @@ -225,15 +224,15 @@ void StringRenderer::SetLayoutProperties() { void StringRenderer::FreePangoCairo() { if (layout_) { g_object_unref(layout_); - layout_ = NULL; + layout_ = nullptr; } if (cr_) { cairo_destroy(cr_); - cr_ = NULL; + cr_ = nullptr; } if (surface_) { cairo_surface_destroy(surface_); - surface_ = NULL; + surface_ = nullptr; } } @@ -298,7 +297,7 @@ int StringRenderer::FindFirstPageBreakOffset(const char* text, tlog(1, "len = %d buf_len = %d\n", text_length, buf_length); pango_layout_set_text(layout_, text, buf_length); - PangoLayoutIter* line_iter = NULL; + PangoLayoutIter* line_iter = nullptr; { // Fontconfig caches some info here that is not freed before exit. 
DISABLE_HEAP_LEAK_CHECK; line_iter = pango_layout_get_iter(layout_); @@ -309,8 +308,8 @@ int StringRenderer::FindFirstPageBreakOffset(const char* text, do { // Get bounding box of the current line PangoRectangle line_ink_rect; - pango_layout_iter_get_line_extents(line_iter, &line_ink_rect, NULL); - pango_extents_to_pixels(&line_ink_rect, NULL); + pango_layout_iter_get_line_extents(line_iter, &line_ink_rect, nullptr); + pango_extents_to_pixels(&line_ink_rect, nullptr); PangoLayoutLine* line = pango_layout_iter_get_line_readonly(line_iter); if (first_page) { page_top = line_ink_rect.y; @@ -327,7 +326,7 @@ int StringRenderer::FindFirstPageBreakOffset(const char* text, return offset; } -const vector& StringRenderer::GetBoxes() const { +const std::vector& StringRenderer::GetBoxes() const { return boxchars_; } @@ -348,20 +347,25 @@ void StringRenderer::ClearBoxes() { boxaDestroy(&page_boxes_); } +string StringRenderer::GetBoxesStr() { + BoxChar::PrepareToWrite(&boxchars_); + return BoxChar::GetTesseractBoxStr(page_height_, boxchars_); +} + void StringRenderer::WriteAllBoxes(const string& filename) { BoxChar::PrepareToWrite(&boxchars_); BoxChar::WriteTesseractBoxFile(filename, page_height_, boxchars_); } // Returns cluster strings in logical order. -bool StringRenderer::GetClusterStrings(vector* cluster_text) { - map start_byte_to_text; +bool StringRenderer::GetClusterStrings(std::vector* cluster_text) { + std::map start_byte_to_text; PangoLayoutIter* run_iter = pango_layout_get_iter(layout_); const char* full_text = pango_layout_get_text(layout_); do { PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter); if (!run) { - // End of line NULL run marker + // End of line nullptr run marker tlog(2, "Found end of line marker\n"); continue; } @@ -384,7 +388,7 @@ bool StringRenderer::GetClusterStrings(vector* cluster_text) { if (add_ligatures_) { // Make sure the output box files have ligatured text in case the font // decided to use an unmapped glyph. 
- text = LigatureTable::Get()->AddLigatures(text, NULL); + text = LigatureTable::Get()->AddLigatures(text, nullptr); } start_byte_to_text[start_byte_index] = text; } @@ -392,11 +396,11 @@ bool StringRenderer::GetClusterStrings(vector* cluster_text) { pango_layout_iter_free(run_iter); cluster_text->clear(); - for (map::const_iterator it = start_byte_to_text.begin(); + for (std::map::const_iterator it = start_byte_to_text.begin(); it != start_byte_to_text.end(); ++it) { cluster_text->push_back(it->second); } - return cluster_text->size(); + return !cluster_text->empty(); } // Merges an array of BoxChars into words based on the identification of @@ -409,14 +413,13 @@ bool StringRenderer::GetClusterStrings(vector* cluster_text) { // hyphens. When this is detected the word is split at that location into // multiple BoxChars. Otherwise, each resulting BoxChar will contain a word and // its bounding box. -static void MergeBoxCharsToWords(vector* boxchars) { - vector result; +static void MergeBoxCharsToWords(std::vector* boxchars) { + std::vector result; bool started_word = false; for (int i = 0; i < boxchars->size(); ++i) { - if (boxchars->at(i)->ch() == " " || - boxchars->at(i)->box() == NULL) { + if (boxchars->at(i)->ch() == " " || boxchars->at(i)->box() == nullptr) { result.push_back(boxchars->at(i)); - boxchars->at(i) = NULL; + boxchars->at(i) = nullptr; started_word = false; continue; } @@ -425,7 +428,7 @@ static void MergeBoxCharsToWords(vector* boxchars) { // Begin new word started_word = true; result.push_back(boxchars->at(i)); - boxchars->at(i) = NULL; + boxchars->at(i) = nullptr; } else { BoxChar* last_boxchar = result.back(); // Compute bounding box union @@ -444,7 +447,7 @@ static void MergeBoxCharsToWords(vector* boxchars) { // boxchar. 
result.push_back(new BoxChar(" ", 1)); result.push_back(boxchars->at(i)); - boxchars->at(i) = NULL; + boxchars->at(i) = nullptr; continue; } // Append to last word @@ -454,7 +457,7 @@ static void MergeBoxCharsToWords(vector* boxchars) { last_box->y = top; last_box->h = bottom - top; delete boxchars->at(i); - boxchars->at(i) = NULL; + boxchars->at(i) = nullptr; } } boxchars->swap(result); @@ -466,7 +469,7 @@ void StringRenderer::ComputeClusterBoxes() { PangoLayoutIter* cluster_iter = pango_layout_get_iter(layout_); // Do a first pass to store cluster start indexes. - vector cluster_start_indices; + std::vector cluster_start_indices; do { cluster_start_indices.push_back(pango_layout_iter_get_index(cluster_iter)); tlog(3, "Added %d\n", cluster_start_indices.back()); @@ -475,8 +478,8 @@ void StringRenderer::ComputeClusterBoxes() { cluster_start_indices.push_back(strlen(text)); tlog(3, "Added last index %d\n", cluster_start_indices.back()); // Sort the indices and create a map from start to end indices. - sort(cluster_start_indices.begin(), cluster_start_indices.end()); - map cluster_start_to_end_index; + std::sort(cluster_start_indices.begin(), cluster_start_indices.end()); + std::map cluster_start_to_end_index; for (int i = 0; i < cluster_start_indices.size() - 1; ++i) { cluster_start_to_end_index[cluster_start_indices[i]] = cluster_start_indices[i + 1]; @@ -486,17 +489,16 @@ void StringRenderer::ComputeClusterBoxes() { // cluster extent information. 
cluster_iter = pango_layout_get_iter(layout_); // Store BoxChars* sorted by their byte start positions - map start_byte_to_box; + std::map start_byte_to_box; do { PangoRectangle cluster_rect; - pango_layout_iter_get_cluster_extents(cluster_iter, &cluster_rect, - NULL); - pango_extents_to_pixels(&cluster_rect, NULL); + pango_layout_iter_get_cluster_extents(cluster_iter, &cluster_rect, nullptr); + pango_extents_to_pixels(&cluster_rect, nullptr); const int start_byte_index = pango_layout_iter_get_index(cluster_iter); const int end_byte_index = cluster_start_to_end_index[start_byte_index]; string cluster_text = string(text + start_byte_index, end_byte_index - start_byte_index); - if (cluster_text.size() && cluster_text[0] == '\n') { + if (!cluster_text.empty() && cluster_text[0] == '\n') { tlog(2, "Skipping newlines at start of text.\n"); continue; } @@ -530,7 +532,7 @@ void StringRenderer::ComputeClusterBoxes() { if (add_ligatures_) { // Make sure the output box files have ligatured text in case the font // decided to use an unmapped glyph. - cluster_text = LigatureTable::Get()->AddLigatures(cluster_text, NULL); + cluster_text = LigatureTable::Get()->AddLigatures(cluster_text, nullptr); } BoxChar* boxchar = new BoxChar(cluster_text.c_str(), cluster_text.size()); boxchar->set_page(page_); @@ -546,21 +548,21 @@ void StringRenderer::ComputeClusterBoxes() { // accurate. // TODO(ranjith): Revisit whether this is still needed in newer versions of // pango. - vector cluster_text; + std::vector cluster_text; if (GetClusterStrings(&cluster_text)) { ASSERT_HOST(cluster_text.size() == start_byte_to_box.size()); int ind = 0; - for (map::iterator it = start_byte_to_box.begin(); + for (std::map::iterator it = start_byte_to_box.begin(); it != start_byte_to_box.end(); ++it, ++ind) { it->second->mutable_ch()->swap(cluster_text[ind]); } } // Append to the boxchars list in byte order. 
- vector page_boxchars; + std::vector page_boxchars; page_boxchars.reserve(start_byte_to_box.size()); string last_ch; - for (map::const_iterator it = start_byte_to_box.begin(); + for (std::map::const_iterator it = start_byte_to_box.begin(); it != start_byte_to_box.end(); ++it) { if (it->second->ch() == kWordJoinerUTF8) { // Skip zero-width joiner characters (ZWJs) here. @@ -572,7 +574,7 @@ void StringRenderer::ComputeClusterBoxes() { CorrectBoxPositionsToLayout(&page_boxchars); if (render_fullwidth_latin_) { - for (map::iterator it = start_byte_to_box.begin(); + for (std::map::iterator it = start_byte_to_box.begin(); it != start_byte_to_box.end(); ++it) { // Convert fullwidth Latin characters to their halfwidth forms. string half(ConvertFullwidthLatinToBasicLatin(it->second->ch())); @@ -588,23 +590,24 @@ void StringRenderer::ComputeClusterBoxes() { boxchars_.insert(boxchars_.end(), page_boxchars.begin(), page_boxchars.end()); // Compute the page bounding box - Box* page_box = NULL; - Boxa* all_boxes = NULL; + Box* page_box = nullptr; + Boxa* all_boxes = nullptr; for (int i = 0; i < page_boxchars.size(); ++i) { - if (page_boxchars[i]->box() == NULL) continue; - if (all_boxes == NULL) - all_boxes = boxaCreate(0); + if (page_boxchars[i]->box() == nullptr) continue; + if (all_boxes == nullptr) all_boxes = boxaCreate(0); boxaAddBox(all_boxes, page_boxchars[i]->mutable_box(), L_CLONE); } - boxaGetExtent(all_boxes, NULL, NULL, &page_box); - boxaDestroy(&all_boxes); - if (page_boxes_ == NULL) - page_boxes_ = boxaCreate(0); - boxaAddBox(page_boxes_, page_box, L_INSERT); + if (all_boxes != nullptr) { + boxaGetExtent(all_boxes, nullptr, nullptr, &page_box); + boxaDestroy(&all_boxes); + if (page_boxes_ == nullptr) page_boxes_ = boxaCreate(0); + boxaAddBox(page_boxes_, page_box, L_INSERT); + } } -void StringRenderer::CorrectBoxPositionsToLayout(vector* boxchars) { +void StringRenderer::CorrectBoxPositionsToLayout( + std::vector* boxchars) { if (vertical_text_) { const double 
rotation = - pango_gravity_to_rotation( pango_context_get_base_gravity(pango_layout_get_context(layout_))); @@ -645,7 +648,7 @@ int StringRenderer::StripUnrenderableWords(string* utf8_text) const { int StringRenderer::RenderToGrayscaleImage(const char* text, int text_length, Pix** pix) { - Pix *orig_pix = NULL; + Pix* orig_pix = nullptr; int offset = RenderToImage(text, text_length, &orig_pix); if (orig_pix) { *pix = pixConvertTo8(orig_pix, false); @@ -656,7 +659,7 @@ int StringRenderer::RenderToGrayscaleImage(const char* text, int text_length, int StringRenderer::RenderToBinaryImage(const char* text, int text_length, int threshold, Pix** pix) { - Pix *orig_pix = NULL; + Pix* orig_pix = nullptr; int offset = RenderToImage(text, text_length, &orig_pix); if (orig_pix) { Pix* gray_pix = pixConvertTo8(orig_pix, false); @@ -830,19 +833,20 @@ int StringRenderer::RenderToImage(const char* text, int text_length, // do { // Pix *pix; // offset += renderer.RenderAllFontsToImage(min_proportion, txt + offset, -// strlen(txt + offset), NULL, &pix); +// strlen(txt + offset), nullptr, +// &pix); // ... // } while (offset < strlen(text)); // int StringRenderer::RenderAllFontsToImage(double min_coverage, const char* text, int text_length, string* font_used, Pix** image) { - *image = NULL; + *image = nullptr; // Select a suitable font to render the title with. 
const char kTitleTemplate[] = "%s : %d hits = %.2f%%, raw = %d = %.2f%%"; string title_font; if (!FontUtils::SelectFont(kTitleTemplate, strlen(kTitleTemplate), - &title_font, NULL)) { + &title_font, nullptr)) { tprintf("WARNING: Could not find a font to render image title with!\n"); title_font = "Arial"; } @@ -861,12 +865,12 @@ int StringRenderer::RenderAllFontsToImage(double min_coverage, } tprintf("Total chars = %d\n", total_chars_); } - const vector& all_fonts = FontUtils::ListAvailableFonts(); + const std::vector& all_fonts = FontUtils::ListAvailableFonts(); for (int i = font_index_; i < all_fonts.size(); ++i) { ++font_index_; int raw_score = 0; - int ok_chars = FontUtils::FontScore(char_map_, all_fonts[i], &raw_score, - NULL); + int ok_chars = + FontUtils::FontScore(char_map_, all_fonts[i], &raw_score, nullptr); if (ok_chars > 0 && ok_chars >= total_chars_ * min_coverage) { set_font(all_fonts[i]); int offset = RenderToBinaryImage(text, text_length, 128, image); @@ -887,7 +891,7 @@ int StringRenderer::RenderAllFontsToImage(double min_coverage, // Add the font to the image. 
set_font(title_font); v_margin_ /= 8; - Pix* title_image = NULL; + Pix* title_image = nullptr; RenderToBinaryImage(title, strlen(title), 128, &title_image); pixOr(*image, *image, title_image); pixDestroy(&title_image); diff --git a/training/stringrenderer.h b/training/stringrenderer.h index 942b7fdd..3fb3504e 100644 --- a/training/stringrenderer.h +++ b/training/stringrenderer.h @@ -30,9 +30,9 @@ #define TESSERACT_TRAINING_STRINGRENDERER_H_ #include +#include #include -#include "hashfn.h" #include "host.h" #include "pango_font_info.h" #include "pango/pango-layout.h" @@ -90,7 +90,7 @@ class StringRenderer { void set_underline_style(const PangoUnderline style) { underline_style_ = style; } - void set_features(const char *features) { + void set_features(const char* features) { free(features_); features_ = strdup(features); } @@ -130,16 +130,12 @@ class StringRenderer { const PangoFontInfo& font() const { return font_; } - int h_margin() const { - return h_margin_; - } - int v_margin() const { - return v_margin_; - } + int h_margin() const { return h_margin_; } + int v_margin() const { return v_margin_; } // Get the boxchars of all clusters rendered thus far (or since the last call // to ClearBoxes()). - const vector& GetBoxes() const; + const std::vector& GetBoxes() const; // Get the rendered page bounding boxes of all pages created thus far (or // since last call to ClearBoxes()). Boxa* GetPageBoxes() const; @@ -148,6 +144,9 @@ class StringRenderer { void RotatePageBoxes(float rotation); // Delete all boxes. void ClearBoxes(); + // Returns the boxes in a boxfile string. + string GetBoxesStr(); + // Writes the boxes to a boxfile. void WriteAllBoxes(const string& filename); // Removes space-delimited words from the string that are not renderable by // the current font and returns the count of such words. @@ -172,8 +171,8 @@ class StringRenderer { void SetWordUnderlineAttributes(const string& page_text); // Compute bounding boxes around grapheme clusters. 
void ComputeClusterBoxes(); - void CorrectBoxPositionsToLayout(vector* boxchars); - bool GetClusterStrings(vector* cluster_text); + void CorrectBoxPositionsToLayout(std::vector* boxchars); + bool GetClusterStrings(std::vector* cluster_text); int FindFirstPageBreakOffset(const char* text, int text_length); PangoFontInfo font_; @@ -189,7 +188,7 @@ class StringRenderer { double underline_start_prob_; double underline_continuation_prob_; PangoUnderline underline_style_; - char *features_; + char* features_; // Text filtering options bool drop_uncovered_chars_; bool strip_unrenderable_words_; @@ -205,13 +204,13 @@ class StringRenderer { int page_; // Boxes and associated text for all pages rendered with RenderToImage() since // the last call to ClearBoxes(). - vector boxchars_; + std::vector boxchars_; int box_padding_; // Bounding boxes for pages since the last call to ClearBoxes(). Boxa* page_boxes_; // Objects cached for subsequent calls to RenderAllFontsToImage() - hash_map char_map_; // Time-saving char histogram. + std::unordered_map char_map_; // Time-saving char histogram. int total_chars_; // Number in the string to be rendered. int font_index_; // Index of next font to use in font list. 
int last_offset_; // Offset returned from last successful rendering diff --git a/training/tessopt.h b/training/tessopt.h index 5ff5e9f5..6d377e57 100644 --- a/training/tessopt.h +++ b/training/tessopt.h @@ -17,6 +17,9 @@ * **********************************************************************/ +#ifndef TESSERACT_TRAINING_TESSOPT_H_ +#define TESSERACT_TRAINING_TESSOPT_H_ + #include "host.h" extern int tessoptind; @@ -27,3 +30,5 @@ inT32 argc, //arg count char *argv[], //args const char *arglist //string of arg chars ); + +#endif // TESSERACT_TRAINING_TESSOPT_H_ diff --git a/training/tesstrain.sh b/training/tesstrain.sh index 231b5360..c55b646f 100755 --- a/training/tesstrain.sh +++ b/training/tesstrain.sh @@ -23,6 +23,7 @@ # --langdata_dir DATADIR # Path to tesseract/training/langdata directory. # --output_dir OUTPUTDIR # Location of output traineddata file. # --overwrite # Safe to overwrite files in output_dir. +# --linedata_only # Only generate training data for lstmtraining. # --run_shape_clustering # Run shape clustering (use for Indic langs). # --exposures EXPOSURES # A list of exposure levels to use (e.g. "-1 0 1"). # @@ -44,7 +45,7 @@ # appropriate --fonts_dir path. 
-source `dirname $0`/tesstrain_utils.sh +source "$(dirname $0)/tesstrain_utils.sh" ARGV=("$@") parse_flags @@ -52,7 +53,7 @@ parse_flags mkdir -p ${TRAINING_DIR} tlog "\n=== Starting training for language '${LANG_CODE}'" -source `dirname $0`/language-specific.sh +source "$(dirname $0)/language-specific.sh" set_lang_specific_parameters ${LANG_CODE} initialize_fontconfig @@ -60,13 +61,18 @@ initialize_fontconfig phase_I_generate_image 8 phase_UP_generate_unicharset phase_D_generate_dawg -phase_E_extract_features "box.train" 8 -phase_C_cluster_prototypes "${TRAINING_DIR}/${LANG_CODE}.normproto" -if [[ "${ENABLE_SHAPE_CLUSTERING}" == "y" ]]; then - phase_S_cluster_shapes +if ((LINEDATA)); then + phase_E_extract_features "lstm.train" 8 "lstmf" + make__lstmdata +else + phase_E_extract_features "box.train" 8 "tr" + phase_C_cluster_prototypes "${TRAINING_DIR}/${LANG_CODE}.normproto" + if [[ "${ENABLE_SHAPE_CLUSTERING}" == "y" ]]; then + phase_S_cluster_shapes + fi + phase_M_cluster_microfeatures + phase_B_generate_ambiguities + make__traineddata fi -phase_M_cluster_microfeatures -phase_B_generate_ambiguities -make__traineddata tlog "\nCompleted training for language '${LANG_CODE}'\n" diff --git a/training/tesstrain_utils.sh b/training/tesstrain_utils.sh index c45d0037..b319bbc4 100755 --- a/training/tesstrain_utils.sh +++ b/training/tesstrain_utils.sh @@ -23,9 +23,10 @@ else fi OUTPUT_DIR="/tmp/tesstrain/tessdata" OVERWRITE=0 +LINEDATA=0 RUN_SHAPE_CLUSTERING=0 EXTRACT_FONT_PROPERTIES=1 -WORKSPACE_DIR=`mktemp -d` +WORKSPACE_DIR=$(mktemp -d) # Logging helper functions. tlog() { @@ -41,7 +42,7 @@ err_exit() { # if the program file is not found. # Usage: run_command CMD ARG1 ARG2... 
run_command() { - local cmd=`which $1` + local cmd=$(which $1) if [[ -z ${cmd} ]]; then err_exit "$1 not found" fi @@ -90,11 +91,11 @@ parse_flags() { --) break;; --fontlist) - fn=0 - FONTS="" + fn=0 + FONTS="" while test $j -lt ${#ARGV[@]}; do test -z "${ARGV[$j]}" && break - test `echo ${ARGV[$j]} | cut -c -2` = "--" && break + test $(echo ${ARGV[$j]} | cut -c -2) = "--" && break FONTS[$fn]="${ARGV[$j]}" fn=$((fn+1)) j=$((j+1)) @@ -104,7 +105,7 @@ parse_flags() { exp="" while test $j -lt ${#ARGV[@]}; do test -z "${ARGV[$j]}" && break - test `echo ${ARGV[$j]} | cut -c -2` = "--" && break + test $(echo ${ARGV[$j]} | cut -c -2) = "--" && break exp="$exp ${ARGV[$j]}" j=$((j+1)) done @@ -124,6 +125,8 @@ parse_flags() { i=$j ;; --overwrite) OVERWRITE=1 ;; + --linedata_only) + LINEDATA=1 ;; --extract_font_properties) EXTRACT_FONT_PROPERTIES=1 ;; --noextract_font_properties) @@ -199,7 +202,7 @@ generate_font_image() { local common_args="--fontconfig_tmpdir=${FONT_CONFIG_CACHE}" common_args+=" --fonts_dir=${FONTS_DIR} --strip_unrenderable_words" - common_args+=" --fontconfig_refresh_config_file=false --leading=${LEADING}" + common_args+=" --leading=${LEADING}" common_args+=" --char_spacing=${CHAR_SPACING} --exposure=${EXPOSURE}" common_args+=" --outputbase=${outbase}" @@ -216,7 +219,7 @@ generate_font_image() { --text=${TRAINING_TEXT} ${TEXT2IMAGE_EXTRA_ARGS} check_file_readable ${outbase}.box ${outbase}.tif - if (( ${EXTRACT_FONT_PROPERTIES} )) && + if ((EXTRACT_FONT_PROPERTIES)) && [[ -r ${TRAIN_NGRAMS_FILE} ]]; then tlog "Extracting font properties of ${font}" run_command text2image ${common_args} --font="${font}" \ @@ -240,7 +243,7 @@ phase_I_generate_image() { CHAR_SPACING="0.0" for EXPOSURE in $EXPOSURES; do - if (( ${EXTRACT_FONT_PROPERTIES} )) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then + if ((EXTRACT_FONT_PROPERTIES)) && [[ -r ${BIGRAM_FREQS_FILE} ]]; then # Parse .bigram_freqs file and compose a .train_ngrams file with text # for tesseract to recognize during 
training. Take only the ngrams whose # combined weight accounts for 95% of all the bigrams in the language. @@ -368,10 +371,11 @@ phase_D_generate_dawg() { phase_E_extract_features() { local box_config=$1 local par_factor=$2 + local ext=$3 if [[ -z ${par_factor} || ${par_factor} -le 0 ]]; then par_factor=1 fi - tlog "\n=== Phase E: Extracting features ===" + tlog "\n=== Phase E: Generating ${ext} files ===" local img_files="" for exposure in ${EXPOSURES}; do @@ -401,7 +405,7 @@ phase_E_extract_features() { export TESSDATA_PREFIX=${OLD_TESSDATA_PREFIX} # Check that all the output files were produced. for img_file in ${img_files}; do - check_file_readable ${img_file%.*}.tr + check_file_readable "${img_file%.*}.${ext}" done } @@ -420,7 +424,7 @@ phase_C_cluster_prototypes() { # Phase S : (S)hape clustering phase_S_cluster_shapes() { - if (( ! ${RUN_SHAPE_CLUSTERING} )); then + if ((! RUN_SHAPE_CLUSTERING)); then tlog "\n=== Shape Clustering disabled ===" return fi @@ -484,6 +488,39 @@ phase_B_generate_ambiguities() { # TODO: Add support for generating ambiguities automatically. } +make__lstmdata() { + tlog "\n=== Constructing LSTM training data ===" + local lang_prefix=${LANGDATA_ROOT}/${LANG_CODE}/${LANG_CODE} + if [[ ! -d ${OUTPUT_DIR} ]]; then + tlog "Creating new directory ${OUTPUT_DIR}" + mkdir -p ${OUTPUT_DIR} + fi + + # Copy available files for this language from the langdata dir. 
+ if [[ -r ${lang_prefix}.config ]]; then + tlog "Copying ${lang_prefix}.config to ${OUTPUT_DIR}" + cp ${lang_prefix}.config ${OUTPUT_DIR} + chmod u+w ${OUTPUT_DIR}/${LANG_CODE}.config + fi + if [[ -r "${TRAINING_DIR}/${LANG_CODE}.unicharset" ]]; then + tlog "Moving ${TRAINING_DIR}/${LANG_CODE}.unicharset to ${OUTPUT_DIR}" + mv "${TRAINING_DIR}/${LANG_CODE}.unicharset" "${OUTPUT_DIR}" + fi + for ext in number-dawg punc-dawg word-dawg; do + local src="${TRAINING_DIR}/${LANG_CODE}.${ext}" + if [[ -r "${src}" ]]; then + dest="${OUTPUT_DIR}/${LANG_CODE}.lstm-${ext}" + tlog "Moving ${src} to ${dest}" + mv "${src}" "${dest}" + fi + done + for f in "${TRAINING_DIR}/${LANG_CODE}".*.lstmf; do + tlog "Moving ${f} to ${OUTPUT_DIR}" + mv "${f}" "${OUTPUT_DIR}" + done + local lstm_list="${OUTPUT_DIR}/${LANG_CODE}.training_files.txt" + ls -1 "${OUTPUT_DIR}"/*.lstmf > "${lstm_list}" +} make__traineddata() { tlog "\n=== Making final traineddata file ===" @@ -520,7 +557,7 @@ make__traineddata() { mkdir -p ${OUTPUT_DIR} fi local destfile=${OUTPUT_DIR}/${LANG_CODE}.traineddata; - if [[ -f ${destfile} ]] && (( ! ${OVERWRITE} )); then + if [[ -f ${destfile} ]] && ((! OVERWRITE)); then err_exit "File ${destfile} exists and no --overwrite specified"; fi tlog "Moving ${TRAINING_DIR}/${LANG_CODE}.traineddata to ${OUTPUT_DIR}" diff --git a/training/text2image.cpp b/training/text2image.cpp index 406669dc..13f513f3 100644 --- a/training/text2image.cpp +++ b/training/text2image.cpp @@ -179,11 +179,11 @@ struct SpacingProperties { // used by the FreeType font engine. 
int x_gap_before; // horizontal x bearing int x_gap_after; // horizontal advance - x_gap_before - width - map kerned_x_gaps; + std::map kerned_x_gaps; }; static bool IsWhitespaceBox(const BoxChar* boxchar) { - return (boxchar->box() == NULL || + return (boxchar->box() == nullptr || SpanUTF8Whitespace(boxchar->ch().c_str())); } @@ -215,16 +215,17 @@ static string StringReplace(const string& in, void ExtractFontProperties(const string &utf8_text, StringRenderer *render, const string &output_base) { - map spacing_map; - map::iterator spacing_map_it0; - map::iterator spacing_map_it1; + std::map spacing_map; + std::map::iterator spacing_map_it0; + std::map::iterator spacing_map_it1; int x_bearing, x_advance; int len = utf8_text.length(); int offset = 0; const char* text = utf8_text.c_str(); while (offset < len) { - offset += render->RenderToImage(text + offset, strlen(text + offset), NULL); - const vector &boxes = render->GetBoxes(); + offset += + render->RenderToImage(text + offset, strlen(text + offset), nullptr); + const std::vector &boxes = render->GetBoxes(); // If the page break split a bigram, correct the offset so we try the bigram // on the next iteration. @@ -251,6 +252,8 @@ void ExtractFontProperties(const string &utf8_text, // the input consists of the separated characters. NOTE(ranjith): As per // behdad@ this is not currently controllable at the level of the Pango // API. + // The most frequent of all is a single character "word" made by the CJK + // segmenter. // Safeguard against these cases here by just skipping the bigram. 
if (IsWhitespaceBox(boxes[b+1])) { continue; @@ -288,7 +291,7 @@ void ExtractFontProperties(const string &utf8_text, char buf[kBufSize]; snprintf(buf, kBufSize, "%d\n", static_cast(spacing_map.size())); output_string.append(buf); - map::const_iterator spacing_map_it; + std::map::const_iterator spacing_map_it; for (spacing_map_it = spacing_map.begin(); spacing_map_it != spacing_map.end(); ++spacing_map_it) { snprintf(buf, kBufSize, @@ -297,7 +300,7 @@ void ExtractFontProperties(const string &utf8_text, spacing_map_it->second.x_gap_after, static_cast(spacing_map_it->second.kerned_x_gaps.size())); output_string.append(buf); - map::const_iterator kern_it; + std::map::const_iterator kern_it; for (kern_it = spacing_map_it->second.kerned_x_gaps.begin(); kern_it != spacing_map_it->second.kerned_x_gaps.end(); ++kern_it) { snprintf(buf, kBufSize, @@ -310,11 +313,11 @@ void ExtractFontProperties(const string &utf8_text, } bool MakeIndividualGlyphs(Pix* pix, - const vector& vbox, + const std::vector& vbox, const int input_tiff_page) { // If checks fail, return false without exiting text2image if (!pix) { - tprintf("ERROR: MakeIndividualGlyphs(): Input Pix* is NULL\n"); + tprintf("ERROR: MakeIndividualGlyphs(): Input Pix* is nullptr\n"); return false; } else if (FLAGS_glyph_resized_size <= 0) { tprintf("ERROR: --glyph_resized_size must be positive\n"); @@ -357,7 +360,7 @@ bool MakeIndividualGlyphs(Pix* pix, continue; } // Crop the boxed character - Pix* pix_glyph = pixClipRectangle(pix, b, NULL); + Pix* pix_glyph = pixClipRectangle(pix, b, nullptr); if (!pix_glyph) { tprintf("ERROR: MakeIndividualGlyphs(): Failed to clip, at i=%d\n", i); continue; @@ -418,7 +421,7 @@ int main(int argc, char** argv) { tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true); if (FLAGS_list_available_fonts) { - const vector& all_fonts = FontUtils::ListAvailableFonts(); + const std::vector& all_fonts = FontUtils::ListAvailableFonts(); for (int i = 0; i < all_fonts.size(); ++i) { printf("%3d: 
%s\n", i, all_fonts[i].c_str()); ASSERT_HOST_MSG(FontUtils::IsAvailableFont(all_fonts[i].c_str()), @@ -445,7 +448,7 @@ int main(int argc, char** argv) { string pango_name; if (!FontUtils::IsAvailableFont(FLAGS_font.c_str(), &pango_name)) { tprintf("Could not find font named %s.\n", FLAGS_font.c_str()); - if (!pango_name.empty()) { + if (!pango_name.empty()) { tprintf("Pango suggested font %s.\n", pango_name.c_str()); } tprintf("Please correct --font arg.\n"); @@ -523,7 +526,7 @@ int main(int argc, char** argv) { if (FLAGS_render_ngrams && !FLAGS_unicharset_file.empty() && !unicharset.load_from_file(FLAGS_unicharset_file.c_str())) { tprintf("Failed to load unicharset from file %s\n", - FLAGS_unicharset_file.c_str()); + FLAGS_unicharset_file.c_str()); exit(1); } @@ -533,11 +536,11 @@ int main(int argc, char** argv) { const char *str8 = src_utf8.c_str(); int len = src_utf8.length(); int step; - vector > offsets; + std::vector > offsets; int offset = SpanUTF8Whitespace(str8); while (offset < len) { step = SpanUTF8NotWhitespace(str8 + offset); - offsets.push_back(make_pair(offset, step)); + offsets.push_back(std::make_pair(offset, step)); offset += step; offset += SpanUTF8Whitespace(str8 + offset); } @@ -549,7 +552,7 @@ int main(int argc, char** argv) { int ngram_len = offsets[i].second; // Skip words that contain characters not in found in unicharset. if (!FLAGS_unicharset_file.empty() && - !unicharset.encodable_string(curr_pos, NULL)) { + !unicharset.encodable_string(curr_pos, nullptr)) { continue; } rand_utf8.append(curr_pos, ngram_len); @@ -572,12 +575,12 @@ int main(int argc, char** argv) { } int im = 0; - vector page_rotation; + std::vector page_rotation; const char* to_render_utf8 = src_utf8.c_str(); tesseract::TRand randomizer; randomizer.set_seed(kRandomSeed); - vector font_names; + std::vector font_names; // We use a two pass mechanism to rotate images in both direction. 
// The first pass(0) will rotate the images in random directions and // the second pass(1) will mirror those rotations. @@ -587,7 +590,7 @@ int main(int argc, char** argv) { string font_used; for (int offset = 0; offset < strlen(to_render_utf8); ++im, ++page_num) { tlog(1, "Starting page %d\n", im); - Pix* pix = NULL; + Pix* pix = nullptr; if (FLAGS_find_fonts) { offset += render.RenderAllFontsToImage(FLAGS_min_coverage, to_render_utf8 + offset, @@ -597,14 +600,15 @@ int main(int argc, char** argv) { offset += render.RenderToImage(to_render_utf8 + offset, strlen(to_render_utf8 + offset), &pix); } - if (pix != NULL) { + if (pix != nullptr) { float rotation = 0; if (pass == 1) { // Pass 2, do mirror rotation. rotation = -1 * page_rotation[page_num]; } if (FLAGS_degrade_image) { - pix = DegradeImage(pix, FLAGS_exposure, &randomizer, FLAGS_rotate_image ? &rotation : NULL); + pix = DegradeImage(pix, FLAGS_exposure, &randomizer, + FLAGS_rotate_image ? &rotation : nullptr); } render.RotatePageBoxes(rotation); @@ -657,7 +661,7 @@ int main(int argc, char** argv) { string filename = FLAGS_outputbase.c_str(); filename += ".fontlist.txt"; FILE* fp = fopen(filename.c_str(), "wb"); - if (fp == NULL) { + if (fp == nullptr) { tprintf("Failed to create output font list %s\n", filename.c_str()); } else { for (int i = 0; i < font_names.size(); ++i) { diff --git a/training/unicharset_extractor.cpp b/training/unicharset_extractor.cpp index b60e9980..1e6c35af 100644 --- a/training/unicharset_extractor.cpp +++ b/training/unicharset_extractor.cpp @@ -134,7 +134,7 @@ int main(int argc, char** argv) { printf("Extracting unicharset from %s\n", argv[tessoptind]); FILE* box_file = fopen(argv[tessoptind], "rb"); - if (box_file == NULL) { + if (box_file == nullptr) { printf("Cannot open box file %s\n", argv[tessoptind]); return -1; } diff --git a/training/unicharset_training_utils.cpp b/training/unicharset_training_utils.cpp index 10aaf0e6..10582f02 100644 --- 
a/training/unicharset_training_utils.cpp +++ b/training/unicharset_training_utils.cpp @@ -37,11 +37,12 @@ namespace tesseract { // Helper sets the character attribute properties and sets up the script table. // Does not set tops and bottoms. -void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset) { +void SetupBasicProperties(bool report_errors, bool decompose, + UNICHARSET* unicharset) { for (int unichar_id = 0; unichar_id < unicharset->size(); ++unichar_id) { // Convert any custom ligatures. const char* unichar_str = unicharset->id_to_unichar(unichar_id); - for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != NULL; ++i) { + for (int i = 0; UNICHARSET::kCustomLigatures[i][0] != nullptr; ++i) { if (!strcmp(UNICHARSET::kCustomLigatures[i][1], unichar_str)) { unichar_str = UNICHARSET::kCustomLigatures[i][0]; break; @@ -129,7 +130,7 @@ void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset) { } // Record normalized version of this unichar. - STRING normed_str = tesseract::NormalizeUTF8String(unichar_str); + STRING normed_str = tesseract::NormalizeUTF8String(decompose, unichar_str); if (unichar_id != 0 && normed_str.length() > 0) { unicharset->set_normed(unichar_id, normed_str.c_str()); } else { @@ -158,7 +159,7 @@ void SetPropertiesForInputFile(const string& script_dir, // Set unichar properties tprintf("Setting unichar properties\n"); - SetupBasicProperties(true, &unicharset); + SetupBasicProperties(true, false, &unicharset); string xheights_str; for (int s = 0; s < unicharset.get_script_table_size(); ++s) { // Load the unicharset for the script if available. diff --git a/training/unicharset_training_utils.h b/training/unicharset_training_utils.h index ff226287..f03e12ac 100644 --- a/training/unicharset_training_utils.h +++ b/training/unicharset_training_utils.h @@ -33,7 +33,13 @@ namespace tesseract { // Helper sets the character attribute properties and sets up the script table. // Does not set tops and bottoms. 
-void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset); +void SetupBasicProperties(bool report_errors, bool decompose, + UNICHARSET* unicharset); +// Default behavior is to compose, until it is proven that decomposed benefits +// at least one language. +inline void SetupBasicProperties(bool report_errors, UNICHARSET* unicharset) { + SetupBasicProperties(report_errors, false, unicharset); +} // Helper to set the properties for an input unicharset file, writes to the // output file. If an appropriate script unicharset can be found in the diff --git a/training/wordlist2dawg.cpp b/training/wordlist2dawg.cpp index 8812df8e..f502ea19 100644 --- a/training/wordlist2dawg.cpp +++ b/training/wordlist2dawg.cpp @@ -76,7 +76,7 @@ int main(int argc, char** argv) { } tprintf("Reducing Trie to SquishedDawg\n"); tesseract::SquishedDawg *dawg = trie.trie_to_dawg(); - if (dawg != NULL && dawg->NumEdges() > 0) { + if (dawg != nullptr && dawg->NumEdges() > 0) { tprintf("Writing squished DAWG to '%s'\n", dawg_filename); dawg->write_squished_dawg(dawg_filename); } else { diff --git a/viewer/scrollview.cpp b/viewer/scrollview.cpp index ac059d54..d4eb6d46 100644 --- a/viewer/scrollview.cpp +++ b/viewer/scrollview.cpp @@ -37,9 +37,9 @@ #include "scrollview.h" #ifdef _MSC_VER -#pragma warning(disable:4786) // Don't give stupid warnings for stl -#pragma warning(disable:4018) // signed/unsigned warnings -#pragma warning(disable:4530) // exception warnings +#pragma warning(disable : 4786) // Don't give irrelevant warnings for stl +#pragma warning(disable : 4018) // signed/unsigned warnings +#pragma warning(disable : 4530) // exception warnings #endif const int kSvPort = 8461; diff --git a/viewer/scrollview.h b/viewer/scrollview.h index 12fac4e5..96c14a76 100644 --- a/viewer/scrollview.h +++ b/viewer/scrollview.h @@ -29,8 +29,8 @@ // API calls at all and generate a java user interface from scratch (or // basically generate any kind of java program, possibly even dangerous ones). 
-#ifndef TESSERACT_VIEWER_SCROLLVIEW_H__ -#define TESSERACT_VIEWER_SCROLLVIEW_H__ +#ifndef TESSERACT_VIEWER_SCROLLVIEW_H_ +#define TESSERACT_VIEWER_SCROLLVIEW_H_ // TODO(rays) Move ScrollView into the tesseract namespace. #ifndef OCR_SCROLLVIEW_H__ @@ -89,7 +89,7 @@ class SVEventHandler { // Gets called by the SV Window. Does nothing on default, overwrite this // to implement the desired behaviour - virtual void Notify(const SVEvent* sve) { } + virtual void Notify(const SVEvent* sve) { (void)sve; } }; // The ScrollView class provides the expernal API to the scrollviewer process. @@ -327,7 +327,7 @@ class ScrollView { // be unique among menubar eventIDs. void MenuItem(const char* parent, const char* name, int cmdEvent); -// This adds a new checkbox entry, which might initially be flagged. + // This adds a new checkbox entry, which might initially be flagged. void MenuItem(const char* parent, const char* name, int cmdEvent, bool flagged); @@ -415,4 +415,4 @@ class ScrollView { }; #endif // OCR_SCROLLVIEW_H__ -#endif // TESSERACT_VIEWER_SCROLLVIEW_H__ +#endif // TESSERACT_VIEWER_SCROLLVIEW_H_ diff --git a/viewer/svmnode.h b/viewer/svmnode.h index 379e212b..98f4e298 100644 --- a/viewer/svmnode.h +++ b/viewer/svmnode.h @@ -25,8 +25,8 @@ // A SVMenuNode can both be used in the context_ of popup menus as well as // menu bars. -#ifndef TESSERACT_VIEWER_SVMNODE_H__ -#define TESSERACT_VIEWER_SVMNODE_H__ +#ifndef TESSERACT_VIEWER_SVMNODE_H_ +#define TESSERACT_VIEWER_SVMNODE_H_ #include "strngs.h" @@ -93,4 +93,4 @@ class SVMenuNode { STRING description_; }; -#endif // TESSERACT_VIEWER_SVMNODE_H__ +#endif // TESSERACT_VIEWER_SVMNODE_H_ diff --git a/viewer/svpaint.cpp b/viewer/svpaint.cpp index 4d2f49d9..c267257b 100644 --- a/viewer/svpaint.cpp +++ b/viewer/svpaint.cpp @@ -1,6 +1,15 @@ // Copyright 2007 Google Inc. All Rights Reserved. 
// // Author: Joern Wanke +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. // // Simple drawing program to illustrate ScrollView capabilities. // diff --git a/viewer/svutil.cpp b/viewer/svutil.cpp index ae47e21b..f4596da7 100644 --- a/viewer/svutil.cpp +++ b/viewer/svutil.cpp @@ -22,6 +22,7 @@ #include #ifdef _WIN32 +#include struct addrinfo { struct sockaddr* ai_addr; int ai_addrlen; @@ -31,13 +32,13 @@ struct addrinfo { }; #else #include +#include #include #include #include #include #include #include -#include #include #include #ifdef __linux__ @@ -56,10 +57,54 @@ struct addrinfo { #include "config_auto.h" #endif -#ifndef GRAPHICS_DISABLED - #include "svutil.h" +SVMutex::SVMutex() { +#ifdef _WIN32 + mutex_ = CreateMutex(0, FALSE, 0); +#else + pthread_mutex_init(&mutex_, NULL); +#endif +} + +void SVMutex::Lock() { +#ifdef _WIN32 + WaitForSingleObject(mutex_, INFINITE); +#else + pthread_mutex_lock(&mutex_); +#endif +} + +void SVMutex::Unlock() { +#ifdef _WIN32 + ReleaseMutex(mutex_); +#else + pthread_mutex_unlock(&mutex_); +#endif +} + +// Create new thread. 
+void SVSync::StartThread(void* (*func)(void*), void* arg) { +#ifdef _WIN32 + LPTHREAD_START_ROUTINE f = (LPTHREAD_START_ROUTINE)func; + DWORD threadid; + HANDLE newthread = CreateThread(NULL, // default security attributes + 0, // use default stack size + f, // thread function + arg, // argument to thread function + 0, // use default creation flags + &threadid); // returns the thread identifier +#else + pthread_t helper; + pthread_attr_t attr; + pthread_attr_init(&attr); + pthread_attr_setdetachstate(&attr, PTHREAD_CREATE_DETACHED); + pthread_create(&helper, &attr, func, arg); +#endif +} + +#ifndef GRAPHICS_DISABLED + const int kMaxMsgSize = 4096; // Signals a thread to exit. @@ -118,6 +163,9 @@ void SVSync::StartProcess(const char* executable, const char* args) { } argv[argc] = NULL; execvp(executable, argv); + free(argv[0]); + free(argv[1]); + delete[] argv; } #endif } @@ -158,49 +206,6 @@ void SVSemaphore::Wait() { #endif } -SVMutex::SVMutex() { -#ifdef _WIN32 - mutex_ = CreateMutex(0, FALSE, 0); -#else - pthread_mutex_init(&mutex_, NULL); -#endif -} - -void SVMutex::Lock() { -#ifdef _WIN32 - WaitForSingleObject(mutex_, INFINITE); -#else - pthread_mutex_lock(&mutex_); -#endif -} - -void SVMutex::Unlock() { -#ifdef _WIN32 - ReleaseMutex(mutex_); -#else - pthread_mutex_unlock(&mutex_); -#endif -} - -// Create new thread. - -void SVSync::StartThread(void *(*func)(void*), void* arg) { -#ifdef _WIN32 - LPTHREAD_START_ROUTINE f = (LPTHREAD_START_ROUTINE) func; - DWORD threadid; - HANDLE newthread = CreateThread( - NULL, // default security attributes - 0, // use default stack size - f, // thread function - arg, // argument to thread function - 0, // use default creation flags - &threadid); // returns the thread identifier -#else - pthread_t helper; - pthread_create(&helper, NULL, func, arg); -#endif -} - // Place a message in the message buffer (and flush it). 
void SVNetwork::Send(const char* msg) { mutex_send_->Lock(); @@ -211,7 +216,7 @@ void SVNetwork::Send(const char* msg) { // Send the whole buffer. void SVNetwork::Flush() { mutex_send_->Lock(); - while (msg_buffer_out_.size() > 0) { + while (!msg_buffer_out_.empty()) { int i = send(stream_, msg_buffer_out_.c_str(), msg_buffer_out_.length(), 0); msg_buffer_out_.erase(0, i); } @@ -299,7 +304,8 @@ static std::string ScrollViewCommand(std::string scrollview_path) { const char* cmd_template = "-Djava.library.path=%s -jar %s/ScrollView.jar"; #else - const char* cmd_template = "-c \"trap 'kill %%1' 0 1 2 ; java " + const char* cmd_template = + "-c \"trap 'kill %%1' 0 1 2 ; java " "-Xms1024m -Xmx2048m -jar %s/ScrollView.jar" " & wait\""; #endif @@ -419,6 +425,7 @@ SVNetwork::SVNetwork(const char* hostname, int port) { // Wait for server to show up. // Note: There is no exception handling in case the server never turns up. + Close(); stream_ = socket(addr_info->ai_family, addr_info->ai_socktype, addr_info->ai_protocol); @@ -431,6 +438,7 @@ SVNetwork::SVNetwork(const char* hostname, int port) { sleep(1); #endif + Close(); stream_ = socket(addr_info->ai_family, addr_info->ai_socktype, addr_info->ai_protocol); } diff --git a/viewer/svutil.h b/viewer/svutil.h index ccfce917..b56025ec 100644 --- a/viewer/svutil.h +++ b/viewer/svutil.h @@ -21,12 +21,13 @@ // classes, which are used for thread/process creation & synchronization // and network connection. -#ifndef TESSERACT_VIEWER_SVUTIL_H__ -#define TESSERACT_VIEWER_SVUTIL_H__ +#ifndef TESSERACT_VIEWER_SVUTIL_H_ +#define TESSERACT_VIEWER_SVUTIL_H_ #ifdef _WIN32 #ifndef __GNUC__ #include +#include "platform.h" #if defined(_MSC_VER) && _MSC_VER < 1900 #define snprintf _snprintf #endif @@ -102,6 +103,17 @@ class SVMutex { #endif }; +// Auto-unlocking object that locks a mutex on construction and unlocks it +// on destruction. 
+class SVAutoLock { + public: + explicit SVAutoLock(SVMutex* mutex) : mutex_(mutex) { mutex->Lock(); } + ~SVAutoLock() { mutex_->Unlock(); } + + private: + SVMutex* mutex_; +}; + /// The SVNetwork class takes care of the remote connection for ScrollView /// This means setting up and maintaining a remote connection, sending and /// receiving messages and closing the connection. @@ -143,4 +155,4 @@ class SVNetwork { char* buffer_ptr_; // Unix (strtok_r) }; -#endif // TESSERACT_VIEWER_SVUTIL_H__ +#endif // TESSERACT_VIEWER_SVUTIL_H_ diff --git a/vs2010/include/leptonica_versionnumbers.props b/vs2010/include/leptonica_versionnumbers.props deleted file mode 100644 index f3b3f3f2..00000000 --- a/vs2010/include/leptonica_versionnumbers.props +++ /dev/null @@ -1,42 +0,0 @@ - - - - 416 - 8c - 171 - 1,71,0,0 - 1.71 - 143 - 394 - 125 - - - <_ProjectFileVersion>10.0.40219.1 - - - - $(GIFLIB_VERSION) - - - $(LIBJPEG_VERSION) - - - $(LIBLEPT_VERSION) - - - $(LIBLEPT_VERSION_R) - - - $(LIBLEPT_NUMBER) - - - $(LIBPNG_VERSION) - - - $(LIBTIFF_VERSION) - - - $(ZLIB_VERSION) - - - \ No newline at end of file diff --git a/vs2010/include/tesseract_versionnumbers.props b/vs2010/include/tesseract_versionnumbers.props deleted file mode 100644 index e16989d0..00000000 --- a/vs2010/include/tesseract_versionnumbers.props +++ /dev/null @@ -1,25 +0,0 @@ - - - - - - - 303 - 3,3,0,0 - 3.03 - - - <_ProjectFileVersion>10.0.40219.1 - - - - $(LIBTESS_VERSION) - - - $(LIBTESS_VERSION_R) - - - $(LIBTESS_NUMBER) - - - \ No newline at end of file diff --git a/vs2010/libtesseract/libtesseract.rc b/vs2010/libtesseract/libtesseract.rc deleted file mode 100644 index f72d17f5..00000000 --- a/vs2010/libtesseract/libtesseract.rc +++ /dev/null @@ -1,101 +0,0 @@ -// Microsoft Visual C++ generated resource script. -// -#include "resource.h" - -#define APSTUDIO_READONLY_SYMBOLS -///////////////////////////////////////////////////////////////////////////// -// -// Generated from the TEXTINCLUDE 2 resource. 
-// -#include "windows.h" - -///////////////////////////////////////////////////////////////////////////// -#undef APSTUDIO_READONLY_SYMBOLS - -///////////////////////////////////////////////////////////////////////////// -// English (U.S.) resources - -#if !defined(AFX_RESOURCE_DLL) || defined(AFX_TARG_ENU) -#ifdef _WIN32 -LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US -#pragma code_page(1252) -#endif //_WIN32 - -#ifdef APSTUDIO_INVOKED -///////////////////////////////////////////////////////////////////////////// -// -// TEXTINCLUDE -// - -1 TEXTINCLUDE -BEGIN - "resource.h\0" -END - -2 TEXTINCLUDE -BEGIN - "#include ""afxres.h""\r\n" - "\0" -END - -3 TEXTINCLUDE -BEGIN - "\r\n" - "\0" -END - -#endif // APSTUDIO_INVOKED - - -///////////////////////////////////////////////////////////////////////////// -// -// Version -// - -VS_VERSION_INFO VERSIONINFO - FILEVERSION 3,5,0,0 - PRODUCTVERSION 3,5,0,0 - FILEFLAGSMASK 0x17L -#ifdef _DEBUG - FILEFLAGS 0x1L -#else - FILEFLAGS 0x0L -#endif - FILEOS 0x4L - FILETYPE 0x7L - FILESUBTYPE 0x0L -BEGIN - BLOCK "StringFileInfo" - BEGIN - BLOCK "040904b0" - BEGIN - VALUE "FileDescription", "Tesseract OCR library" - VALUE "FileVersion", "3, 5, 0, 0" - VALUE "InternalName", "libtesseract" - VALUE "LegalCopyright", "Copyright (C) 2015 Google, Inc. Licensed under the Apache License, Version 2.0" - VALUE "OriginalFilename", "libtesseract" - VALUE "ProductName", "Tesseract OCR Library" - VALUE "ProductVersion", "3, 5, 0, 0" - END - END - BLOCK "VarFileInfo" - BEGIN - VALUE "Translation", 0x409, 1200 - END -END - -#endif // English (U.S.) resources -///////////////////////////////////////////////////////////////////////////// - - - -#ifndef APSTUDIO_INVOKED -///////////////////////////////////////////////////////////////////////////// -// -// Generated from the TEXTINCLUDE 3 resource. 
-// - - -///////////////////////////////////////////////////////////////////////////// -#endif // not APSTUDIO_INVOKED - diff --git a/vs2010/libtesseract/libtesseract.vcxproj b/vs2010/libtesseract/libtesseract.vcxproj deleted file mode 100644 index 9b734d5d..00000000 --- a/vs2010/libtesseract/libtesseract.vcxproj +++ /dev/null @@ -1,870 +0,0 @@ - - - - - DLL_Debug - Win32 - - - DLL_Release - Win32 - - - LIB_Debug - Win32 - - - LIB_OpenCL_Debug - Win32 - - - LIB_OpenCL_Release - Win32 - - - LIB_Release - Win32 - - - - libtesseract304 - {D14F28C7-0CAE-4C37-B174-40FDEFBD4FE0} - libtesseract - - - - DynamicLibrary - MultiByte - - - DynamicLibrary - MultiByte - - - StaticLibrary - MultiByte - - - StaticLibrary - MultiByte - - - StaticLibrary - MultiByte - - - StaticLibrary - MultiByte - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - <_ProjectFileVersion>10.0.40219.1 - ..\$(Configuration)\ - ..\$(Configuration)\ - $(Configuration)\ - $(Configuration)\ - false - false - ..\$(Configuration)\ - ..\$(Configuration)\ - $(Configuration)\ - $(Configuration)\ - false - false - ..\$(Configuration)\ - $(Configuration)\ - false - ..\$(Configuration)\ - $(Configuration)\ - false - $(ProjectName)-static-debug - $(ProjectName)-opencl-static-debug - $(ProjectName)-static - $(ProjectName)-opencl-static - $(ProjectName)d - false - true - - - - - - - - %(Outputs) - - - Disabled - ..\..\api;..\..\ccmain;..\..\ccutil;..\..\ccstruct;..\..\classify;..\..\cube;..\..\cutil;..\..\dict;..\..\neural_networks\runtime;..\..\textord;..\..\viewer;..\..\wordrec;.;..\..\..\include;..\..\..\include\leptonica;..\port;..\..\opencl;%(AdditionalIncludeDirectories) - WIN32;_WINDOWS;_DEBUG;_LIB;USE_STD_NAMESPACE;WINDLLNAME="$(TargetFileName)";%(PreprocessorDefinitions) - false - EnableFastChecks - MultiThreadedDebugDLL - - - Level3 - OldStyle - Default - 4244;4305;4018;4267;4996;4800;4005;4355;4099;4566;%(DisableSpecificWarnings) - false - - - $(OutDir)$(TargetName)$(TargetExt) - - - copy library 
to lib directory - if not exist ..\..\..\lib md ..\..\..\lib -copy "$(TargetPath)" ..\..\..\lib - - - - ..\version.bat - - - GIT_REV - - - - - - - - - %(Outputs) - - - Disabled - $(AMDAPPSDKROOT)\include;$(INTELOCLSDKROOT)\include;..\..\api;..\..\ccmain;..\..\ccutil;..\..\ccstruct;..\..\classify;..\..\cube;..\..\cutil;..\..\dict;..\..\neural_networks\runtime;..\..\textord;..\..\viewer;..\..\wordrec;.;..\..\..\include;..\..\..\include\leptonica;..\port;..\..\opencl;%(AdditionalIncludeDirectories) - USE_OPENCL;WIN32;_WINDOWS;_DEBUG;_LIB;USE_STD_NAMESPACE;WINDLLNAME="$(TargetFileName)";%(PreprocessorDefinitions) - false - EnableFastChecks - MultiThreadedDebugDLL - - - Level3 - OldStyle - Default - 4244;4305;4018;4267;4996;4800;4005;4355;4099;4566;%(DisableSpecificWarnings) - false - - - $(OutDir)$(TargetName)$(TargetExt) - - - copy library to lib directory - if not exist ..\..\..\lib md ..\..\..\lib -copy "$(TargetPath)" ..\..\..\lib - - - - - - - - - - %(Outputs) - - - MaxSpeed - ..\..\api;..\..\ccmain;..\..\ccutil;..\..\ccstruct;..\..\classify;..\..\cube;..\..\cutil;..\..\dict;..\..\neural_networks\runtime;..\..\textord;..\..\viewer;..\..\wordrec;.;..\..\..\include;..\..\..\include\leptonica;..\port;..\..\opencl;%(AdditionalIncludeDirectories) - WIN32;_WINDOWS;NDEBUG;_LIB;USE_STD_NAMESPACE;WINDLLNAME="$(TargetFileName)";%(PreprocessorDefinitions) - MultiThreadedDLL - - - Level3 - - - Default - 4244;4305;4018;4267;4996;4800;4005;4355;4099;4566;%(DisableSpecificWarnings) - - - $(OutDir)$(TargetName)$(TargetExt) - - - - - copy library to lib directory - if not exist ..\..\..\lib md ..\..\..\lib -copy "$(TargetPath)" ..\..\..\lib - - - - - - - - - - %(Outputs) - - - MaxSpeed - 
$(AMDAPPSDKROOT)\include;$(INTELOCLSDKROOT)\include;..\..\api;..\..\ccmain;..\..\ccutil;..\..\ccstruct;..\..\classify;..\..\cube;..\..\cutil;..\..\dict;..\..\neural_networks\runtime;..\..\textord;..\..\viewer;..\..\wordrec;.;..\..\..\include;..\..\..\include\leptonica;..\port;..\..\opencl;%(AdditionalIncludeDirectories) - USE_OPENCL;WIN32;_WINDOWS;NDEBUG;_LIB;USE_STD_NAMESPACE;WINDLLNAME="$(TargetFileName)";%(PreprocessorDefinitions) - MultiThreadedDLL - - - Level3 - - - Default - 4244;4305;4018;4267;4996;4800;4005;4355;4099;4566;%(DisableSpecificWarnings) - - - $(OutDir)$(TargetName)$(TargetExt) - - - - - copy library to lib directory - if not exist ..\..\..\lib md ..\..\..\lib -copy "$(TargetPath)" ..\..\..\lib - - - - - - - - - - %(Outputs) - - - Full - ..\..\api;..\..\ccmain;..\..\ccutil;..\..\ccstruct;..\..\classify;..\..\cube;..\..\cutil;..\..\dict;..\..\neural_networks\runtime;..\..\textord;..\..\viewer;..\..\wordrec;.;..\..\..\include;..\..\..\include\leptonica;..\port;..\..\opencl;%(AdditionalIncludeDirectories) - WIN32;_WINDOWS;NDEBUG;_USRDLL;_WINDLL;USE_STD_NAMESPACE;TESS_EXPORTS;LIBLEPT_IMPORTS;WINDLLNAME="$(TargetFileName)";%(PreprocessorDefinitions) - MultiThreadedDLL - - - Level3 - - - Default - 4244;4305;4018;4267;4996;4800;4005;4355;4099;4566;%(DisableSpecificWarnings) - true - - - ws2_32.lib;user32.lib;liblept$(LIBLEPT_VERSION).lib;libtiff$(LIBTIFF_VERSION)-static-mtdll.lib - $(OutDir)$(TargetName)$(TargetExt) - $(LIBTESS_NUMBER) - ..\..\..\lib;%(AdditionalLibraryDirectories) - UseLinkTimeCodeGeneration - true - true - - - copy library to lib directory - if not exist ..\..\..\lib md ..\..\..\lib -copy "$(TargetPath)" ..\..\..\lib -copy "$(TargetDir)$(TargetName).lib" ..\..\..\lib - - - - - - - - - - %(Outputs) - - - Disabled - 
..\..\api;..\..\ccmain;..\..\ccutil;..\..\ccstruct;..\..\classify;..\..\cube;..\..\cutil;..\..\dict;..\..\neural_networks\runtime;..\..\textord;..\..\viewer;..\..\wordrec;.;..\..\..\include;..\..\..\include\leptonica;..\port;..\..\opencl;%(AdditionalIncludeDirectories) - WIN32;_WINDOWS;_DEBUG;_USRDLL;_WINDLL;USE_STD_NAMESPACE;TESS_EXPORTS;LIBLEPT_IMPORTS;WINDLLNAME="$(TargetFileName)";%(PreprocessorDefinitions) - false - EnableFastChecks - MultiThreadedDebugDLL - - - Level3 - OldStyle - Default - 4244;4305;4018;4267;4996;4800;4005;4355;4099;4566;%(DisableSpecificWarnings) - false - - - ws2_32.lib;user32.lib;liblept$(LIBLEPT_VERSION)d.lib;libtiff$(LIBTIFF_VERSION)-static-mtdll-debug.lib - $(OutDir)$(TargetName)$(TargetExt) - $(LIBTESS_NUMBER) - ..\..\..\lib;%(AdditionalLibraryDirectories) - true - - - copy library to lib directory - if not exist ..\..\..\lib md ..\..\..\lib -copy "$(TargetPath)" ..\..\..\lib -copy "$(TargetDir)$(TargetName).lib" ..\..\..\lib - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - 
- - - - - - - - - - - - - - - - - - - - - - - - - - - - - - \ No newline at end of file diff --git a/vs2010/libtesseract/libtesseract.vcxproj.filters b/vs2010/libtesseract/libtesseract.vcxproj.filters deleted file mode 100644 index 911f3f7b..00000000 --- a/vs2010/libtesseract/libtesseract.vcxproj.filters +++ /dev/null @@ -1,1613 +0,0 @@ - - - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - 
- Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - 
Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - Source Files - - - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - 
Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - 
Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - 
Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - Header Files - - - - - {df5a77a2-95e6-4c3d-a8ff-17c102dc47a3} - - - {81873181-2448-481b-8a31-daedb240e410} - - - {456a0027-7a61-4d9a-b150-841104208d04} - - - - - Resource Files - - - \ No newline at end of file diff --git a/vs2010/libtesseract/resource.h b/vs2010/libtesseract/resource.h deleted file mode 100644 index 6824c386..00000000 --- a/vs2010/libtesseract/resource.h +++ /dev/null @@ -1,14 +0,0 @@ -//{{NO_DEPENDENCIES}} -// Microsoft Visual C++ generated include file. -// Used by libtesseract.rc - -// Next default values for new objects -// -#ifdef APSTUDIO_INVOKED -#ifndef APSTUDIO_READONLY_SYMBOLS -#define _APS_NEXT_RESOURCE_VALUE 101 -#define _APS_NEXT_COMMAND_VALUE 40001 -#define _APS_NEXT_CONTROL_VALUE 1001 -#define _APS_NEXT_SYMED_VALUE 101 -#endif -#endif diff --git a/vs2010/port/vcsversion.h b/vs2010/port/vcsversion.h index 6d5bed80..c08664ef 100644 --- a/vs2010/port/vcsversion.h +++ b/vs2010/port/vcsversion.h @@ -1,2 +1,11 @@ -#define GIT_REV "3.04.00" +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+#define GIT_REV "4.00.00dev" diff --git a/vs2010/tesseract.sln b/vs2010/tesseract.sln deleted file mode 100644 index 32962b91..00000000 --- a/vs2010/tesseract.sln +++ /dev/null @@ -1,46 +0,0 @@ - -Microsoft Visual Studio Solution File, Format Version 11.00 -# Visual C++ Express 2010 -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "libtesseract304", "libtesseract\libtesseract.vcxproj", "{D14F28C7-0CAE-4C37-B174-40FDEFBD4FE0}" -EndProject -Project("{8BC9CEB8-8B4A-11D0-8D11-00A0C91BC942}") = "tesseract", "tesseract\tesseract.vcxproj", "{C76996CB-C4CB-4D89-9F67-F605DF129618}" -EndProject -Global - GlobalSection(SolutionConfigurationPlatforms) = preSolution - DLL_Debug|Win32 = DLL_Debug|Win32 - DLL_Release|Win32 = DLL_Release|Win32 - LIB_Debug|Win32 = LIB_Debug|Win32 - LIB_OpenCL_Debug|Win32 = LIB_OpenCL_Debug|Win32 - LIB_OpenCL_Release|Win32 = LIB_OpenCL_Release|Win32 - LIB_Release|Win32 = LIB_Release|Win32 - EndGlobalSection - GlobalSection(ProjectConfigurationPlatforms) = postSolution - {D14F28C7-0CAE-4C37-B174-40FDEFBD4FE0}.DLL_Debug|Win32.ActiveCfg = DLL_Debug|Win32 - {D14F28C7-0CAE-4C37-B174-40FDEFBD4FE0}.DLL_Debug|Win32.Build.0 = DLL_Debug|Win32 - {D14F28C7-0CAE-4C37-B174-40FDEFBD4FE0}.DLL_Release|Win32.ActiveCfg = DLL_Release|Win32 - {D14F28C7-0CAE-4C37-B174-40FDEFBD4FE0}.DLL_Release|Win32.Build.0 = DLL_Release|Win32 - {D14F28C7-0CAE-4C37-B174-40FDEFBD4FE0}.LIB_Debug|Win32.ActiveCfg = LIB_Debug|Win32 - {D14F28C7-0CAE-4C37-B174-40FDEFBD4FE0}.LIB_Debug|Win32.Build.0 = LIB_Debug|Win32 - {D14F28C7-0CAE-4C37-B174-40FDEFBD4FE0}.LIB_OpenCL_Debug|Win32.ActiveCfg = LIB_OpenCL_Debug|Win32 - {D14F28C7-0CAE-4C37-B174-40FDEFBD4FE0}.LIB_OpenCL_Debug|Win32.Build.0 = LIB_OpenCL_Debug|Win32 - {D14F28C7-0CAE-4C37-B174-40FDEFBD4FE0}.LIB_OpenCL_Release|Win32.ActiveCfg = LIB_OpenCL_Release|Win32 - {D14F28C7-0CAE-4C37-B174-40FDEFBD4FE0}.LIB_OpenCL_Release|Win32.Build.0 = LIB_OpenCL_Release|Win32 - {D14F28C7-0CAE-4C37-B174-40FDEFBD4FE0}.LIB_Release|Win32.ActiveCfg = 
LIB_Release|Win32 - {D14F28C7-0CAE-4C37-B174-40FDEFBD4FE0}.LIB_Release|Win32.Build.0 = LIB_Release|Win32 - {C76996CB-C4CB-4D89-9F67-F605DF129618}.DLL_Debug|Win32.ActiveCfg = DLL_Debug|Win32 - {C76996CB-C4CB-4D89-9F67-F605DF129618}.DLL_Debug|Win32.Build.0 = DLL_Debug|Win32 - {C76996CB-C4CB-4D89-9F67-F605DF129618}.DLL_Release|Win32.ActiveCfg = DLL_Release|Win32 - {C76996CB-C4CB-4D89-9F67-F605DF129618}.DLL_Release|Win32.Build.0 = DLL_Release|Win32 - {C76996CB-C4CB-4D89-9F67-F605DF129618}.LIB_Debug|Win32.ActiveCfg = LIB_Debug|Win32 - {C76996CB-C4CB-4D89-9F67-F605DF129618}.LIB_Debug|Win32.Build.0 = LIB_Debug|Win32 - {C76996CB-C4CB-4D89-9F67-F605DF129618}.LIB_OpenCL_Debug|Win32.ActiveCfg = LIB_OpenCL_Debug|Win32 - {C76996CB-C4CB-4D89-9F67-F605DF129618}.LIB_OpenCL_Debug|Win32.Build.0 = LIB_OpenCL_Debug|Win32 - {C76996CB-C4CB-4D89-9F67-F605DF129618}.LIB_OpenCL_Release|Win32.ActiveCfg = LIB_OpenCL_Release|Win32 - {C76996CB-C4CB-4D89-9F67-F605DF129618}.LIB_OpenCL_Release|Win32.Build.0 = LIB_OpenCL_Release|Win32 - {C76996CB-C4CB-4D89-9F67-F605DF129618}.LIB_Release|Win32.ActiveCfg = LIB_Release|Win32 - {C76996CB-C4CB-4D89-9F67-F605DF129618}.LIB_Release|Win32.Build.0 = LIB_Release|Win32 - EndGlobalSection - GlobalSection(SolutionProperties) = preSolution - HideSolutionNode = FALSE - EndGlobalSection -EndGlobal diff --git a/vs2010/tesseract/resource.h b/vs2010/tesseract/resource.h index 25e0730f..ea8408a9 100644 --- a/vs2010/tesseract/resource.h +++ b/vs2010/tesseract/resource.h @@ -1,6 +1,15 @@ //{{NO_DEPENDENCIES}} // Microsoft Visual C++ generated include file. // Used by tesseract.rc +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// http://www.apache.org/licenses/LICENSE-2.0 +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. // Next default values for new objects // diff --git a/vs2010/tesseract/tesseract.rc b/vs2010/tesseract/tesseract.rc index f2fc5248..3ce8e2f3 100644 --- a/vs2010/tesseract/tesseract.rc +++ b/vs2010/tesseract/tesseract.rc @@ -53,8 +53,8 @@ END // VS_VERSION_INFO VERSIONINFO - FILEVERSION 3,5,0,0 - PRODUCTVERSION 3,5,0,0 + FILEVERSION 4,0,0,0 + PRODUCTVERSION 4,0,0,0 FILEFLAGSMASK 0x17L #ifdef _DEBUG FILEFLAGS 0x1L @@ -70,12 +70,12 @@ BEGIN BLOCK "040904b0" BEGIN VALUE "FileDescription", "Tesseract command-line OCR engine" - VALUE "FileVersion", "3,5,0,0" + VALUE "FileVersion", "4,0,0,0" VALUE "InternalName", "tesseract" - VALUE "LegalCopyright", "Copyright (C) 2015 Google, Inc. Licensed under the Apache License, Version 2.0" + VALUE "LegalCopyright", "Copyright (C) 2016 Google, Inc. 
Licensed under the Apache License, Version 2.0" VALUE "OriginalFilename", "tesseract.exe" VALUE "ProductName", "Tesseract-OCR" - VALUE "ProductVersion", "3.05.00dev" + VALUE "ProductVersion", "4.00.00dev" END END BLOCK "VarFileInfo" diff --git a/vs2010/tesseract/tesseract.vcxproj b/vs2010/tesseract/tesseract.vcxproj deleted file mode 100644 index 246d69a0..00000000 --- a/vs2010/tesseract/tesseract.vcxproj +++ /dev/null @@ -1,360 +0,0 @@ - - - - - DLL_Debug - Win32 - - - DLL_Release - Win32 - - - LIB_Debug - Win32 - - - LIB_OpenCL_Debug - Win32 - - - LIB_OpenCL_Release - Win32 - - - LIB_Release - Win32 - - - - {C76996CB-C4CB-4D89-9F67-F605DF129618} - tesseract - - - - Application - MultiByte - - - Application - MultiByte - - - Application - MultiByte - - - Application - MultiByte - - - Application - MultiByte - - - Application - MultiByte - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - - <_ProjectFileVersion>10.0.40219.1 - ..\$(Configuration)\ - ..\$(Configuration)\ - $(Configuration)\ - $(Configuration)\ - false - false - ..\$(Configuration)\ - ..\$(Configuration)\ - $(Configuration)\ - $(Configuration)\ - false - false - ..\$(Configuration)\ - $(Configuration)\ - false - ..\$(Configuration)\ - $(Configuration)\ - false - $(ProjectName)d - $(ProjectName)d - $(ProjectName)-dll - $(ProjectName)-dlld - false - false - false - $(ProjectName)-opencl - - - - - - - - %(Outputs) - - - Disabled - ..\..\api;..\..\ccmain;..\..\ccutil;..\..\ccstruct;..\..\classify;..\..\cube;..\..\cutil;..\..\dict;..\..\image;..\..\neural_networks\runtime;..\..\textord;..\..\viewer;..\..\wordrec;.;..\..\..\include;..\..\..\include\leptonica;..\port;..\..\opencl;%(AdditionalIncludeDirectories) - WIN32;_WINDOWS;_DEBUG;USE_STD_NAMESPACE;%(PreprocessorDefinitions) - false - EnableFastChecks - MultiThreadedDebugDLL - - - Level3 - OldStyle - Default - 4244;4305;4018;4267;4996;4800;4005;4355;4099;4566;%(DisableSpecificWarnings) - false - - - - - 
MYVERSION=$(LIBTESS_VERSION_R);%(PreprocessorDefinitions) - - - ws2_32.lib;user32.lib;zlib$(ZLIB_VERSION)-static-mtdll-debug.lib;libpng$(LIBPNG_VERSION)-static-mtdll-debug.lib;libjpeg$(LIBJPEG_VERSION)-static-mtdll-debug.lib;giflib$(GIFLIB_VERSION)-static-mtdll-debug.lib;libtiff$(LIBTIFF_VERSION)-static-mtdll-debug.lib;liblept$(LIBLEPT_VERSION)-static-mtdll-debug.lib - $(OutDir)$(TargetName)$(TargetExt) - ..\..\..\lib;%(AdditionalLibraryDirectories) - true - Console - - - - - - - - - - - - - - - %(Outputs) - - - Disabled - ..\..\api;..\..\ccmain;..\..\ccutil;..\..\ccstruct;..\..\classify;..\..\cube;..\..\cutil;..\..\dict;..\..\image;..\..\neural_networks\runtime;..\..\textord;..\..\viewer;..\..\wordrec;.;..\..\..\include;..\..\..\include\leptonica;..\port;..\..\opencl;%(AdditionalIncludeDirectories) - WIN32;_WINDOWS;_DEBUG;USE_STD_NAMESPACE;%(PreprocessorDefinitions) - false - EnableFastChecks - MultiThreadedDebugDLL - - - Level3 - OldStyle - Default - 4244;4305;4018;4267;4996;4800;4005;4355;4099;4566;%(DisableSpecificWarnings) - false - - - - - MYVERSION=$(LIBTESS_VERSION_R);%(PreprocessorDefinitions) - - - ws2_32.lib;user32.lib;OpenCL.lib;zlib$(ZLIB_VERSION)-static-mtdll-debug.lib;libpng$(LIBPNG_VERSION)-static-mtdll-debug.lib;libjpeg$(LIBJPEG_VERSION)-static-mtdll-debug.lib;giflib$(GIFLIB_VERSION)-static-mtdll-debug.lib;libtiff$(LIBTIFF_VERSION)-static-mtdll-debug.lib;liblept$(LIBLEPT_VERSION)-static-mtdll-debug.lib - $(OutDir)$(TargetName)$(TargetExt) - $(AMDAPPSDKROOT)\lib\x86;$(INTELOCLSDKROOT)\lib\x86;..\..\..\lib;%(AdditionalLibraryDirectories) - true - Console - - - - - - - - - - - - - - - %(Outputs) - - - Full - ..\..\api;..\..\ccmain;..\..\ccutil;..\..\ccstruct;..\..\classify;..\..\cube;..\..\cutil;..\..\dict;..\..\image;..\..\neural_networks\runtime;..\..\textord;..\..\viewer;..\..\wordrec;.;..\..\..\include;..\..\..\include\leptonica;..\port;..\..\opencl;%(AdditionalIncludeDirectories) - 
WIN32;_WINDOWS;NDEBUG;USE_STD_NAMESPACE;%(PreprocessorDefinitions) - MultiThreadedDLL - - - Level3 - - - Default - 4244;4305;4018;4267;4996;4800;4005;4355;4099;4566;%(DisableSpecificWarnings) - true - - - MYVERSION=$(LIBTESS_VERSION_R);%(PreprocessorDefinitions) - - - ws2_32.lib;user32.lib;zlib$(ZLIB_VERSION)-static-mtdll.lib;libpng$(LIBPNG_VERSION)-static-mtdll.lib;libjpeg$(LIBJPEG_VERSION)-static-mtdll.lib;giflib$(GIFLIB_VERSION)-static-mtdll.lib;libtiff$(LIBTIFF_VERSION)-static-mtdll.lib;liblept$(LIBLEPT_VERSION)-static-mtdll.lib - $(OutDir)$(TargetName)$(TargetExt) - ..\..\..\lib;%(AdditionalLibraryDirectories) - Console - true - true - UseLinkTimeCodeGeneration - - - - - - - - - %(Outputs) - - - Full - $(AMDAPPSDKROOT)\include;$(INTELOCLSDKROOT)\include;..\..\api;..\..\ccmain;..\..\ccutil;..\..\ccstruct;..\..\classify;..\..\cube;..\..\cutil;..\..\dict;..\..\image;..\..\neural_networks\runtime;..\..\textord;..\..\viewer;..\..\wordrec;.;..\..\..\include;..\..\..\include\leptonica;..\port;..\..\opencl;%(AdditionalIncludeDirectories) - USE_OPENCL;WIN32;_WINDOWS;NDEBUG;USE_STD_NAMESPACE;%(PreprocessorDefinitions) - MultiThreadedDLL - - - Level3 - - - Default - 4244;4305;4018;4267;4996;4800;4005;4355;4099;4566;%(DisableSpecificWarnings) - true - - - MYVERSION=$(LIBTESS_VERSION_R);%(PreprocessorDefinitions) - - - ws2_32.lib;user32.lib;OpenCL.lib;zlib$(ZLIB_VERSION)-static-mtdll.lib;libpng$(LIBPNG_VERSION)-static-mtdll.lib;libjpeg$(LIBJPEG_VERSION)-static-mtdll.lib;giflib$(GIFLIB_VERSION)-static-mtdll.lib;libtiff$(LIBTIFF_VERSION)-static-mtdll.lib;liblept$(LIBLEPT_VERSION)-static-mtdll.lib - $(OutDir)$(TargetName)$(TargetExt) - ..\..\..\lib;$(AMDAPPSDKROOT)\lib\x86;$(INTELOCLSDKROOT)\lib\x86;%(AdditionalLibraryDirectories) - Console - true - true - UseLinkTimeCodeGeneration - - - - - - - - - %(Outputs) - - - Full - 
..\..\api;..\..\ccmain;..\..\ccutil;..\..\ccstruct;..\..\classify;..\..\cube;..\..\cutil;..\..\dict;..\..\image;..\..\neural_networks\runtime;..\..\textord;..\..\viewer;..\..\wordrec;.;..\..\..\include;..\..\..\include\leptonica;..\port;..\..\opencl;%(AdditionalIncludeDirectories) - WIN32;_WINDOWS;NDEBUG;USE_STD_NAMESPACE;TESS_IMPORTS;LIBLEPT_IMPORTS;%(PreprocessorDefinitions) - MultiThreadedDLL - - - Level3 - - - Default - 4244;4305;4018;4267;4996;4800;4005;4355;4099;4566;%(DisableSpecificWarnings) - true - - - MYVERSION=$(LIBTESS_VERSION_R);%(PreprocessorDefinitions) - - - ws2_32.lib;user32.lib;liblept$(LIBLEPT_VERSION).lib - $(OutDir)$(TargetName)$(TargetExt) - $(LIBTESS_NUMBER) - ..\..\..\lib;%(AdditionalLibraryDirectories) - Console - UseLinkTimeCodeGeneration - MachineX86 - true - true - - - - - - - - - %(Outputs) - - - Disabled - ..\..\api;..\..\ccmain;..\..\ccutil;..\..\ccstruct;..\..\classify;..\..\cube;..\..\cutil;..\..\dict;..\..\image;..\..\neural_networks\runtime;..\..\textord;..\..\viewer;..\..\wordrec;.;..\..\..\include;..\..\..\include\leptonica;..\port;..\..\opencl;%(AdditionalIncludeDirectories) - WIN32;_WINDOWS;_DEBUG;USE_STD_NAMESPACE;TESS_IMPORTS;LIBLEPT_IMPORTS;%(PreprocessorDefinitions) - false - EnableFastChecks - MultiThreadedDebugDLL - - - Level3 - OldStyle - Default - 4244;4305;4018;4267;4996;4800;4005;4355;4099;4566;%(DisableSpecificWarnings) - false - - - MYVERSION=$(LIBTESS_VERSION_R);%(PreprocessorDefinitions) - - - ws2_32.lib;user32.lib;liblept$(LIBLEPT_VERSION)d.lib - $(OutDir)$(TargetName)$(TargetExt) - $(LIBLEPT_NUMBER) - ..\..\..\lib;%(AdditionalLibraryDirectories) - true - Console - MachineX86 - - - - - - - - - - - - - - {d14f28c7-0cae-4c37-b174-40fdefbd4fe0} - false - - - - - - \ No newline at end of file diff --git a/vs2010/tesseract/tesseract.vcxproj.filters b/vs2010/tesseract/tesseract.vcxproj.filters deleted file mode 100644 index 34be85d7..00000000 --- a/vs2010/tesseract/tesseract.vcxproj.filters +++ /dev/null @@ -1,32 
+0,0 @@ - - - - - {4FC737F1-C7A5-4376-A066-2A32D752A2FF} - cpp;c;cc;cxx;def;odl;idl;hpj;bat;asm;asmx - - - {93995380-89BD-4b04-88EB-625FBE52EBFB} - h;hpp;hxx;hm;inl;inc;xsd - - - {67DA6AB6-F800-4c08-8B7A-83BB121AAD01} - rc;ico;cur;bmp;dlg;rc2;rct;bin;rgs;gif;jpg;jpeg;jpe;resx;tiff;tif;png;wav;mfcribbon-ms - - - - - Source Files - - - - - Resource Files - - - - - Header Files - - - \ No newline at end of file diff --git a/vs2010/tesshelper.py b/vs2010/tesshelper.py deleted file mode 100644 index a7bb274c..00000000 --- a/vs2010/tesshelper.py +++ /dev/null @@ -1,502 +0,0 @@ -#!/usr/bin/env python -from __future__ import print_function -from builtins import input -""" -tesshelper.py -- Utility operations to compare, report stats, and copy - public headers for tesseract 3.0x VS2010 Project - -$RCSfile: tesshelper.py,v $ $Revision: 7ca575b377aa $ $Date: 2012/03/07 17:26:31 $ -""" - -r""" -Requires: - - python 2.7 or greater: activestate.com - http://www.activestate.com/activepython/downloads - -because using the new argparse module and new literal set syntax (s={1, 2}) . - -General Notes: --------------- - -Format for a .vcproj file entry: - - - - -""" - -epilogStr = r""" -Examples: - -Assume that tesshelper.py is in c:\buildfolder\tesseract-3.02\vs2010, -which is also the current directory. Then, - - python tesshelper .. compare - -will compare c:\buildfolder\tesseract-3.02 "library" directories to the -libtesseract Project -(c:\buildfolder\tesseract-3.02\vs2010\libtesseract\libtesseract.vcproj). - - python tesshelper .. report - -will display summary stats for c:\buildfolder\tesseract-3.02 "library" -directories and the libtesseract Project. - - python tesshelper .. copy ..\..\include - -will copy all "public" libtesseract header files to -c:\buildfolder\include. - - python tesshelper .. clean - -will clean the vs2010 folder of all build directories, and .user, .suo, -.ncb, and other temp files. 
- -""" - -# imports of python standard library modules -# See Python Documentation | Library Reference for details -import collections -import glob -import argparse -import os -import re -import shutil -import sys - -# ==================================================================== - -VERSION = "1.0 %s" % "$Date: 2012/03/07 17:26:31 $".split()[1] -PROJ_SUBDIR = r"vs2010\libtesseract" -PROJFILE = "libtesseract.vcproj" - -NEWHEADERS_FILENAME = "newheaders.txt" -NEWSOURCES_FILENAME = "newsources.txt" - -fileNodeTemplate = \ -''' ''' - -# ==================================================================== - -def getProjectfiles(libTessDir, libProjectFile, nTrimChars): - """Return sets of all, c, h, and resources files in libtesseract Project""" - - #extract filenames of header & source files from the .vcproj - projectCFiles = set() - projectHFiles = set() - projectRFiles = set() - projectFilesSet = set() - f = open(libProjectFile, "r") - data = f.read() - f.close() - - projectFiles = re.findall(r'(?i)Include="(\.[^"]+)"', data) - for projectFile in projectFiles: - root, ext = os.path.splitext(projectFile.lower()) - if ext == ".c" or ext == ".cpp": - projectCFiles.add(projectFile) - elif ext == ".h": - projectHFiles.add(projectFile) - elif ext == ".rc": - projectRFiles.add(projectFile) - else: - print("unknown file type: %s" % projectFile) - - relativePath = os.path.join(libTessDir, projectFile) - relativePath = os.path.abspath(relativePath) - relativePath = relativePath[nTrimChars:].lower() - projectFilesSet.add(relativePath) - - return projectFilesSet, projectHFiles, projectCFiles, projectRFiles - -def getTessLibFiles(tessDir, nTrimChars): - """Return set of all libtesseract files in tessDir""" - - libDirs = [ - "api", - "ccmain", - "ccstruct", - "ccutil", - "classify", - "cube", - "cutil", - "dict", - r"neural_networks\runtime", - "opencl", - "textord", - "viewer", - "wordrec", - #"training", - r"vs2010\port", - r"vs2010\libtesseract", - ] - - #create list of 
all .h, .c, .cpp files in "library" directories - tessFiles = set() - for curDir in libDirs: - baseDir = os.path.join(tessDir, curDir) - for filetype in ["*.c", "*.cpp", "*.h"]: - pattern = os.path.join(baseDir, filetype) - fileList = glob.glob(pattern) - for curFile in fileList: - curFile = os.path.abspath(curFile) - relativePath = curFile[nTrimChars:].lower() - tessFiles.add(relativePath) - - return tessFiles - -# ==================================================================== - -def tessCompare(tessDir): - '''Compare libtesseract Project files and actual "sub-library" files.''' - - vs2010Dir = os.path.join(tessDir, "vs2010") - libTessDir = os.path.join(vs2010Dir, "libtesseract") - libProjectFile = os.path.join(libTessDir,"libtesseract.vcxproj") - tessAbsDir = os.path.abspath(tessDir) - nTrimChars = len(tessAbsDir)+1 - print('Comparing VS2010 Project "%s" with\n "%s"' % (libProjectFile, - tessAbsDir)) - - projectFilesSet, projectHFiles, projectCFiles, projectRFiles = \ - getProjectfiles(libTessDir, libProjectFile, nTrimChars) - tessFiles = getTessLibFiles(tessDir, nTrimChars) - - extraFiles = tessFiles - projectFilesSet - print("%2d Extra files (in %s but not in Project)" % (len(extraFiles), - tessAbsDir)) - headerFiles = [] - sourceFiles = [] - sortedList = list(extraFiles) - sortedList.sort() - for filename in sortedList: - root, ext = os.path.splitext(filename.lower()) - if ext == ".h": - headerFiles.append(filename) - else: - sourceFiles.append(filename) - print(" %s " % filename) - - print() - print("%2d new header file items written to %s" % (len(headerFiles), - NEWHEADERS_FILENAME)) - headerFiles.sort() - with open(NEWHEADERS_FILENAME, "w") as f: - for filename in headerFiles: - f.write(fileNodeTemplate % filename) - - print("%2d new source file items written to %s" % (len(sourceFiles), - NEWSOURCES_FILENAME)) - sourceFiles.sort() - with open(NEWSOURCES_FILENAME, "w") as f: - for filename in sourceFiles: - f.write(fileNodeTemplate % filename) - 
print() - - deadFiles = projectFilesSet - tessFiles - print("%2d Dead files (in Project but not in %s" % (len(deadFiles), - tessAbsDir)) - sortedList = list(deadFiles) - sortedList.sort() - for filename in sortedList: - print(" %s " % filename) - -# ==================================================================== - -def tessReport(tessDir): - """Report summary stats on "sub-library" files and libtesseract Project file.""" - - vs2010Dir = os.path.join(tessDir, "vs2010") - libTessDir = os.path.join(vs2010Dir, "libtesseract") - libProjectFile = os.path.join(libTessDir,"libtesseract.vcproj") - tessAbsDir = os.path.abspath(tessDir) - nTrimChars = len(tessAbsDir)+1 - - projectFilesSet, projectHFiles, projectCFiles, projectRFiles = \ - getProjectfiles(libTessDir, libProjectFile, nTrimChars) - tessFiles = getTessLibFiles(tessDir, nTrimChars) - - print('Summary stats for "%s" library directories' % tessAbsDir) - folderCounters = {} - for tessFile in tessFiles: - tessFile = tessFile.lower() - folder, head = os.path.split(tessFile) - file, ext = os.path.splitext(head) - typeCounter = folderCounters.setdefault(folder, collections.Counter()) - typeCounter[ext[1:]] += 1 - - folders = list(folderCounters.keys()) - folders.sort() - totalFiles = 0 - totalH = 0 - totalCPP = 0 - totalOther = 0 - - print() - print(" total h cpp") - print(" ----- --- ---") - for folder in folders: - counters = folderCounters[folder] - nHFiles = counters['h'] - nCPPFiles = counters['cpp'] - - total = nHFiles + nCPPFiles - totalFiles += total - totalH += nHFiles - totalCPP += nCPPFiles - - print(" %5d %3d %3d %s" % (total, nHFiles, nCPPFiles, folder)) - print(" ----- --- ---") - print(" %5d %3d %3d" % (totalFiles, totalH, totalCPP)) - - print() - print('Summary stats for VS2010 Project "%s"' % libProjectFile) - print(" %5d %s" %(len(projectHFiles), "Header files")) - print(" %5d %s" % (len(projectCFiles), "Source files")) - print(" %5d %s" % (len(projectRFiles), "Resource files")) - print(" -----") - 
print(" %5d" % (len(projectHFiles) + len(projectCFiles) + len(projectRFiles), )) - -# ==================================================================== - -def copyIncludes(fileSet, description, tessDir, includeDir): - """Copy set of files to specified include dir.""" - - print() - print('Copying libtesseract "%s" headers to %s' % (description, includeDir)) - print() - - sortedList = list(fileSet) - sortedList.sort() - - count = 0 - errList = [] - for includeFile in sortedList: - filepath = os.path.join(tessDir, includeFile) - if os.path.isfile(filepath): - shutil.copy2(filepath, includeDir) - print("Copied: %s" % includeFile) - count += 1 - else: - print('***Error: "%s" doesn\'t exist"' % filepath) - errList.append(filepath) - - print('%d header files successfully copied to "%s"' % (count, includeDir)) - if len(errList): - print("The following %d files were not copied:") - for filepath in errList: - print(" %s" % filepath) - -def tessCopy(tessDir, includeDir): - '''Copy all "public" libtesseract Project header files to include directory. 
- - Preserves directory hierarchy.''' - - baseIncludeSet = { - r"api\baseapi.h", - r"api\capi.h", - r"api\apitypes.h", - r"ccstruct\publictypes.h", - r"ccmain\thresholder.h", - r"ccutil\host.h", - r"ccutil\basedir.h", - r"ccutil\tesscallback.h", - r"ccutil\unichar.h", - r"ccutil\platform.h", - } - - strngIncludeSet = { - r"ccutil\strngs.h", - r"ccutil\memry.h", - r"ccutil\host.h", - r"ccutil\serialis.h", - r"ccutil\errcode.h", - r"ccutil\fileerr.h", - #r"ccutil\genericvector.h", - } - - resultIteratorIncludeSet = { - r"ccmain\ltrresultiterator.h", - r"ccmain\pageiterator.h", - r"ccmain\resultiterator.h", - r"ccutil\genericvector.h", - r"ccutil\tesscallback.h", - r"ccutil\errcode.h", - r"ccutil\host.h", - r"ccutil\helpers.h", - r"ccutil\ndminx.h", - r"ccutil\params.h", - r"ccutil\unicharmap.h", - r"ccutil\unicharset.h", - } - - genericVectorIncludeSet = { - r"ccutil\genericvector.h", - r"ccutil\tesscallback.h", - r"ccutil\errcode.h", - r"ccutil\host.h", - r"ccutil\helpers.h", - r"ccutil\ndminx.h", - } - - blobsIncludeSet = { - r"ccstruct\blobs.h", - r"ccstruct\rect.h", - r"ccstruct\points.h", - r"ccstruct\ipoints.h", - r"ccutil\elst.h", - r"ccutil\host.h", - r"ccutil\serialis.h", - r"ccutil\lsterr.h", - r"ccutil\ndminx.h", - r"ccutil\tprintf.h", - r"ccutil\params.h", - r"viewer\scrollview.h", - r"ccstruct\vecfuncs.h", - } - - extraFilesSet = { - #r"vs2010\include\stdint.h", - r"vs2010\include\leptonica_versionnumbers.vsprops", - r"vs2010\include\tesseract_versionnumbers.vsprops", - } - - tessIncludeDir = os.path.join(includeDir, "tesseract") - if os.path.isfile(tessIncludeDir): - print('Aborting: "%s" is a file not a directory.' 
% tessIncludeDir) - return - if not os.path.exists(tessIncludeDir): - os.mkdir(tessIncludeDir) - - #fileSet = baseIncludeSet | strngIncludeSet | genericVectorIncludeSet | blobsIncludeSet - fileSet = baseIncludeSet | strngIncludeSet | resultIteratorIncludeSet - - copyIncludes(fileSet, "public", tessDir, tessIncludeDir) - copyIncludes(extraFilesSet, "extra", tessDir, includeDir) - -# ==================================================================== - -def tessClean(tessDir): - '''Clean vs2010 folder of all build directories and certain temp files.''' - - vs2010Dir = os.path.join(tessDir, "vs2010") - vs2010AbsDir = os.path.abspath(vs2010Dir) - - answer = eval(input( - 'Are you sure you want to clean the\n "%s" folder (Yes/No) [No]? ' % - vs2010AbsDir)) - if answer.lower() not in ("yes",): - return - answer = eval(input('Only list the items to be deleted (Yes/No) [Yes]? ')) - answer = answer.strip() - listOnly = answer.lower() not in ("no",) - - for rootDir, dirs, files in os.walk(vs2010AbsDir): - for buildDir in ("LIB_Release", "LIB_Debug", "DLL_Release", "DLL_Debug"): - if buildDir in dirs: - dirs.remove(buildDir) - absBuildDir = os.path.join(rootDir, buildDir) - if listOnly: - print("Would remove: %s" % absBuildDir) - else: - print("Removing: %s" % absBuildDir) - shutil.rmtree(absBuildDir) - - if rootDir == vs2010AbsDir: - for file in files: - if file.lower() not in ("tesseract.sln", - "tesshelper.py", - "readme.txt"): - absPath = os.path.join(rootDir, file) - if listOnly: - print("Would remove: %s" % absPath) - else: - print("Removing: %s" % absPath) - os.remove(absPath) - else: - for file in files: - root, ext = os.path.splitext(file) - if ext.lower() in (".suo", - ".ncb", - ".user", - ) or ( - len(ext)>0 and ext[-1] == "~"): - absPath = os.path.join(rootDir, file) - if listOnly: - print("Would remove: %s" % absPath) - else: - print("Removing: %s" % absPath) - os.remove(absPath) - -# ==================================================================== - -def 
validateTessDir(tessDir): - """Check that tessDir is a valid tesseract directory.""" - - if not os.path.isdir(tessDir): - raise argparse.ArgumentTypeError('Directory "%s" doesn\'t exist.' % tessDir) - projFile = os.path.join(tessDir, PROJ_SUBDIR, PROJFILE) - if not os.path.isfile(projFile): - raise argparse.ArgumentTypeError('Project file "%s" doesn\'t exist.' % projFile) - return tessDir - -def validateDir(dir): - """Check that dir is a valid directory named include.""" - - if not os.path.isdir(dir): - raise argparse.ArgumentTypeError('Directory "%s" doesn\'t exist.' % dir) - - dirpath = os.path.abspath(dir) - head, tail = os.path.split(dirpath) - if tail.lower() != "include": - raise argparse.ArgumentTypeError('Include directory "%s" must be named "include".' % tail) - - return dir - -def main (): - parser = argparse.ArgumentParser( - epilog=epilogStr, - formatter_class=argparse.RawDescriptionHelpFormatter) - - parser.add_argument("--version", action="version", - version="%(prog)s " + VERSION) - parser.add_argument('tessDir', type=validateTessDir, - help="tesseract installation directory") - - subparsers = parser.add_subparsers( - dest="subparser_name", - title="Commands") - parser_changes = subparsers.add_parser('compare', - help="compare libtesseract Project with tessDir") - parser_changes.set_defaults(func=tessCompare) - - parser_report = subparsers.add_parser('report', - help="report libtesseract summary stats") - parser_report.set_defaults(func=tessReport) - - parser_copy = subparsers.add_parser('copy', - help="copy public libtesseract header files to includeDir") - parser_copy.add_argument('includeDir', type=validateDir, - help="Directory to copy header files to.") - parser_copy.set_defaults(func=tessCopy) - - parser_clean = subparsers.add_parser('clean', - help="clean vs2010 folder of build folders and .user files") - parser_clean.set_defaults(func=tessClean) - - #kludge because argparse has no ability to set default subparser - if (len(sys.argv) == 2): - 
sys.argv.append("compare") - args = parser.parse_args() - - #handle commands - if args.func == tessCopy: - args.func(args.tessDir, args.includeDir) - else: - args.func(args.tessDir) - -if __name__ == '__main__' : - main() diff --git a/vs2010/version.bat b/vs2010/version.bat deleted file mode 100644 index 68e68828..00000000 --- a/vs2010/version.bat +++ /dev/null @@ -1,2 +0,0 @@ -@echo off -FOR /F "tokens=*" %%i IN ('call git describe --tags --always') DO echo #define GIT_REV "%%i" > ..\port\vcsversion.h \ No newline at end of file diff --git a/wordrec/associate.h b/wordrec/associate.h index 3d6fc447..10b1e0b7 100644 --- a/wordrec/associate.h +++ b/wordrec/associate.h @@ -47,9 +47,7 @@ struct AssociateStats { gap_sum = 0; } - void Print() { - tprintf("AssociateStats: w(%g %d)\n", shape_cost, bad_shape); - } + void Print() { tprintf("AssociateStats: s(%g %d)\n", shape_cost, bad_shape); } float shape_cost; // cost of blob shape bool bad_shape; // true if the shape of the blob is unacceptable diff --git a/wordrec/chopper.cpp b/wordrec/chopper.cpp index 69a458bc..dfda3e91 100644 --- a/wordrec/chopper.cpp +++ b/wordrec/chopper.cpp @@ -426,7 +426,7 @@ void Wordrec::chop_word_main(WERD_RES *word) { if (word->best_choice == NULL) { // SegSearch found no valid paths, so just use the leading diagonal. - word->FakeWordFromRatings(); + word->FakeWordFromRatings(TOP_CHOICE_PERM); } word->RebuildBestState(); // If we finished without a hyphen at the end of the word, let the next word @@ -568,9 +568,7 @@ int Wordrec::select_blob_to_split( for (x = 0; x < blob_choices.size(); ++x) { if (blob_choices[x] == NULL) { - if (fragments != NULL) { - delete[] fragments; - } + delete[] fragments; return x; } else { blob_choice = blob_choices[x]; @@ -614,9 +612,7 @@ int Wordrec::select_blob_to_split( } } } - if (fragments != NULL) { - delete[] fragments; - } + delete[] fragments; // TODO(daria): maybe a threshold of badness for // worst_near_fragment would be useful. 
return worst_index_near_fragment != -1 ? diff --git a/wordrec/language_model.cpp b/wordrec/language_model.cpp index f0e3be66..99710478 100644 --- a/wordrec/language_model.cpp +++ b/wordrec/language_model.cpp @@ -32,7 +32,7 @@ #include "params.h" #include "params_training_featdef.h" -#if defined(_MSC_VER) || defined(ANDROID) +#if (defined(_MSC_VER) && _MSC_VER < 1900) || defined(ANDROID) double log2(double n) { return log(n) / log(2.0); } @@ -988,7 +988,7 @@ float LanguageModel::ComputeNgramCost(const char *unichar, unichar, context_ptr, CertaintyScore(certainty)/denom, prob, ngram_and_classifier_cost); } - if (modified_context != NULL) delete[] modified_context; + delete[] modified_context; return ngram_and_classifier_cost; } diff --git a/wordrec/lm_consistency.h b/wordrec/lm_consistency.h index 1d452157..8693bbfc 100644 --- a/wordrec/lm_consistency.h +++ b/wordrec/lm_consistency.h @@ -18,14 +18,14 @@ // //////////////////////////////////////////////////////////////////////// +#ifndef TESSERACT_WORDREC_LM_CONSISTENCY_H_ +#define TESSERACT_WORDREC_LM_CONSISTENCY_H_ + #include "dawg.h" #include "dict.h" #include "host.h" #include "ratngs.h" -#ifndef TESSERACT_WORDREC_CONSISTENCY_H_ -#define TESSERACT_WORDREC_CONSISTENCY_H_ - namespace tesseract { static const char * const XHeightConsistencyEnumName[] = { @@ -137,7 +137,6 @@ struct LMConsistencyInfo { XHeightConsistencyEnum xht_decision; }; - } // namespace tesseract -#endif // TESSERACT_WORDREC_CONSISTENCY_H_ +#endif // TESSERACT_WORDREC_LM_CONSISTENCY_H_ diff --git a/wordrec/lm_state.h b/wordrec/lm_state.h index 623bbb5e..6229e9b3 100644 --- a/wordrec/lm_state.h +++ b/wordrec/lm_state.h @@ -48,8 +48,8 @@ typedef unsigned char LanguageModelFlagsType; /// Each ViterbiStateEntry contains information from various components of the /// language model: dawgs in which the path is found, character ngram model /// probability of the path, script/chartype/font consistency info, state for -/// language-specific heuristics 
(e.g. hyphenated and compound words, lower/upper -/// case preferences, etc). +/// language-specific heuristics (e.g. hyphenated and compound words, +/// lower/upper case preferences, etc). /// /// Each ViterbiStateEntry also contains the parent pointer, so that the path /// that it represents (WERD_CHOICE) can be constructed by following these @@ -165,13 +165,13 @@ struct ViterbiStateEntry : public ELIST_LINK { /// Various information about the characters on the path represented /// by this ViterbiStateEntry. - float ratings_sum; //< sum of ratings of character on the path - float min_certainty; //< minimum certainty on the path - int adapted; //< number of BLOB_CHOICES from adapted templates - int length; //< number of characters on the path + float ratings_sum; //< sum of ratings of character on the path + float min_certainty; //< minimum certainty on the path + int adapted; //< number of BLOB_CHOICES from adapted templates + int length; //< number of characters on the path float outline_length; //< length of the outline so far LMConsistencyInfo consistency_info; //< path consistency info - AssociateStats associate_stats; //< character widths/gaps/seams + AssociateStats associate_stats; //< character widths/gaps/seams /// Flags for marking the entry as a top choice path with /// the smallest rating or lower/upper case letters). diff --git a/wordrec/measure.h b/wordrec/measure.h index 9c739068..894938e5 100644 --- a/wordrec/measure.h +++ b/wordrec/measure.h @@ -60,10 +60,9 @@ typedef struct * Add one more sample to a measurement. 
**********************************************************************/ -#define ADD_SAMPLE(m,s) \ -(m.sum_of_samples += (float) (s), \ - m.sum_of_squares += (float) (s) * (float) (s), \ - ++m.num_samples) +#define ADD_SAMPLE(m, s) \ + (m.sum_of_samples += (float)(s), \ + m.sum_of_squares += (float)(s) * (float)(s), ++m.num_samples) /********************************************************************** * mean @@ -71,10 +70,8 @@ typedef struct * Return the mean value of the measurement. **********************************************************************/ -#define MEAN(m) \ -((m).num_samples ? \ - ((float) ((m).sum_of_samples / (m).num_samples)) : \ - 0) +#define MEAN(m) \ + ((m).num_samples ? ((float)((m).sum_of_samples / (m).num_samples)) : 0) /********************************************************************** * new_measurement @@ -83,10 +80,8 @@ typedef struct * samples. **********************************************************************/ -#define new_measurement(m) \ -((m).num_samples = 0, \ - (m).sum_of_samples = 0, \ - (m).sum_of_squares = 0) +#define new_measurement(m) \ + ((m).num_samples = 0, (m).sum_of_samples = 0, (m).sum_of_squares = 0) /********************************************************************** * number_of_samples @@ -112,13 +107,12 @@ typedef struct * Return the variance of the measurement. **********************************************************************/ -#define VARIANCE(m) \ -(((m).num_samples > 1) ? \ - ((float) \ - (((m).num_samples * (m).sum_of_squares - \ - (m).sum_of_samples * (m).sum_of_samples) / \ - (((m).num_samples - 1) * (m).num_samples))) : \ - 0) +#define VARIANCE(m) \ + (((m).num_samples > 1) \ + ? ((float)(((m).num_samples * (m).sum_of_squares - \ + (m).sum_of_samples * (m).sum_of_samples) / \ + (((m).num_samples - 1) * (m).num_samples))) \ + : 0) /********************************************************************** * print_summary @@ -126,10 +120,8 @@ typedef struct * Summarize a MEASUREMENT record. 
**********************************************************************/ -#define print_summary(string,measure) \ -cprintf ("\t%-20s \tn = %d, \tm = %4.2f, \ts = %4.2f\n ", \ - string, \ - number_of_samples (measure), \ - MEAN (measure), \ - standard_deviation (measure)) +#define print_summary(string, measure) \ + cprintf("\t%-20s \tn = %d, \tm = %4.2f, \ts = %4.2f\n ", string, \ + number_of_samples(measure), MEAN(measure), \ + standard_deviation(measure)) #endif diff --git a/wordrec/params_model.cpp b/wordrec/params_model.cpp index a77d2b13..727b657b 100644 --- a/wordrec/params_model.cpp +++ b/wordrec/params_model.cpp @@ -100,17 +100,15 @@ bool ParamsModel::Equivalent(const ParamsModel &that) const { bool ParamsModel::LoadFromFile( const char *lang, const char *full_path) { - FILE *fp = fopen(full_path, "rb"); - if (!fp) { + TFile fp; + if (!fp.Open(full_path, nullptr)) { tprintf("Error opening file %s\n", full_path); return false; } - bool result = LoadFromFp(lang, fp, -1); - fclose(fp); - return result; + return LoadFromFp(lang, &fp); } -bool ParamsModel::LoadFromFp(const char *lang, FILE *fp, inT64 end_offset) { +bool ParamsModel::LoadFromFp(const char *lang, TFile *fp) { const int kMaxLineSize = 100; char line[kMaxLineSize]; BitVector present; @@ -120,9 +118,8 @@ bool ParamsModel::LoadFromFp(const char *lang, FILE *fp, inT64 end_offset) { GenericVector &weights = weights_vec_[pass_]; weights.init_to_size(PTRAIN_NUM_FEATURE_TYPES, 0.0); - while ((end_offset < 0 || ftell(fp) < end_offset) && - fgets(line, kMaxLineSize, fp)) { - char *key = NULL; + while (fp->FGets(line, kMaxLineSize) != nullptr) { + char *key = nullptr; float value; if (!ParseLine(line, &key, &value)) continue; diff --git a/wordrec/params_model.h b/wordrec/params_model.h index a66e4450..df48d5bd 100644 --- a/wordrec/params_model.h +++ b/wordrec/params_model.h @@ -61,7 +61,7 @@ class ParamsModel { // Returns true on success. 
bool LoadFromFile(const char *lang, const char *full_path); - bool LoadFromFp(const char *lang, FILE *fp, inT64 end_offset); + bool LoadFromFp(const char *lang, TFile *fp); const GenericVector& weights() const { return weights_vec_[pass_]; diff --git a/wordrec/pieces.cpp b/wordrec/pieces.cpp index 04e34039..f7b406d5 100644 --- a/wordrec/pieces.cpp +++ b/wordrec/pieces.cpp @@ -267,7 +267,6 @@ void Wordrec::merge_and_put_fragment_lists(inT16 row, inT16 column, delete [] choice_lists_it; } - /********************************************************************** * get_fragment_lists * diff --git a/wordrec/tface.cpp b/wordrec/tface.cpp index e21fcb88..823dc7a9 100644 --- a/wordrec/tface.cpp +++ b/wordrec/tface.cpp @@ -44,12 +44,16 @@ namespace tesseract { * and Dawg models. */ void Wordrec::program_editup(const char *textbase, - bool init_classifier, - bool init_dict) { + TessdataManager *init_classifier, + TessdataManager *init_dict) { if (textbase != NULL) imagefile = textbase; InitFeatureDefs(&feature_defs_); InitAdaptiveClassifier(init_classifier); - if (init_dict) getDict().Load(Dict::GlobalDawgCache()); + if (init_dict) { + getDict().SetupForLoad(Dict::GlobalDawgCache()); + getDict().Load(lang, init_dict); + getDict().FinishLoad(); + } pass2_ok_split = chop_ok_split; } diff --git a/wordrec/wordrec.h b/wordrec/wordrec.h index fb54ccae..592dc29b 100644 --- a/wordrec/wordrec.h +++ b/wordrec/wordrec.h @@ -16,8 +16,8 @@ // /////////////////////////////////////////////////////////////////////// -#ifndef TESSERACT_WORDREC_WORDREC_H__ -#define TESSERACT_WORDREC_WORDREC_H__ +#ifndef TESSERACT_WORDREC_WORDREC_H_ +#define TESSERACT_WORDREC_WORDREC_H_ #include "associate.h" #include "classify.h" @@ -200,9 +200,8 @@ class Wordrec : public Classify { } // tface.cpp - void program_editup(const char *textbase, - bool init_classifier, - bool init_permute); + void program_editup(const char *textbase, TessdataManager *init_classifier, + TessdataManager *init_dict); void 
cc_recog(WERD_RES *word); void program_editdown(inT32 elasped_time); void set_pass1(); @@ -491,4 +490,4 @@ class Wordrec : public Classify { } // namespace tesseract -#endif // TESSERACT_WORDREC_WORDREC_H__ +#endif // TESSERACT_WORDREC_WORDREC_H_