2010-11-24 02:34:14 +08:00
|
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
// File: recogtraining.cpp
|
|
|
|
// Description: Functions for ambiguity and parameter training.
|
|
|
|
// Author: Daria Antonova
|
|
|
|
//
|
|
|
|
// (C) Copyright 2009, Google Inc.
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
|
|
|
//
|
|
|
|
///////////////////////////////////////////////////////////////////////
|
|
|
|
|
|
|
|
#include "tesseractclass.h"
|
|
|
|
|
|
|
|
#include "boxread.h"
|
|
|
|
#include "control.h"
|
2019-03-30 20:29:03 +08:00
|
|
|
#include "host.h" // for NearlyEqual
|
2010-11-24 02:34:14 +08:00
|
|
|
#include "ratngs.h"
|
2019-04-04 20:02:07 +08:00
|
|
|
#ifndef DISABLED_LEGACY_ENGINE
|
2010-11-24 02:34:14 +08:00
|
|
|
#include "reject.h"
|
2019-04-04 20:02:07 +08:00
|
|
|
#endif
|
2010-11-24 02:34:14 +08:00
|
|
|
#include "stopper.h"
|
|
|
|
|
|
|
|
namespace tesseract {
|
|
|
|
|
Use POSIX data types and macros (#878)
* api: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* cutil: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* training: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract data types by POSIX data types
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract data types by POSIX data types
Now all Tesseract data types which are no longer needed can be removed
from ccutil/host.h.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccmain: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccstruct: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* classify: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* dict: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* lstm: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* textord: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* wordrec: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* ccutil: Replace Tesseract's MIN_*INT, MAX_*INT* by POSIX *INT*_MIN, *INT*_MAX
Remove the macros which are now unused from ccutil/host.h.
Remove also the obsolete history comments.
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* Fix build error caused by ambiguous ClipToRange
Error message vom Appveyor CI:
C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2672: 'ClipToRange': no matching overloaded function found [C:\projects\tesseract\build\libtesseract.vcxproj]
C:\projects\tesseract\ccstruct\coutln.cpp(818): error C2782: 'T ClipToRange(const T &,const T &,const T &)': template parameter 'T' is ambiguous [C:\projects\tesseract\build\libtesseract.vcxproj]
c:\projects\tesseract\ccutil\helpers.h(122): note: see declaration of 'ClipToRange'
C:\projects\tesseract\ccstruct\coutln.cpp(818): note: could be 'char'
C:\projects\tesseract\ccstruct\coutln.cpp(818): note: or 'int'
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* unittest: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
* arch: Replace Tesseract's MAX_INT8 by POSIX INT8_MAX
Signed-off-by: Stefan Weil <sw@weilnetz.de>
2018-03-14 04:36:30 +08:00
|
|
|
// Tolerance passed to NearlyEqual<int>() when comparing corresponding edges
// (bottom, left, right, top) of a Tesseract-found word box and a box read
// from the box file. Presumably in image pixels -- TODO confirm.
const int16_t kMaxBoxEdgeDiff = 2;
|
2010-11-24 02:34:14 +08:00
|
|
|
|
|
|
|
// Sets flags necessary for recognition in the training mode.
|
|
|
|
// Opens and returns the pointer to the output file.
|
2019-03-30 20:29:03 +08:00
|
|
|
FILE* Tesseract::init_recog_training(const STRING& fname) {
|
2010-11-24 02:34:14 +08:00
|
|
|
if (tessedit_ambigs_training) {
|
2019-03-30 20:29:03 +08:00
|
|
|
tessedit_tess_adaption_mode.set_value(0); // turn off adaption
|
|
|
|
tessedit_enable_doc_dict.set_value(0); // turn off document dictionary
|
2010-11-24 02:34:14 +08:00
|
|
|
// Explore all segmentations.
|
|
|
|
getDict().stopper_no_acceptable_choices.set_value(1);
|
|
|
|
}
|
|
|
|
|
|
|
|
STRING output_fname = fname;
|
2019-09-23 14:12:33 +08:00
|
|
|
const char* lastdot = strrchr(output_fname.c_str(), '.');
|
2019-03-30 20:29:03 +08:00
|
|
|
if (lastdot != nullptr)
|
2019-09-23 14:12:33 +08:00
|
|
|
output_fname[lastdot - output_fname.c_str()] = '\0';
|
2010-11-24 02:34:14 +08:00
|
|
|
output_fname += ".txt";
|
2019-09-23 14:12:33 +08:00
|
|
|
FILE* output_file = fopen(output_fname.c_str(), "a+");
|
2018-07-03 02:34:18 +08:00
|
|
|
if (output_file == nullptr) {
|
2019-09-23 14:12:33 +08:00
|
|
|
tprintf("Error: Could not open file %s\n", output_fname.c_str());
|
2018-07-03 02:34:18 +08:00
|
|
|
ASSERT_HOST(output_file);
|
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
return output_file;
|
|
|
|
}
|
|
|
|
|
|
|
|
// Copies the bounding box from page_res_it->word() to the given TBOX.
|
2019-03-30 20:29:03 +08:00
|
|
|
static bool read_t(PAGE_RES_IT* page_res_it, TBOX* tbox) {
|
2016-12-13 00:19:35 +08:00
|
|
|
while (page_res_it->block() != nullptr && page_res_it->word() == nullptr)
|
2012-02-02 10:59:49 +08:00
|
|
|
page_res_it->forward();
|
|
|
|
|
2016-12-13 00:19:35 +08:00
|
|
|
if (page_res_it->word() != nullptr) {
|
2010-11-24 02:34:14 +08:00
|
|
|
*tbox = page_res_it->word()->word->bounding_box();
|
|
|
|
|
2012-02-02 10:59:49 +08:00
|
|
|
// If tbox->left() is negative, the training image has vertical text and
|
|
|
|
// all the coordinates of bounding boxes of page_res are rotated by 90
|
|
|
|
// degrees in a counterclockwise direction. We need to rotate the TBOX back
|
|
|
|
// in order to compare with the TBOXes of box files.
|
|
|
|
if (tbox->left() < 0) {
|
|
|
|
tbox->rotate(FCOORD(0.0, -1.0));
|
|
|
|
}
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
return true;
|
|
|
|
} else {
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
// This function takes tif/box pair of files and runs recognition on the image,
|
|
|
|
// while making sure that the word bounds that tesseract identified roughly
|
|
|
|
// match to those specified by the input box file. For each word (ngram in a
|
|
|
|
// single bounding box from the input box file) it outputs the ocred result,
|
|
|
|
// the correct label, rating and certainty.
|
2019-03-30 20:29:03 +08:00
|
|
|
void Tesseract::recog_training_segmented(const STRING& fname,
|
|
|
|
PAGE_RES* page_res,
|
|
|
|
volatile ETEXT_DESC* monitor,
|
|
|
|
FILE* output_file) {
|
2010-11-24 02:34:14 +08:00
|
|
|
STRING box_fname = fname;
|
2019-09-23 14:12:33 +08:00
|
|
|
const char* lastdot = strrchr(box_fname.c_str(), '.');
|
2019-03-30 20:29:03 +08:00
|
|
|
if (lastdot != nullptr)
|
2019-09-23 14:12:33 +08:00
|
|
|
box_fname[lastdot - box_fname.c_str()] = '\0';
|
2010-11-24 02:34:14 +08:00
|
|
|
box_fname += ".box";
|
2015-11-25 01:21:57 +08:00
|
|
|
// ReadNextBox() will close box_file
|
2019-09-23 14:12:33 +08:00
|
|
|
FILE* box_file = fopen(box_fname.c_str(), "r");
|
2018-07-03 02:34:18 +08:00
|
|
|
if (box_file == nullptr) {
|
2019-09-23 14:12:33 +08:00
|
|
|
tprintf("Error: Could not open file %s\n", box_fname.c_str());
|
2018-07-03 02:34:18 +08:00
|
|
|
ASSERT_HOST(box_file);
|
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
|
|
|
|
PAGE_RES_IT page_res_it;
|
|
|
|
page_res_it.page_res = page_res;
|
|
|
|
page_res_it.restart_page();
|
2012-02-02 10:59:49 +08:00
|
|
|
STRING label;
|
2010-11-24 02:34:14 +08:00
|
|
|
|
|
|
|
// Process all the words on this page.
|
|
|
|
TBOX tbox; // tesseract-identified box
|
|
|
|
TBOX bbox; // box from the box file
|
|
|
|
bool keep_going;
|
|
|
|
int line_number = 0;
|
2012-02-02 10:59:49 +08:00
|
|
|
int examined_words = 0;
|
2010-11-24 02:34:14 +08:00
|
|
|
do {
|
|
|
|
keep_going = read_t(&page_res_it, &tbox);
|
2019-03-30 20:29:03 +08:00
|
|
|
keep_going &=
|
|
|
|
ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
|
2010-11-24 02:34:14 +08:00
|
|
|
// Align bottom left points of the TBOXes.
|
|
|
|
while (keep_going &&
|
|
|
|
!NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
|
2014-08-12 07:23:06 +08:00
|
|
|
if (bbox.bottom() < tbox.bottom()) {
|
|
|
|
page_res_it.forward();
|
|
|
|
keep_going = read_t(&page_res_it, &tbox);
|
|
|
|
} else {
|
2019-03-30 20:29:03 +08:00
|
|
|
keep_going =
|
|
|
|
ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
|
2014-08-12 07:23:06 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
|
|
|
while (keep_going &&
|
|
|
|
!NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
|
2014-08-12 07:23:06 +08:00
|
|
|
if (bbox.left() > tbox.left()) {
|
|
|
|
page_res_it.forward();
|
|
|
|
keep_going = read_t(&page_res_it, &tbox);
|
|
|
|
} else {
|
2019-03-30 20:29:03 +08:00
|
|
|
keep_going =
|
|
|
|
ReadNextBox(applybox_page, &line_number, box_file, &label, &bbox);
|
2014-08-12 07:23:06 +08:00
|
|
|
}
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
|
|
|
// OCR the word if top right points of the TBOXes are similar.
|
|
|
|
if (keep_going &&
|
|
|
|
NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
|
|
|
|
NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
|
2019-09-23 14:12:33 +08:00
|
|
|
ambigs_classify_and_output(label.c_str(), &page_res_it, output_file);
|
2019-03-30 20:29:03 +08:00
|
|
|
examined_words++;
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
2014-08-12 07:23:06 +08:00
|
|
|
page_res_it.forward();
|
2010-11-24 02:34:14 +08:00
|
|
|
} while (keep_going);
|
2012-02-02 10:59:49 +08:00
|
|
|
|
|
|
|
// Set up scripts on all of the words that did not get sent to
|
|
|
|
// ambigs_classify_and_output. They all should have, but if all the
|
|
|
|
// werd_res's don't get uch_sets, tesseract will crash when you try
|
|
|
|
// to iterate over them. :-(
|
|
|
|
int total_words = 0;
|
2016-12-13 00:19:35 +08:00
|
|
|
for (page_res_it.restart_page(); page_res_it.block() != nullptr;
|
2012-02-02 10:59:49 +08:00
|
|
|
page_res_it.forward()) {
|
|
|
|
if (page_res_it.word()) {
|
2016-12-13 00:19:35 +08:00
|
|
|
if (page_res_it.word()->uch_set == nullptr)
|
2012-02-02 10:59:49 +08:00
|
|
|
page_res_it.word()->SetupFake(unicharset);
|
|
|
|
total_words++;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
if (examined_words < 0.85 * total_words) {
|
2019-03-30 20:29:03 +08:00
|
|
|
tprintf(
|
|
|
|
"TODO(antonova): clean up recog_training_segmented; "
|
|
|
|
" It examined only a small fraction of the ambigs image.\n");
|
2012-02-02 10:59:49 +08:00
|
|
|
}
|
2019-03-30 20:29:03 +08:00
|
|
|
tprintf("recog_training_segmented: examined %d / %d words.\n", examined_words,
|
|
|
|
total_words);
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
// Helper prints the given set of blob choices.
|
|
|
|
static void PrintPath(int length, const BLOB_CHOICE** blob_choices,
|
2019-03-30 20:29:03 +08:00
|
|
|
const UNICHARSET& unicharset, const char* label,
|
|
|
|
FILE* output_file) {
|
2013-09-23 23:26:50 +08:00
|
|
|
float rating = 0.0f;
|
|
|
|
float certainty = 0.0f;
|
|
|
|
for (int i = 0; i < length; ++i) {
|
|
|
|
const BLOB_CHOICE* blob_choice = blob_choices[i];
|
|
|
|
fprintf(output_file, "%s",
|
2019-03-30 20:29:03 +08:00
|
|
|
unicharset.id_to_unichar(blob_choice->unichar_id()));
|
2013-09-23 23:26:50 +08:00
|
|
|
rating += blob_choice->rating();
|
|
|
|
if (certainty > blob_choice->certainty())
|
|
|
|
certainty = blob_choice->certainty();
|
|
|
|
}
|
2019-03-30 20:29:03 +08:00
|
|
|
fprintf(output_file, "\t%s\t%.4f\t%.4f\n", label, rating, certainty);
|
2013-09-23 23:26:50 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Helper recursively prints all paths through the ratings matrix, starting
|
|
|
|
// at column col.
|
2019-03-30 20:29:03 +08:00
|
|
|
static void PrintMatrixPaths(int col, int dim, const MATRIX& ratings,
|
2013-09-23 23:26:50 +08:00
|
|
|
int length, const BLOB_CHOICE** blob_choices,
|
2019-03-30 20:29:03 +08:00
|
|
|
const UNICHARSET& unicharset, const char* label,
|
|
|
|
FILE* output_file) {
|
2013-09-23 23:26:50 +08:00
|
|
|
for (int row = col; row < dim && row - col < ratings.bandwidth(); ++row) {
|
|
|
|
if (ratings.get(col, row) != NOT_CLASSIFIED) {
|
|
|
|
BLOB_CHOICE_IT bc_it(ratings.get(col, row));
|
|
|
|
for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
|
|
|
|
blob_choices[length] = bc_it.data();
|
|
|
|
if (row + 1 < dim) {
|
|
|
|
PrintMatrixPaths(row + 1, dim, ratings, length + 1, blob_choices,
|
|
|
|
unicharset, label, output_file);
|
|
|
|
} else {
|
|
|
|
PrintPath(length + 1, blob_choices, unicharset, label, output_file);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-11-24 02:34:14 +08:00
|
|
|
// Runs classify_word_pass1() on the current word. Outputs Tesseract's
|
|
|
|
// raw choice as a result of the classification. For words labeled with a
|
|
|
|
// single unichar also outputs all alternatives from blob_choices of the
|
|
|
|
// best choice.
|
2019-03-30 20:29:03 +08:00
|
|
|
void Tesseract::ambigs_classify_and_output(const char* label,
|
2014-08-12 07:23:06 +08:00
|
|
|
PAGE_RES_IT* pr_it,
|
2019-03-30 20:29:03 +08:00
|
|
|
FILE* output_file) {
|
2010-11-24 02:34:14 +08:00
|
|
|
// Classify word.
|
2012-02-02 10:59:49 +08:00
|
|
|
fflush(stdout);
|
2014-08-12 07:23:06 +08:00
|
|
|
WordData word_data(*pr_it);
|
2013-11-09 04:30:56 +08:00
|
|
|
SetupWordPassN(1, &word_data);
|
2015-05-13 07:47:02 +08:00
|
|
|
classify_word_and_language(1, pr_it, &word_data);
|
2014-08-12 07:23:06 +08:00
|
|
|
WERD_RES* werd_res = word_data.word;
|
2019-03-30 20:29:03 +08:00
|
|
|
WERD_CHOICE* best_choice = werd_res->best_choice;
|
2016-12-13 00:19:35 +08:00
|
|
|
ASSERT_HOST(best_choice != nullptr);
|
2010-11-24 02:34:14 +08:00
|
|
|
|
|
|
|
// Compute the number of unichars in the label.
|
2013-09-23 23:26:50 +08:00
|
|
|
GenericVector<UNICHAR_ID> encoding;
|
2016-12-13 00:19:35 +08:00
|
|
|
if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
|
2010-11-24 02:34:14 +08:00
|
|
|
tprintf("Not outputting illegal unichar %s\n", label);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
|
2013-09-23 23:26:50 +08:00
|
|
|
// Dump all paths through the ratings matrix (which is normally small).
|
|
|
|
int dim = werd_res->ratings->dimension();
|
2019-03-26 14:55:08 +08:00
|
|
|
const auto** blob_choices = new const BLOB_CHOICE*[dim];
|
2019-03-30 20:29:03 +08:00
|
|
|
PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices, unicharset,
|
|
|
|
label, output_file);
|
|
|
|
delete[] blob_choices;
|
2010-11-24 02:34:14 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
} // namespace tesseract
|