Extend function BoxFileName to handle more common image names

The function derives the file name for the .box file from an image name.

For training from existing line images, it is useful to directly support
the image names which are commonly used.

While generated images for Tesseract training typically use the name
pattern NAME.tif, other ground truth sets use NAME.bin.png for binarized
or NAME.nrm.png for grayscale images.

BoxFileName is now also a file-local (static) function and is no longer
declared in boxread.h, because it is only used within its own source file.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
Stefan Weil 2019-10-02 19:53:25 +02:00
parent 84c410a8e3
commit 7a7704bc94
2 changed files with 21 additions and 18 deletions

View File

@ -17,9 +17,10 @@
**********************************************************************/
#include "boxread.h"
#include <cstring> // for strchr, strcmp, strrchr
#include <cstring> // for strchr, strcmp
#include <locale> // for std::locale::classic
#include <sstream> // for std::stringstream
#include <string> // for std::string
#include "errcode.h" // for ERRCODE, TESSEXIT
#include "fileerr.h" // for CANTOPENFILE
#include "genericvector.h" // for GenericVector
@ -32,9 +33,26 @@
// Special char code used to identify multi-blob labels.
static const char* kMultiBlobLabelCode = "WordStr";
// Returns the box file name corresponding to the given image_filename.
//
// The image extension is replaced by ".box":
//   NAME.tif      -> NAME.box
//   NAME.bin.png  -> NAME.box  (binarized line image)
//   NAME.nrm.png  -> NAME.box  (grayscale line image)
// A name without any extension simply gets ".box" appended.
//
// Only the last path component is searched for an extension, so a '.' in a
// directory name (e.g. "set.v2/line0001") never truncates the path. Both
// '/' and '\\' are treated as path separators.
//
// image_filename must be a valid NUL-terminated string (not nullptr).
static std::string BoxFileName(const char* image_filename) {
  std::string box_filename = image_filename;
  const size_t length = box_filename.length();
  // Both recognized double extensions have the same length.
  const size_t kDoubleExtLength = 8;
  bool stripped = false;
  // Require at least one character in front of the double extension.
  if (length > kDoubleExtLength) {
    const size_t ext_pos = length - kDoubleExtLength;
    if (box_filename.compare(ext_pos, kDoubleExtLength, ".bin.png") == 0 ||
        box_filename.compare(ext_pos, kDoubleExtLength, ".nrm.png") == 0) {
      box_filename.resize(ext_pos);
      stripped = true;
    }
  }
  if (!stripped) {
    const size_t lastdot = box_filename.find_last_of('.');
    const size_t lastsep = box_filename.find_last_of("/\\");
    // Strip a simple extension only when the dot belongs to the file name
    // itself, i.e. it comes after the last path separator (if any).
    if (lastdot != std::string::npos &&
        (lastsep == std::string::npos || lastdot > lastsep)) {
      box_filename.resize(lastdot);
    }
  }
  box_filename += ".box";
  return box_filename;
}
// Open the boxfile based on the given image filename.
FILE* OpenBoxFile(const STRING& fname) {
STRING filename = BoxFileName(fname);
std::string filename = BoxFileName(fname.c_str());
FILE* box_file = nullptr;
if (!(box_file = fopen(filename.c_str(), "rb"))) {
CANTOPENFILE.error("read_next_box", TESSEXIT, "Can't open box file %s",
@ -56,7 +74,7 @@ bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING& filename,
GenericVector<STRING>* box_texts,
GenericVector<int>* pages) {
GenericVector<char> box_data;
if (!tesseract::LoadDataFromFile(BoxFileName(filename).c_str(), &box_data))
if (!tesseract::LoadDataFromFile(BoxFileName(filename.c_str()).c_str(), &box_data))
return false;
// Convert the array of bytes to a string, so it can be used by the parser.
box_data.push_back('\0');
@ -102,17 +120,6 @@ bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
return num_boxes > 0;
}
// Derives the name of the box file that belongs to image_filename:
// everything from the last '.' onwards (if there is one) is dropped and
// ".box" is appended to what remains.
STRING BoxFileName(const STRING& image_filename) {
  STRING result = image_filename;
  const char* const name_start = result.c_str();
  const char* const extension = strrchr(name_start, '.');
  if (extension != nullptr) {
    // Cut the name off right before the final dot.
    result.truncate_at(extension - name_start);
  }
  result += ".box";
  return result;
}
// TODO(rays) convert all uses of ReadNextBox to use the new ReadAllBoxes.
// Box files are used ONLY DURING TRAINING, but by both processes of
// creating tr files with tesseract, and unicharset_extractor.

View File

@ -2,7 +2,6 @@
* File: boxread.h
* Description: Read data from a box file.
* Author: Ray Smith
* Created: Fri Aug 24 17:47:23 PDT 2007
*
* (C) Copyright 2007, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
@ -59,9 +58,6 @@ bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
GenericVector<STRING>* box_texts,
GenericVector<int>* pages);
// Returns the box file name corresponding to the given image_filename.
STRING BoxFileName(const STRING& image_filename);
// ReadNextBox factors out the code to interpret a line of a box
// file so that applybox and unicharset_extractor interpret the same way.
// This function returns the next valid box file utf8 string and coords