Merge pull request #2686 from stweil/boxfilename

Extend function BoxFileName to handle more common image names
This commit is contained in:
zdenop 2019-10-05 16:53:34 +02:00 committed by GitHub
commit 6d171b889c
No known key found for this signature in database
GPG Key ID: 4AEE18F83AFDEB23
2 changed files with 21 additions and 18 deletions

View File

@ -17,9 +17,10 @@
**********************************************************************/
#include "boxread.h"
#include <cstring> // for strchr, strcmp, strrchr
#include <cstring> // for strchr, strcmp
#include <locale> // for std::locale::classic
#include <sstream> // for std::stringstream
#include <string> // for std::string
#include "errcode.h" // for ERRCODE, TESSEXIT
#include "fileerr.h" // for CANTOPENFILE
#include "genericvector.h" // for GenericVector
@ -32,9 +33,26 @@
// Special char code used to identify multi-blob labels.
static const char* kMultiBlobLabelCode = "WordStr";
// Returns the box file name corresponding to the given image_filename.
//
// The name is derived as follows:
//  * a trailing ".bin.png" or ".nrm.png" is removed as a whole, so that
//    "page.bin.png" maps to "page.box" and not to "page.bin.box";
//  * otherwise the extension of the file name (the text from its last '.'
//    on) is removed, if there is one;
//  * finally ".box" is appended.
//
// Only a dot in the last path component counts as the start of an extension,
// so "scans.v2/image" maps to "scans.v2/image.box". Both '/' and '\\' are
// accepted as path separators. A null image_filename is treated like an
// empty string and yields ".box".
static std::string BoxFileName(const char* image_filename) {
  // Constructing a std::string from a null pointer is undefined behaviour.
  std::string box_filename = (image_filename != nullptr) ? image_filename : "";
  const size_t length = box_filename.length();
  const std::string last =
      (length > 8) ? box_filename.substr(length - 8) : std::string();
  if (last == ".bin.png" || last == ".nrm.png") {
    // Double extension: strip both parts in one go.
    box_filename.resize(length - 8);
  } else {
    const size_t lastsep = box_filename.find_last_of("/\\");
    const size_t lastdot = box_filename.find_last_of('.');
    // A dot located before the last separator belongs to a directory name
    // and must not be mistaken for the file's extension.
    const bool dot_in_basename =
        lastdot != std::string::npos &&
        (lastsep == std::string::npos || lastdot > lastsep);
    if (dot_in_basename) {
      box_filename.resize(lastdot);
    }
  }
  box_filename += ".box";
  return box_filename;
}
// Open the boxfile based on the given image filename.
FILE* OpenBoxFile(const STRING& fname) {
STRING filename = BoxFileName(fname);
std::string filename = BoxFileName(fname.c_str());
FILE* box_file = nullptr;
if (!(box_file = fopen(filename.c_str(), "rb"))) {
CANTOPENFILE.error("read_next_box", TESSEXIT, "Can't open box file %s",
@ -56,7 +74,7 @@ bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING& filename,
GenericVector<STRING>* box_texts,
GenericVector<int>* pages) {
GenericVector<char> box_data;
if (!tesseract::LoadDataFromFile(BoxFileName(filename).c_str(), &box_data))
if (!tesseract::LoadDataFromFile(BoxFileName(filename.c_str()).c_str(), &box_data))
return false;
// Convert the array of bytes to a string, so it can be used by the parser.
box_data.push_back('\0');
@ -102,17 +120,6 @@ bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
return num_boxes > 0;
}
// Returns the box file name corresponding to the given image_filename.
// The extension of the file name (the text from its last '.' on) is cut
// off, if there is one, and ".box" is appended.
// Only a dot in the last path component counts as the start of an extension:
// "scans.v2/image" maps to "scans.v2/image.box", not to "scans.box".
// Both '/' and '\\' are accepted as path separators.
STRING BoxFileName(const STRING& image_filename) {
  STRING box_filename = image_filename;
  const char* name = box_filename.c_str();
  const char* lastdot = strrchr(name, '.');
  // If a path separator follows the dot, the dot belongs to a directory
  // name and the file itself has no extension to remove.
  if (lastdot != nullptr && strpbrk(lastdot, "/\\") == nullptr)
    box_filename.truncate_at(lastdot - name);
  box_filename += ".box";
  return box_filename;
}
// TODO(rays) convert all uses of ReadNextBox to use the new ReadAllBoxes.
// Box files are used ONLY DURING TRAINING, but by both processes of
// creating tr files with tesseract, and unicharset_extractor.

View File

@ -2,7 +2,6 @@
* File: boxread.h
* Description: Read data from a box file.
* Author: Ray Smith
* Created: Fri Aug 24 17:47:23 PDT 2007
*
* (C) Copyright 2007, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
@ -59,9 +58,6 @@ bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
GenericVector<STRING>* box_texts,
GenericVector<int>* pages);
// Returns the box file name corresponding to the given image_filename.
// The name is formed by dropping the image file's extension (the text from
// the last '.' on), if there is one, and appending ".box".
STRING BoxFileName(const STRING& image_filename);
// ReadNextBox factors out the code to interpret a line of a box
// file so that applybox and unicharset_extractor interpret the same way.
// This function returns the next valid box file utf8 string and coords