mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-23 02:24:09 +08:00
Merge pull request #2686 from stweil/boxfilename
Extend function BoxFileName to handle more common image names
This commit is contained in:
commit
6d171b889c
@ -17,9 +17,10 @@
|
|||||||
**********************************************************************/
|
**********************************************************************/
|
||||||
|
|
||||||
#include "boxread.h"
|
#include "boxread.h"
|
||||||
#include <cstring> // for strchr, strcmp, strrchr
|
#include <cstring> // for strchr, strcmp
|
||||||
#include <locale> // for std::locale::classic
|
#include <locale> // for std::locale::classic
|
||||||
#include <sstream> // for std::stringstream
|
#include <sstream> // for std::stringstream
|
||||||
|
#include <string> // for std::string
|
||||||
#include "errcode.h" // for ERRCODE, TESSEXIT
|
#include "errcode.h" // for ERRCODE, TESSEXIT
|
||||||
#include "fileerr.h" // for CANTOPENFILE
|
#include "fileerr.h" // for CANTOPENFILE
|
||||||
#include "genericvector.h" // for GenericVector
|
#include "genericvector.h" // for GenericVector
|
||||||
@ -32,9 +33,26 @@
|
|||||||
// Special char code used to identify multi-blob labels.
|
// Special char code used to identify multi-blob labels.
|
||||||
static const char* kMultiBlobLabelCode = "WordStr";
|
static const char* kMultiBlobLabelCode = "WordStr";
|
||||||
|
|
||||||
|
// Returns the box file name corresponding to the given image_filename.
//
// The image extension is replaced by ".box". The double extensions
// ".bin.png" and ".nrm.png" are removed as a whole, so "page.bin.png"
// maps to "page.box" rather than "page.bin.box".
//
// Only a dot inside the last path component starts an extension. A name
// without extension just gets ".box" appended, even when one of its parent
// directories contains a dot ("data.v1/image" -> "data.v1/image.box").
// A null image_filename is treated like an empty name and yields ".box".
static std::string BoxFileName(const char* image_filename) {
  // std::string must never be constructed from a null pointer.
  std::string box_filename = (image_filename != nullptr) ? image_filename : "";
  const size_t length = box_filename.length();
  // Candidate double extension: the trailing 8 characters, if the name is
  // longer than that.
  const std::string last =
      (length > 8) ? box_filename.substr(length - 8) : "";
  if (last == ".bin.png" || last == ".nrm.png") {
    box_filename.resize(length - 8);
  } else {
    const size_t lastdot = box_filename.find_last_of('.');
    // Accept both separators so that Windows style paths are handled too.
    const size_t lastsep = box_filename.find_last_of("/\\");
    // Strip the extension only if the dot belongs to the file name itself
    // and not to a directory earlier in the path.
    if (lastdot != std::string::npos &&
        (lastsep == std::string::npos || lastdot > lastsep)) {
      box_filename.resize(lastdot);
    }
  }
  box_filename += ".box";
  return box_filename;
}
|
||||||
|
|
||||||
// Open the boxfile based on the given image filename.
|
// Open the boxfile based on the given image filename.
|
||||||
FILE* OpenBoxFile(const STRING& fname) {
|
FILE* OpenBoxFile(const STRING& fname) {
|
||||||
STRING filename = BoxFileName(fname);
|
std::string filename = BoxFileName(fname.c_str());
|
||||||
FILE* box_file = nullptr;
|
FILE* box_file = nullptr;
|
||||||
if (!(box_file = fopen(filename.c_str(), "rb"))) {
|
if (!(box_file = fopen(filename.c_str(), "rb"))) {
|
||||||
CANTOPENFILE.error("read_next_box", TESSEXIT, "Can't open box file %s",
|
CANTOPENFILE.error("read_next_box", TESSEXIT, "Can't open box file %s",
|
||||||
@ -56,7 +74,7 @@ bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING& filename,
|
|||||||
GenericVector<STRING>* box_texts,
|
GenericVector<STRING>* box_texts,
|
||||||
GenericVector<int>* pages) {
|
GenericVector<int>* pages) {
|
||||||
GenericVector<char> box_data;
|
GenericVector<char> box_data;
|
||||||
if (!tesseract::LoadDataFromFile(BoxFileName(filename).c_str(), &box_data))
|
if (!tesseract::LoadDataFromFile(BoxFileName(filename.c_str()).c_str(), &box_data))
|
||||||
return false;
|
return false;
|
||||||
// Convert the array of bytes to a string, so it can be used by the parser.
|
// Convert the array of bytes to a string, so it can be used by the parser.
|
||||||
box_data.push_back('\0');
|
box_data.push_back('\0');
|
||||||
@ -102,17 +120,6 @@ bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
|
|||||||
return num_boxes > 0;
|
return num_boxes > 0;
|
||||||
}
|
}
|
||||||
|
|
||||||
// Returns the box file name corresponding to the given image_filename.
|
|
||||||
STRING BoxFileName(const STRING& image_filename) {
|
|
||||||
STRING box_filename = image_filename;
|
|
||||||
const char *lastdot = strrchr(box_filename.c_str(), '.');
|
|
||||||
if (lastdot != nullptr)
|
|
||||||
box_filename.truncate_at(lastdot - box_filename.c_str());
|
|
||||||
|
|
||||||
box_filename += ".box";
|
|
||||||
return box_filename;
|
|
||||||
}
|
|
||||||
|
|
||||||
// TODO(rays) convert all uses of ReadNextBox to use the new ReadAllBoxes.
|
// TODO(rays) convert all uses of ReadNextBox to use the new ReadAllBoxes.
|
||||||
// Box files are used ONLY DURING TRAINING, but by both processes of
|
// Box files are used ONLY DURING TRAINING, but by both processes of
|
||||||
// creating tr files with tesseract, and unicharset_extractor.
|
// creating tr files with tesseract, and unicharset_extractor.
|
||||||
|
@ -2,7 +2,6 @@
|
|||||||
* File: boxread.h
|
* File: boxread.h
|
||||||
* Description: Read data from a box file.
|
* Description: Read data from a box file.
|
||||||
* Author: Ray Smith
|
* Author: Ray Smith
|
||||||
* Created: Fri Aug 24 17:47:23 PDT 2007
|
|
||||||
*
|
*
|
||||||
* (C) Copyright 2007, Google Inc.
|
* (C) Copyright 2007, Google Inc.
|
||||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
@ -59,9 +58,6 @@ bool ReadMemBoxes(int target_page, bool skip_blanks, const char* box_data,
|
|||||||
GenericVector<STRING>* box_texts,
|
GenericVector<STRING>* box_texts,
|
||||||
GenericVector<int>* pages);
|
GenericVector<int>* pages);
|
||||||
|
|
||||||
// Returns the box file name corresponding to the given image_filename.
|
|
||||||
STRING BoxFileName(const STRING& image_filename);
|
|
||||||
|
|
||||||
// ReadNextBox factors out the code to interpret a line of a box
|
// ReadNextBox factors out the code to interpret a line of a box
|
||||||
// file so that applybox and unicharset_extractor interpret the same way.
|
// file so that applybox and unicharset_extractor interpret the same way.
|
||||||
// This function returns the next valid box file utf8 string and coords
|
// This function returns the next valid box file utf8 string and coords
|
||||||
|
Loading…
Reference in New Issue
Block a user