From 8cb677d6a25134d521c4c086114de64104c9bd98 Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Sun, 22 Sep 2019 19:48:53 +0200 Subject: [PATCH 1/2] Replace STRING arguments for LoadDataFromFile and SaveDataToFile This is a step to eliminate the proprietary STRING data type from the public Tesseract API. Signed-off-by: Stefan Weil --- src/ccstruct/boxread.cpp | 2 +- src/ccutil/genericvector.h | 13 ++++--------- src/ccutil/serialis.cpp | 9 ++++----- src/ccutil/serialis.h | 4 ++-- src/ccutil/tessdatamanager.cpp | 6 +++--- src/training/lstmtester.cpp | 2 +- src/training/lstmtrainer.cpp | 4 ++-- 7 files changed, 17 insertions(+), 23 deletions(-) diff --git a/src/ccstruct/boxread.cpp b/src/ccstruct/boxread.cpp index a5212e19..aa65cf1b 100644 --- a/src/ccstruct/boxread.cpp +++ b/src/ccstruct/boxread.cpp @@ -56,7 +56,7 @@ bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING& filename, GenericVector* box_texts, GenericVector* pages) { GenericVector box_data; - if (!tesseract::LoadDataFromFile(BoxFileName(filename), &box_data)) + if (!tesseract::LoadDataFromFile(BoxFileName(filename).c_str(), &box_data)) return false; // Convert the array of bytes to a string, so it can be used by the parser. box_data.push_back('\0'); diff --git a/src/ccutil/genericvector.h b/src/ccutil/genericvector.h index 55384d04..6e79d548 100644 --- a/src/ccutil/genericvector.h +++ b/src/ccutil/genericvector.h @@ -361,16 +361,11 @@ inline bool LoadDataFromFile(const char* filename, GenericVector* data) { return result; } -inline bool LoadDataFromFile(const STRING& filename, - GenericVector* data) { - return LoadDataFromFile(filename.string(), data); -} - // The default FileWriter writes the vector of char to the filename file, // returning false on error. 
inline bool SaveDataToFile(const GenericVector<char>& data, - const STRING& filename) { - FILE* fp = fopen(filename.string(), "wb"); + const char* filename) { + FILE* fp = fopen(filename, "wb"); if (fp == nullptr) { return false; } @@ -380,10 +375,10 @@ inline bool SaveDataToFile(const GenericVector<char>& data, return result; } // Reads a file as a vector of STRING. -inline bool LoadFileLinesToStrings(const STRING& filename, +inline bool LoadFileLinesToStrings(const char* filename, GenericVector<STRING>* lines) { GenericVector<char> data; - if (!LoadDataFromFile(filename.string(), &data)) { + if (!LoadDataFromFile(filename, &data)) { return false; } STRING lines_str(&data[0], data.size()); lines_str.split('\n', lines); return true; diff --git a/src/ccutil/serialis.cpp b/src/ccutil/serialis.cpp index 6faeb6f0..b740a5b5 100644 --- a/src/ccutil/serialis.cpp +++ b/src/ccutil/serialis.cpp @@ -2,7 +2,6 @@ * File: serialis.cpp (Formerly serialmac.h) * Description: Inline routines and macros for serialisation functions * Author: Phil Cheatle - * Created: Tue Oct 08 08:33:12 BST 1991 * * (C) Copyright 1990, Hewlett-Packard Ltd. 
** Licensed under the Apache License, Version 2.0 (the "License"); @@ -202,9 +201,9 @@ bool TFile::Open(const STRING& filename, FileReader reader) { is_writing_ = false; swap_ = false; if (reader == nullptr) - return LoadDataFromFile(filename, data_); + return LoadDataFromFile(filename.c_str(), data_); else - return (*reader)(filename, data_); + return (*reader)(filename.c_str(), data_); } bool TFile::Open(const char* data, int size) { @@ -310,9 +309,9 @@ void TFile::OpenWrite(GenericVector* data) { bool TFile::CloseWrite(const STRING& filename, FileWriter writer) { ASSERT_HOST(is_writing_); if (writer == nullptr) - return SaveDataToFile(*data_, filename); + return SaveDataToFile(*data_, filename.c_str()); else - return (*writer)(*data_, filename); + return (*writer)(*data_, filename.c_str()); } int TFile::FWrite(const void* buffer, size_t size, int count) { diff --git a/src/ccutil/serialis.h b/src/ccutil/serialis.h index 604f44c2..076d28e7 100644 --- a/src/ccutil/serialis.h +++ b/src/ccutil/serialis.h @@ -46,10 +46,10 @@ constexpr size_t countof(T const (&)[N]) noexcept { // Function to read a GenericVector from a whole file. // Returns false on failure. -using FileReader = bool (*)(const STRING&, GenericVector*); +using FileReader = bool (*)(const char* filename, GenericVector* data); // Function to write a GenericVector to a whole file. // Returns false on failure. -using FileWriter = bool (*)(const GenericVector&, const STRING&); +using FileWriter = bool (*)(const GenericVector& data, const char* filename); // Deserialize data from file. 
bool DeSerialize(FILE* fp, char* data, size_t n = 1); diff --git a/src/ccutil/tessdatamanager.cpp b/src/ccutil/tessdatamanager.cpp index ef83ba9f..ecd400c1 100644 --- a/src/ccutil/tessdatamanager.cpp +++ b/src/ccutil/tessdatamanager.cpp @@ -157,9 +157,9 @@ bool TessdataManager::SaveFile(const STRING &filename, GenericVector<char> data; Serialize(&data); if (writer == nullptr) - return SaveDataToFile(data, filename); + return SaveDataToFile(data, filename.c_str()); else - return (*writer)(data, filename); + return (*writer)(data, filename.c_str()); } // Serializes to the given vector. @@ -253,7 +253,7 @@ bool TessdataManager::CombineDataFiles( FILE *fp = fopen(filename.string(), "rb"); if (fp != nullptr) { fclose(fp); - if (!LoadDataFromFile(filename, &entries_[type])) { + if (!LoadDataFromFile(filename.c_str(), &entries_[type])) { tprintf("Load of file %s failed!\n", filename.string()); return false; } diff --git a/src/training/lstmtester.cpp b/src/training/lstmtester.cpp index 045820d8..826039df 100644 --- a/src/training/lstmtester.cpp +++ b/src/training/lstmtester.cpp @@ -29,7 +29,7 @@ LSTMTester::LSTMTester(int64_t max_memory) // loaded. The arg is a filename of a file that lists the filenames. 
bool LSTMTester::LoadAllEvalData(const STRING& filenames_file) { GenericVector filenames; - if (!LoadFileLinesToStrings(filenames_file, &filenames)) { + if (!LoadFileLinesToStrings(filenames_file.c_str(), &filenames)) { tprintf("Failed to load list of eval filenames from %s\n", filenames_file.string()); return false; diff --git a/src/training/lstmtrainer.cpp b/src/training/lstmtrainer.cpp index 78bf8ba0..5185a78e 100644 --- a/src/training/lstmtrainer.cpp +++ b/src/training/lstmtrainer.cpp @@ -320,7 +320,7 @@ bool LSTMTrainer::MaintainCheckpoints(TestCallback tester, STRING* log_msg) { SaveTrainingDump(NO_BEST_TRAINER, this, &best_trainer_); if (error_rate < error_rate_of_last_saved_best_ * kBestCheckpointFraction) { STRING best_model_name = DumpFilename(); - if (!SaveDataToFile(best_trainer_, best_model_name)) { + if (!SaveDataToFile(best_trainer_, best_model_name.c_str())) { *log_msg += " failed to write best model:"; } else { *log_msg += " wrote best model:"; @@ -358,7 +358,7 @@ bool LSTMTrainer::MaintainCheckpoints(TestCallback tester, STRING* log_msg) { // Write a current checkpoint. GenericVector checkpoint; if (!SaveTrainingDump(FULL, this, &checkpoint) || - !SaveDataToFile(checkpoint, checkpoint_name_)) { + !SaveDataToFile(checkpoint, checkpoint_name_.c_str())) { *log_msg += " failed to write checkpoint."; } else { *log_msg += " wrote checkpoint."; From a730b5c4ff6683483573080e3dfc94aa093af59e Mon Sep 17 00:00:00 2001 From: Stefan Weil Date: Sun, 22 Sep 2019 20:31:37 +0200 Subject: [PATCH 2/2] Remove STRING from the public Tesseract API Removing STRING from genericvector.h allows eliminating the proprietary STRING data type from the public Tesseract API. 
Signed-off-by: Stefan Weil --- src/api/renderer.h | 1 + src/ccutil/Makefile.am | 7 ++++--- src/ccutil/genericvector.h | 14 ++------------ src/ccutil/serialis.cpp | 1 + src/ccutil/tessdatamanager.h | 2 +- src/lstm/network.h | 1 + src/training/fileio.h | 16 +++++++++++++++- src/training/lstmtester.cpp | 1 + src/training/lstmtraining.cpp | 1 + 9 files changed, 27 insertions(+), 17 deletions(-) diff --git a/src/api/renderer.h b/src/api/renderer.h index 1fa1534c..177fd5ff 100644 --- a/src/api/renderer.h +++ b/src/api/renderer.h @@ -24,6 +24,7 @@ #include // for std::string #include "genericvector.h" #include "platform.h" +#include "strngs.h" // for STRING struct Pix; diff --git a/src/ccutil/Makefile.am b/src/ccutil/Makefile.am index a5098966..b7265af5 100644 --- a/src/ccutil/Makefile.am +++ b/src/ccutil/Makefile.am @@ -16,7 +16,7 @@ endif pkginclude_HEADERS = \ genericvector.h helpers.h \ - ocrclass.h platform.h serialis.h strngs.h \ + ocrclass.h platform.h serialis.h \ unichar.h noinst_HEADERS = \ @@ -25,8 +25,9 @@ noinst_HEADERS = \ genericheap.h globaloc.h host.h \ kdpair.h lsterr.h \ object_cache.h params.h qrsequence.h sorthelper.h \ - scanutils.h tessdatamanager.h tprintf.h \ - unicharcompress.h unicharmap.h unicharset.h unicity_table.h unicodes.h + scanutils.h strngs.h \ + tessdatamanager.h tprintf.h \ + unicharcompress.h unicharmap.h unicharset.h unicity_table.h unicodes.h if !DISABLED_LEGACY_ENGINE noinst_HEADERS += ambigs.h diff --git a/src/ccutil/genericvector.h b/src/ccutil/genericvector.h index 6e79d548..5f1d44c6 100644 --- a/src/ccutil/genericvector.h +++ b/src/ccutil/genericvector.h @@ -21,13 +21,14 @@ #include #include +#include // for LONG_MAX +#include // for uint32_t #include #include #include // for std::function #include "helpers.h" #include "serialis.h" -#include "strngs.h" // Use PointerVector below in preference to GenericVector, as that // provides automatic deletion of pointers, [De]Serialize that works, and @@ -374,17 +375,6 @@ inline bool 
SaveDataToFile(const GenericVector& data, fclose(fp); return result; } -// Reads a file as a vector of STRING. -inline bool LoadFileLinesToStrings(const char* filename, - GenericVector* lines) { - GenericVector data; - if (!LoadDataFromFile(filename, &data)) { - return false; - } - STRING lines_str(&data[0], data.size()); - lines_str.split('\n', lines); - return true; -} template bool cmp_eq(T const& t1, T const& t2) { diff --git a/src/ccutil/serialis.cpp b/src/ccutil/serialis.cpp index b740a5b5..cca47029 100644 --- a/src/ccutil/serialis.cpp +++ b/src/ccutil/serialis.cpp @@ -20,6 +20,7 @@ #include #include "errcode.h" #include "genericvector.h" +#include "strngs.h" // for STRING namespace tesseract { diff --git a/src/ccutil/tessdatamanager.h b/src/ccutil/tessdatamanager.h index 4372f291..3f6da32c 100644 --- a/src/ccutil/tessdatamanager.h +++ b/src/ccutil/tessdatamanager.h @@ -2,7 +2,6 @@ // File: tessdatamanager.h // Description: Functions to handle loading/combining tesseract data files. // Author: Daria Antonova -// Created: Wed Jun 03 11:26:43 PST 2009 // // (C) Copyright 2009, Google Inc. // Licensed under the Apache License, Version 2.0 (the "License"); @@ -21,6 +20,7 @@ #define TESSERACT_CCUTIL_TESSDATAMANAGER_H_ #include "genericvector.h" +#include "strngs.h" // for STRING static const char kTrainedDataSuffix[] = "traineddata"; diff --git a/src/lstm/network.h b/src/lstm/network.h index 24e047d6..f7289b84 100644 --- a/src/lstm/network.h +++ b/src/lstm/network.h @@ -27,6 +27,7 @@ #include "networkio.h" #include "serialis.h" #include "static_shape.h" +#include "strngs.h" // for STRING #include "tprintf.h" struct Pix; diff --git a/src/training/fileio.h b/src/training/fileio.h index 43435ce4..88e20a8b 100644 --- a/src/training/fileio.h +++ b/src/training/fileio.h @@ -2,7 +2,6 @@ * File: fileio.h * Description: File I/O utilities. * Author: Samuel Charron - * Created: Tuesday, July 9, 2013 * * (C) Copyright 2013, Google Inc. 
* Licensed under the Apache License, Version 2.0 (the "License"); you may not @@ -21,10 +20,25 @@ #include #include +#include "genericvector.h" // for GenericVector #include "platform.h" +#include "strngs.h" // for STRING namespace tesseract { +// Reads a file as a vector of STRING. +// TODO: Use std::vector and std::string for LoadFileLinesToStrings. +inline bool LoadFileLinesToStrings(const char* filename, + GenericVector* lines) { + GenericVector data; + if (!LoadDataFromFile(filename, &data)) { + return false; + } + STRING lines_str(&data[0], data.size()); + lines_str.split('\n', lines); + return true; +} + // A class to manipulate FILE*s. class File { public: diff --git a/src/training/lstmtester.cpp b/src/training/lstmtester.cpp index 826039df..d04f799f 100644 --- a/src/training/lstmtester.cpp +++ b/src/training/lstmtester.cpp @@ -16,6 +16,7 @@ /////////////////////////////////////////////////////////////////////// #include // for std::thread +#include "fileio.h" // for LoadFileLinesToStrings #include "lstmtester.h" #include "genericvector.h" diff --git a/src/training/lstmtraining.cpp b/src/training/lstmtraining.cpp index 1111183c..b2cf052e 100644 --- a/src/training/lstmtraining.cpp +++ b/src/training/lstmtraining.cpp @@ -20,6 +20,7 @@ #endif #include #include "commontraining.h" +#include "fileio.h" // for LoadFileLinesToStrings #include "lstmtester.h" #include "lstmtrainer.h" #include "params.h"