Merge pull request #4314 from stweil/optimize

Add C++ stream for log messages and use it in two debug messages
This commit is contained in:
Stefan Weil 2024-09-04 05:22:03 +02:00 committed by GitHub
commit 4f43536335
No known key found for this signature in database
GPG Key ID: B5690EEEBB952194
5 changed files with 96 additions and 15 deletions

View File

@ -41,6 +41,7 @@
#endif
#include "sorthelper.h"
#include "tesseractclass.h"
#include "tesserrstream.h" // for tesserr
#include "tessvars.h"
#include "werdit.h"
@ -1313,9 +1314,10 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordD
PointerVector<WERD_RES> best_words;
// Points to the best result. May be word or in lang_words.
const WERD_RES *word = word_data->word;
clock_t start_t = 0;
if (tessedit_timing_debug) {
start_t = clock();
clock_t total_time = 0;
const bool timing_debug = tessedit_timing_debug;
if (timing_debug) {
total_time = clock();
}
const bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
if (debug) {
@ -1368,10 +1370,10 @@ void Tesseract::classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordD
} else {
tprintf("no best words!!\n");
}
if (tessedit_timing_debug) {
clock_t ocr_t = clock();
tprintf("%s (ocr took %.2f sec)\n", word_data->word->best_choice->unichar_string().c_str(),
static_cast<double>(ocr_t - start_t) / CLOCKS_PER_SEC);
if (timing_debug) {
total_time = clock() - total_time;
tesserr << word_data->word->best_choice->unichar_string()
<< " (ocr took " << 1000 * total_time / CLOCKS_PER_SEC << " ms)\n";
}
}

View File

@ -0,0 +1,68 @@
// File: tesserrstream.h
// Description: C++ stream which enhances tprintf
// Author: Stefan Weil
//
// (C) Copyright 2024
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef TESSERACT_CCUTIL_TESSERRSTREAM_H
#define TESSERACT_CCUTIL_TESSERRSTREAM_H
#include "tprintf.h"
#include <tesseract/export.h> // for TESS_API
#include <ostream> // for std::ostream
namespace tesseract {
class TessStreamBuf : public std::streambuf {
public:
TessStreamBuf() = default;
protected:
virtual int_type overflow(int_type c) override {
if (c != EOF) {
if (debugfp == nullptr) {
debugfp = get_debugfp();
}
if (fputc(c, debugfp) == EOF) {
return EOF;
}
}
return c;
}
virtual std::streamsize xsputn(const char* s, std::streamsize n) override {
if (debugfp == nullptr) {
debugfp = get_debugfp();
}
return fwrite(s, 1, n, debugfp);
}
private:
FILE *debugfp = nullptr;
};
// Output stream for log messages. Everything inserted into it is passed to
// the TessStreamBuf which it owns.
class TessErrStream : public std::ostream {
public:
  // The base class is constructed before the member buffer exists, so it
  // starts without a stream buffer and gets the buffer attached afterwards.
  TessErrStream() : std::ostream(nullptr) {
    rdbuf(&buf);
  }

private:
  // Stream buffer used by this stream.
  TessStreamBuf buf{};
};
extern TESS_API TessErrStream tesserr;
} // namespace tesseract
#endif // TESSERACT_CCUTIL_TESSERRSTREAM_H

View File

@ -21,6 +21,7 @@
# include "config_auto.h"
#endif
#include "tesserrstream.h"
#include "tprintf.h"
#include "params.h"
@ -36,7 +37,7 @@ INT_VAR(log_level, INT_MAX, "Logging level");
static STRING_VAR(debug_file, "", "File to send tprintf output to");
// File for debug output.
static FILE *debugfp;
FILE *debugfp;
// Set output for log messages.
// The output is written to stderr if debug_file is empty.
@ -49,7 +50,7 @@ static FILE *debugfp;
// tprintf("write to /tmp/log\n");
// debug_file = "";
// tprintf("write to stderr\n");
static void set_debugfp() {
FILE *get_debugfp() {
if (debug_file.empty()) {
// Write to stderr.
if (debugfp != stderr && debugfp != nullptr) {
@ -66,15 +67,18 @@ static void set_debugfp() {
#endif
debugfp = fopen(debug_file.c_str(), "wb");
}
return debugfp;
}
// Trace printf.
void tprintf(const char *format, ...) {
set_debugfp();
FILE *f = get_debugfp();
va_list args; // variable args
va_start(args, format); // variable list
vfprintf(debugfp, format, args);
vfprintf(f, format, args);
va_end(args);
}
TessErrStream tesserr;
} // namespace tesseract

View File

@ -36,6 +36,9 @@ extern TESS_API void tprintf( // Trace printf
const char *format, ...) // Message
__attribute__((format(printf, 1, 2)));
// Get file for debug output.
// This is the file which tprintf writes to: stderr if the debug_file
// parameter is empty, otherwise the file opened for debug_file.
// NOTE(review): the result presumably is a null pointer if that file cannot
// be opened - confirm before using it without a check.
// NOTE(review): unlike tprintf above, this declaration has no TESS_API
// although inline code in tesserrstream.h calls it - confirm whether it
// must be exported for code which is built outside of the library.
FILE *get_debugfp();
} // namespace tesseract
#undef __attribute__

View File

@ -23,6 +23,7 @@
#include "sampleiterator.h"
#include "shapeclassifier.h"
#include "shapetable.h"
#include "tesserrstream.h"
#include "trainingsample.h"
#include "trainingsampleset.h"
#include "unicity_table.h"
@ -50,7 +51,10 @@ double ErrorCounter::ComputeErrorRate(ShapeClassifier *classifier, int report_le
ErrorCounter counter(classifier->GetUnicharset(), fontsize);
std::vector<UnicharRating> results;
clock_t start = clock();
clock_t total_time = 0;
if (report_level > 1) {
total_time = clock();
}
unsigned total_samples = 0;
double unscaled_error = 0.0;
// Set a number of samples on which to run the classify debug mode.
@ -85,7 +89,6 @@ double ErrorCounter::ComputeErrorRate(ShapeClassifier *classifier, int report_le
}
++total_samples;
}
const double total_time = 1.0 * (clock() - start) / CLOCKS_PER_SEC;
// Create the appropriate error report.
unscaled_error = counter.ReportErrors(report_level, boosting_mode, fontinfo_table, *it,
unichar_error, fonts_report);
@ -94,8 +97,9 @@ double ErrorCounter::ComputeErrorRate(ShapeClassifier *classifier, int report_le
}
if (report_level > 1 && total_samples > 0) {
// It is useful to know the time in microseconds/char.
tprintf("Errors computed in %.2fs at %.1f μs/char\n", total_time,
1000000.0 * total_time / total_samples);
total_time = 1000 * (clock() - total_time) / CLOCKS_PER_SEC;
tesserr << "Errors computed in " << total_time << " ms at "
<< 1000 * total_time / total_samples << " μs/char\n";
}
return unscaled_error;
}