mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-22 09:53:03 +08:00
226 lines
6.5 KiB
C++
226 lines
6.5 KiB
C++
// Copyright 2008 Google Inc. All Rights Reserved.
// Author: scharron@google.com (Samuel Charron)
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
|
|
|
|
#ifndef TESSERACT_TRAINING_COMMONTRAINING_H_
|
|
#define TESSERACT_TRAINING_COMMONTRAINING_H_
|
|
|
|
#ifdef HAVE_CONFIG_H
|
|
#include "config_auto.h"
|
|
#endif
|
|
|
|
#include <tesseract/baseapi.h>
|
|
|
|
#ifdef DISABLED_LEGACY_ENGINE
|
|
|
|
#include "tprintf.h"
|
|
#include "commandlineflags.h"
|
|
|
|
|
|
// Processes the program's command line. Both argc and argv are passed by
// pointer, presumably so the implementation can strip the flags it consumes
// (see commandlineflags.h) -- TODO confirm against the definition.
void ParseArguments(int* argc, char*** argv);
|
|
|
|
|
|
namespace tesseract {

// Checks that the tesseract shared library in use at run time has the same
// version as the one this program was built against.
//
// This function must stay inline in the header: if it were compiled into the
// shared library, both version strings would come from the library itself
// and a mismatch could never be detected.
//
// With HAVE_CONFIG_H defined, TESSERACT_VERSION_STR is compared with
// TessBaseAPI::Version(); on a mismatch a diagnostic is printed with tprintf
// and the process terminates via exit(1). Without HAVE_CONFIG_H the function
// body is empty and the call does nothing.
static inline void CheckSharedLibraryVersion() {
#ifdef HAVE_CONFIG_H
  if (strcmp(TESSERACT_VERSION_STR, TessBaseAPI::Version()) != 0) {
    // "was" is the version reported by the loaded library, "expected" is the
    // version string this translation unit was compiled with.
    tprintf("ERROR: shared library version mismatch (was %s, expected %s)\n"
            "Did you use a wrong shared tesseract library?\n",
            TessBaseAPI::Version(), TESSERACT_VERSION_STR);
    exit(1);
  }
#endif
}

}  // namespace tesseract
|
|
|
|
|
|
#else
|
|
|
|
#include "cluster.h"
|
|
#include "commandlineflags.h"
|
|
#include "featdefs.h"
|
|
#include "intproto.h"
|
|
#include "oldlist.h"
|
|
|
|
// Forward declarations only: the declarations in this header refer to these
// tesseract classes through pointers and references, so their full
// definitions are not required here.
namespace tesseract {
class Classify;
class MasterTrainer;
class ShapeTable;
}  // namespace tesseract
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// Globals ///////////////////////////////////////////////////////////////////
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
|
|
extern FEATURE_DEFS_STRUCT feature_defs;
|
|
|
|
// Must be defined in the file that "implements" commonTraining facilities.
|
|
extern CLUSTERCONFIG Config;
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// Structs ///////////////////////////////////////////////////////////////////
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
typedef struct
|
|
{
|
|
char *Label;
|
|
int SampleCount;
|
|
int font_sample_count;
|
|
LIST List;
|
|
}
|
|
LABELEDLISTNODE, *LABELEDLIST;
|
|
|
|
typedef struct
|
|
{
|
|
char* Label;
|
|
int NumMerged[MAX_NUM_PROTOS];
|
|
CLASS_TYPE Class;
|
|
}MERGE_CLASS_NODE;
|
|
using MERGE_CLASS = MERGE_CLASS_NODE*;
|
|
|
|
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// Functions /////////////////////////////////////////////////////////////////
|
|
//////////////////////////////////////////////////////////////////////////////
|
|
// Processes the program's command line. Both argc and argv are passed by
// pointer, presumably so the implementation can strip the flags it consumes
// (see commandlineflags.h) -- TODO confirm against the definition.
void ParseArguments(int* argc, char*** argv);
|
|
|
|
namespace tesseract {
|
|
|
|
// Check whether the shared tesseract library is the right one.
|
|
// This function must be inline because otherwise it would be part of
|
|
// the shared library, so it could not compare the versions.
|
|
static inline void CheckSharedLibraryVersion()
|
|
{
|
|
#ifdef HAVE_CONFIG_H
|
|
if (!!strcmp(TESSERACT_VERSION_STR, TessBaseAPI::Version())) {
|
|
tprintf("ERROR: shared library version mismatch (was %s, expected %s\n"
|
|
"Did you use a wrong shared tesseract library?\n",
|
|
TessBaseAPI::Version(), TESSERACT_VERSION_STR);
|
|
exit(1);
|
|
}
|
|
#endif
|
|
}
|
|
|
|
// Helper loads shape table from the given file.
|
|
ShapeTable* LoadShapeTable(const STRING& file_prefix);
|
|
// Helper to write the shape_table.
|
|
void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table);
|
|
|
|
// Creates a MasterTraininer and loads the training data into it:
|
|
// Initializes feature_defs and IntegerFX.
|
|
// Loads the shape_table if shape_table != nullptr.
|
|
// Loads initial unicharset from -U command-line option.
|
|
// If FLAGS_input_trainer is set, loads the majority of data from there, else:
|
|
// Loads font info from -F option.
|
|
// Loads xheights from -X option.
|
|
// Loads samples from .tr files in remaining command-line args.
|
|
// Deletes outliers and computes canonical samples.
|
|
// If FLAGS_output_trainer is set, saves the trainer for future use.
|
|
// Computes canonical and cloud features.
|
|
// If shape_table is not nullptr, but failed to load, make a fake flat one,
|
|
// as shape clustering was not run.
|
|
MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
|
|
bool replication,
|
|
ShapeTable** shape_table,
|
|
STRING* file_prefix);
|
|
} // namespace tesseract.
|
|
|
|
// Returns a filename taken from the argc/argv command line.
// NOTE(review): presumably each call yields the next unprocessed argument
// and signals exhaustion with a null pointer -- confirm against the
// definition.
const char *GetNextFilename(int argc, const char* const * argv);
|
|
|
|
// Judging by name and signature, searches List for the labeled list whose
// label matches Label. The not-found result is not visible in this header --
// confirm against the definition.
LABELEDLIST FindList(LIST List, char *Label);
|
|
|
|
// Returns a LABELEDLIST for the given Label; by its name this creates a new
// node. NOTE(review): whether Label is copied and how the node must be
// released are not visible here -- confirm against the definition.
LABELEDLIST NewLabeledList(const char *Label);
|
|
|
|
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_defs,
|
|
const char *feature_name, int max_samples,
|
|
UNICHARSET* unicharset,
|
|
FILE* file, LIST* training_samples);
|
|
|
|
void WriteTrainingSamples(
|
|
const FEATURE_DEFS_STRUCT &FeatureDefs,
|
|
char *Directory,
|
|
LIST CharList,
|
|
const char *program_feature_type);
|
|
|
|
// Releases the training samples held in CharList (per the function name;
// exactly what is freed is determined by the definition).
void FreeTrainingSamples(LIST CharList);
|
|
|
|
// Releases a single labeled list node (per the function name; exactly what
// is freed is determined by the definition).
void FreeLabeledList(LABELEDLIST LabeledList);
|
|
|
|
// Releases a list of labeled classes (per the function name; exactly what is
// freed is determined by the definition).
void FreeLabeledClassList(LIST ClassListList);
|
|
|
|
// Returns a CLUSTERER prepared from the samples in CharSample.
// NOTE(review): presumably program_feature_type selects which feature type
// is clustered; ownership of the returned pointer is not visible here --
// confirm against the definition.
CLUSTERER *SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs,
                              LABELEDLIST CharSample,
                              const char *program_feature_type);
|
|
|
|
// Returns a proto list derived from ProtoList.
// NOTE(review): presumably KeepSigProtos / KeepInsigProtos choose which
// kinds of protos are retained; the meaning of N is not visible in this
// header -- confirm against the definition.
LIST RemoveInsignificantProtos(LIST ProtoList, bool KeepSigProtos,
                               bool KeepInsigProtos, int N);
|
|
|
|
// Cleans up data attached to the protos in ProtoList that is no longer used
// (per the function name; what counts as unused is decided by the
// definition).
void CleanUpUnusedData(LIST ProtoList);
|
|
|
|
// Merges insignificant protos within ProtoList (per the function name).
// NOTE(review): presumably label is used for diagnostics and Clusterer /
// Config supply the clustering state and thresholds -- confirm against the
// definition.
void MergeInsignificantProtos(LIST ProtoList, const char *label,
                              CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
|
|
|
|
// Judging by name and signature, searches List for the merge class whose
// label matches Label. The not-found result is not visible in this header --
// confirm against the definition.
MERGE_CLASS FindClass(LIST List, const char *Label);
|
|
|
|
// Returns a MERGE_CLASS for the given Label; by its name this creates a new
// node. NOTE(review): whether Label is copied and how the node must be
// released are not visible here -- confirm against the definition.
MERGE_CLASS NewLabeledClass(const char *Label);
|
|
|
|
// Repeats the FreeTrainingSamples(LIST CharList) declaration that already
// appears earlier in this header; the redeclaration is harmless but
// redundant.
void FreeTrainingSamples(LIST CharList);
|
|
|
|
// Returns CLASS_STRUCT data built from LabeledClassList and unicharset.
// NOTE(review): presumably an array of classes prepared for float-to-integer
// proto conversion; its length and ownership are not visible here -- confirm
// against the definition.
CLASS_STRUCT* SetUpForFloat2Int(const UNICHARSET& unicharset,
                                LIST LabeledClassList);
|
|
|
|
// Normalizes the floats pointed to by Values; the non-const pointer suggests
// they are modified in place.
// NOTE(review): the number of elements and the normalization used are not
// visible in this header -- confirm against the definition.
void Normalize(float *Values);
|
|
|
|
// Releases a norm-proto list (per the function name; exactly what is freed
// is determined by the definition).
void FreeNormProtoList(LIST CharList);
|
|
|
|
// Adds an entry to *NormProtoList; the list is passed by pointer so the
// caller's list can be updated.
// NOTE(review): presumably the new entry associates ProtoList with CharName
// -- confirm against the definition.
void AddToNormProtosList(LIST* NormProtoList, LIST ProtoList, char *CharName);
|
|
|
|
// Returns a count of the protos in ProtoList.
// NOTE(review): presumably CountSigProtos / CountInsigProtos select whether
// significant and/or insignificant protos are included in the count --
// confirm against the definition.
int NumberOfProtos(LIST ProtoList, bool CountSigProtos, bool CountInsigProtos);
|
|
|
|
|
|
// Takes no arguments and returns nothing.
// NOTE(review): presumably allocates the storage used for norm protos --
// confirm against the definition.
void allocNormProtos();
|
|
|
|
#endif // def DISABLED_LEGACY_ENGINE
|
|
|
|
#endif // TESSERACT_TRAINING_COMMONTRAINING_H_
|