// Copyright 2008 Google Inc. All Rights Reserved. // Author: scharron@google.com (Samuel Charron) // // Licensed under the Apache License, Version 2.0 (the "License"); // you may not use this file except in compliance with the License. // You may obtain a copy of the License at // http://www.apache.org/licenses/LICENSE-2.0 // Unless required by applicable law or agreed to in writing, software // distributed under the License is distributed on an "AS IS" BASIS, // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. // See the License for the specific language governing permissions and // limitations under the License. #ifndef TESSERACT_TRAINING_COMMONTRAINING_H_ #define TESSERACT_TRAINING_COMMONTRAINING_H_ #ifdef HAVE_CONFIG_H #include "config_auto.h" #include "baseapi.h" #endif #include "cluster.h" #include "commandlineflags.h" #include "featdefs.h" #include "intproto.h" #include "oldlist.h" namespace tesseract { class Classify; class MasterTrainer; class ShapeTable; } ////////////////////////////////////////////////////////////////////////////// // Globals /////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// extern FEATURE_DEFS_STRUCT feature_defs; // Must be defined in the file that "implements" commonTraining facilities. extern CLUSTERCONFIG Config; ////////////////////////////////////////////////////////////////////////////// // Structs /////////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// typedef struct { char *Label; int SampleCount; int font_sample_count; LIST List; } LABELEDLISTNODE, *LABELEDLIST; typedef struct { char* Label; int NumMerged[MAX_NUM_PROTOS]; CLASS_TYPE Class; }MERGE_CLASS_NODE; using MERGE_CLASS = MERGE_CLASS_NODE*; ////////////////////////////////////////////////////////////////////////////// // Functions ///////////////////////////////////////////////////////////////// ////////////////////////////////////////////////////////////////////////////// void ParseArguments(int* argc, char*** argv); namespace tesseract { // Check whether the shared tesseract library is the right one. // This function must be inline because otherwise it would be part of // the shared library, so it could not compare the versions. static inline void CheckSharedLibraryVersion() { #ifdef HAVE_CONFIG_H if (!!strcmp(TESSERACT_VERSION_STR, TessBaseAPI::Version())) { tprintf("ERROR: shared library version mismatch (was %s, expected %s\n" "Did you use a wrong shared tesseract library?\n", TessBaseAPI::Version(), TESSERACT_VERSION_STR); exit(1); } #endif } // Helper loads shape table from the given file. ShapeTable* LoadShapeTable(const STRING& file_prefix); // Helper to write the shape_table. void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table); // Creates a MasterTraininer and loads the training data into it: // Initializes feature_defs and IntegerFX. // Loads the shape_table if shape_table != nullptr. // Loads initial unicharset from -U command-line option. // If FLAGS_input_trainer is set, loads the majority of data from there, else: // Loads font info from -F option. // Loads xheights from -X option. // Loads samples from .tr files in remaining command-line args. // Deletes outliers and computes canonical samples. // If FLAGS_output_trainer is set, saves the trainer for future use. // Computes canonical and cloud features. // If shape_table is not nullptr, but failed to load, make a fake flat one, // as shape clustering was not run. MasterTrainer* LoadTrainingData(int argc, const char* const * argv, bool replication, ShapeTable** shape_table, STRING* file_prefix); } // namespace tesseract. const char *GetNextFilename(int argc, const char* const * argv); LABELEDLIST FindList( LIST List, char *Label); LABELEDLIST NewLabeledList( const char *Label); void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_defs, const char *feature_name, int max_samples, UNICHARSET* unicharset, FILE* file, LIST* training_samples); void WriteTrainingSamples( const FEATURE_DEFS_STRUCT &FeatureDefs, char *Directory, LIST CharList, const char *program_feature_type); void FreeTrainingSamples( LIST CharList); void FreeLabeledList( LABELEDLIST LabeledList); void FreeLabeledClassList( LIST ClassListList); CLUSTERER *SetUpForClustering( const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST CharSample, const char *program_feature_type); LIST RemoveInsignificantProtos( LIST ProtoList, bool KeepSigProtos, bool KeepInsigProtos, int N); void CleanUpUnusedData( LIST ProtoList); void MergeInsignificantProtos( LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *Config); MERGE_CLASS FindClass( LIST List, const char *Label); MERGE_CLASS NewLabeledClass( const char *Label); void FreeTrainingSamples( LIST CharList); CLASS_STRUCT* SetUpForFloat2Int(const UNICHARSET& unicharset, LIST LabeledClassList); void Normalize( float *Values); void FreeNormProtoList( LIST CharList); void AddToNormProtosList( LIST* NormProtoList, LIST ProtoList, char *CharName); int NumberOfProtos( LIST ProtoList, bool CountSigProtos, bool CountInsigProtos); void allocNormProtos(); #endif // TESSERACT_TRAINING_COMMONTRAINING_H_