mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 06:30:14 +08:00
Updated tessdatamanager/combine_tessdata to give more functionality
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@353 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
a5b4570180
commit
45aacc077e
@ -65,22 +65,14 @@ void TessdataManager::Init(const char *data_file_name) {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
FILE *TessdataManager::GetFilePtr(const char *language_data_path_prefix,
|
|
||||||
const char *file_suffix, bool required_file,
|
|
||||||
bool text_file) {
|
|
||||||
STRING file_name = language_data_path_prefix;
|
|
||||||
file_name += file_suffix;
|
|
||||||
FILE *file_ptr = fopen(file_name.string(), text_file ? "r" : "rb");
|
|
||||||
if (required_file && (file_ptr == NULL)) {
|
|
||||||
tprintf("Error openning required file %s\n", file_name.string());
|
|
||||||
exit(1);
|
|
||||||
}
|
|
||||||
return file_ptr;
|
|
||||||
}
|
|
||||||
|
|
||||||
void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
|
void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
|
||||||
bool newline_end) {
|
bool newline_end, inT64 num_bytes_to_copy) {
|
||||||
|
if (num_bytes_to_copy == 0) return;
|
||||||
int buffer_size = 1024;
|
int buffer_size = 1024;
|
||||||
|
if (num_bytes_to_copy > 0 && buffer_size > num_bytes_to_copy) {
|
||||||
|
buffer_size = num_bytes_to_copy;
|
||||||
|
}
|
||||||
|
inT64 num_bytes_copied = 0;
|
||||||
char *chunk = new char[buffer_size];
|
char *chunk = new char[buffer_size];
|
||||||
int bytes_read;
|
int bytes_read;
|
||||||
char last_char = 0x0;
|
char last_char = 0x0;
|
||||||
@ -88,106 +80,19 @@ void TessdataManager::CopyFile(FILE *input_file, FILE *output_file,
|
|||||||
buffer_size, input_file))) {
|
buffer_size, input_file))) {
|
||||||
fwrite(chunk, sizeof(char), bytes_read, output_file);
|
fwrite(chunk, sizeof(char), bytes_read, output_file);
|
||||||
last_char = chunk[bytes_read-1];
|
last_char = chunk[bytes_read-1];
|
||||||
|
if (num_bytes_to_copy > 0) {
|
||||||
|
num_bytes_copied += bytes_read;
|
||||||
|
if (num_bytes_copied == num_bytes_to_copy) break;
|
||||||
|
if (num_bytes_copied + buffer_size > num_bytes_to_copy) {
|
||||||
|
buffer_size = num_bytes_to_copy - num_bytes_copied;
|
||||||
|
}
|
||||||
|
}
|
||||||
}
|
}
|
||||||
if (newline_end) ASSERT_HOST(last_char == '\n');
|
if (newline_end) ASSERT_HOST(last_char == '\n');
|
||||||
delete[] chunk;
|
delete[] chunk;
|
||||||
}
|
}
|
||||||
|
|
||||||
void TessdataManager::CombineDataFiles(
|
void TessdataManager::WriteMetadata(inT64 *offset_table, FILE *output_file) {
|
||||||
const char *language_data_path_prefix,
|
|
||||||
const char *output_filename) {
|
|
||||||
FILE *file_ptr;
|
|
||||||
STRING file_name;
|
|
||||||
int i;
|
|
||||||
inT64 offset_table[TESSDATA_NUM_ENTRIES];
|
|
||||||
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
|
|
||||||
FILE *output_file = fopen(output_filename, "wb");
|
|
||||||
// Leave some space for recording the offset_table.
|
|
||||||
fseek(output_file,
|
|
||||||
sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
|
|
||||||
|
|
||||||
// Record language-specific tesseract config file.
|
|
||||||
file_ptr = GetFilePtr(language_data_path_prefix,
|
|
||||||
kLangConfigFileSuffix, false, true);
|
|
||||||
if (file_ptr != NULL) {
|
|
||||||
offset_table[TESSDATA_LANG_CONFIG] = ftell(output_file);
|
|
||||||
CopyFile(file_ptr, output_file, true);
|
|
||||||
fclose(file_ptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Record unicharset.
|
|
||||||
file_ptr = GetFilePtr(language_data_path_prefix,
|
|
||||||
kUnicharsetFileSuffix, true, true);
|
|
||||||
offset_table[TESSDATA_UNICHARSET] = ftell(output_file);
|
|
||||||
CopyFile(file_ptr, output_file, true);
|
|
||||||
fclose(file_ptr);
|
|
||||||
|
|
||||||
// Record ambiguities.
|
|
||||||
file_ptr = GetFilePtr(language_data_path_prefix,
|
|
||||||
kAmbigsFileSuffix, false, true);
|
|
||||||
if (file_ptr != NULL) {
|
|
||||||
offset_table[TESSDATA_AMBIGS] = ftell(output_file);
|
|
||||||
CopyFile(file_ptr, output_file, true);
|
|
||||||
fclose(file_ptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Record inttemp.
|
|
||||||
file_ptr =
|
|
||||||
GetFilePtr(language_data_path_prefix,
|
|
||||||
kBuiltInTemplatesFileSuffix, false, false);
|
|
||||||
if (file_ptr != NULL) {
|
|
||||||
offset_table[TESSDATA_INTTEMP] = ftell(output_file);
|
|
||||||
CopyFile(file_ptr, output_file, false);
|
|
||||||
fclose(file_ptr);
|
|
||||||
|
|
||||||
// Record pffmtable.
|
|
||||||
file_ptr = GetFilePtr(language_data_path_prefix,
|
|
||||||
kBuiltInCutoffsFileSuffix, true, true);
|
|
||||||
offset_table[TESSDATA_PFFMTABLE] = ftell(output_file);
|
|
||||||
CopyFile(file_ptr, output_file, true);
|
|
||||||
fclose(file_ptr);
|
|
||||||
|
|
||||||
// Record normproto.
|
|
||||||
file_ptr = GetFilePtr(language_data_path_prefix,
|
|
||||||
kNormProtoFileSuffix, true, true);
|
|
||||||
offset_table[TESSDATA_NORMPROTO] = ftell(output_file);
|
|
||||||
CopyFile(file_ptr, output_file, true);
|
|
||||||
fclose(file_ptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
// Record dawgs.
|
|
||||||
file_ptr = GetFilePtr(language_data_path_prefix,
|
|
||||||
kPuncDawgFileSuffix, false, false);
|
|
||||||
if (file_ptr != NULL) {
|
|
||||||
offset_table[TESSDATA_PUNC_DAWG] = ftell(output_file);
|
|
||||||
CopyFile(file_ptr, output_file, false);
|
|
||||||
fclose(file_ptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
file_ptr = GetFilePtr(language_data_path_prefix,
|
|
||||||
kSystemDawgFileSuffix, false, false);
|
|
||||||
if (file_ptr != NULL) {
|
|
||||||
offset_table[TESSDATA_SYSTEM_DAWG] = ftell(output_file);
|
|
||||||
CopyFile(file_ptr, output_file, false);
|
|
||||||
fclose(file_ptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
file_ptr = GetFilePtr(language_data_path_prefix,
|
|
||||||
kNumberDawgFileSuffix, false, false);
|
|
||||||
if (file_ptr != NULL) {
|
|
||||||
offset_table[TESSDATA_NUMBER_DAWG] = ftell(output_file);
|
|
||||||
CopyFile(file_ptr, output_file, false);
|
|
||||||
fclose(file_ptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
file_ptr = GetFilePtr(language_data_path_prefix,
|
|
||||||
kFreqDawgFileSuffix, false, false);
|
|
||||||
if (file_ptr != NULL) {
|
|
||||||
offset_table[TESSDATA_FREQ_DAWG] = ftell(output_file);
|
|
||||||
CopyFile(file_ptr, output_file, false);
|
|
||||||
fclose(file_ptr);
|
|
||||||
}
|
|
||||||
|
|
||||||
fseek(output_file, 0, SEEK_SET);
|
fseek(output_file, 0, SEEK_SET);
|
||||||
inT32 num_entries = TESSDATA_NUM_ENTRIES;
|
inT32 num_entries = TESSDATA_NUM_ENTRIES;
|
||||||
fwrite(&num_entries, sizeof(inT32), 1, output_file);
|
fwrite(&num_entries, sizeof(inT32), 1, output_file);
|
||||||
@ -195,9 +100,155 @@ void TessdataManager::CombineDataFiles(
|
|||||||
fclose(output_file);
|
fclose(output_file);
|
||||||
|
|
||||||
tprintf("TessdataManager combined tesseract data files.\n");
|
tprintf("TessdataManager combined tesseract data files.\n");
|
||||||
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||||
tprintf("Offset for type %d is %lld\n", i, offset_table[i]);
|
tprintf("Offset for type %d is %lld\n", i, offset_table[i]);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool TessdataManager::CombineDataFiles(
|
||||||
|
const char *language_data_path_prefix,
|
||||||
|
const char *output_filename) {
|
||||||
|
int i;
|
||||||
|
inT64 offset_table[TESSDATA_NUM_ENTRIES];
|
||||||
|
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) offset_table[i] = -1;
|
||||||
|
FILE *output_file = fopen(output_filename, "wb");
|
||||||
|
if (output_file == NULL) {
|
||||||
|
tprintf("Error opening %s for writing\n", output_filename);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// Leave some space for recording the offset_table.
|
||||||
|
fseek(output_file,
|
||||||
|
sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
|
||||||
|
|
||||||
|
TessdataType type;
|
||||||
|
bool text_file;
|
||||||
|
FILE *file_ptr[TESSDATA_NUM_ENTRIES];
|
||||||
|
|
||||||
|
// Load individual tessdata components from files.
|
||||||
|
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||||
|
ASSERT_HOST(TessdataTypeFromFileSuffix(
|
||||||
|
kTessdataFileSuffixes[i], &type, &text_file));
|
||||||
|
STRING filename = language_data_path_prefix;
|
||||||
|
filename += kTessdataFileSuffixes[i];
|
||||||
|
file_ptr[i] = fopen(filename.string(), text_file ? "r" : "rb");
|
||||||
|
if (file_ptr[i] != NULL) {
|
||||||
|
offset_table[type] = ftell(output_file);
|
||||||
|
CopyFile(file_ptr[i], output_file, text_file, -1);
|
||||||
|
fclose(file_ptr[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// Make sure that the required components are present.
|
||||||
|
if (file_ptr[TESSDATA_UNICHARSET] == NULL) {
|
||||||
|
tprintf("Error opening unicharset file\n");
|
||||||
|
fclose(output_file);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
if (file_ptr[TESSDATA_INTTEMP] != NULL &&
|
||||||
|
(file_ptr[TESSDATA_PFFMTABLE] == NULL ||
|
||||||
|
file_ptr[TESSDATA_NORMPROTO] == NULL)) {
|
||||||
|
tprintf("Error opening pffmtable and/or normproto files"
|
||||||
|
" while inttemp file was present\n");
|
||||||
|
fclose(output_file);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
WriteMetadata(offset_table, output_file);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool TessdataManager::OverwriteComponents(
|
||||||
|
const char *new_traineddata_filename,
|
||||||
|
char **component_filenames,
|
||||||
|
int num_new_components) {
|
||||||
|
int i;
|
||||||
|
inT64 offset_table[TESSDATA_NUM_ENTRIES];
|
||||||
|
TessdataType type;
|
||||||
|
bool text_file;
|
||||||
|
FILE *file_ptr[TESSDATA_NUM_ENTRIES];
|
||||||
|
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||||
|
offset_table[i] = -1;
|
||||||
|
file_ptr[i] = NULL;
|
||||||
|
}
|
||||||
|
FILE *output_file = fopen(new_traineddata_filename, "wb");
|
||||||
|
if (output_file == NULL) {
|
||||||
|
tprintf("Error opening %s for writing\n", new_traineddata_filename);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
// Leave some space for recording the offset_table.
|
||||||
|
fseek(output_file,
|
||||||
|
sizeof(inT32) + sizeof(inT64) * TESSDATA_NUM_ENTRIES, SEEK_SET);
|
||||||
|
|
||||||
|
// Open the files with the new components.
|
||||||
|
for (i = 0; i < num_new_components; ++i) {
|
||||||
|
TessdataTypeFromFileName(component_filenames[i], &type, &text_file);
|
||||||
|
file_ptr[type] = fopen(component_filenames[i], text_file ? "r" : "rb");
|
||||||
|
}
|
||||||
|
|
||||||
|
// Write updated data to the output traineddata file.
|
||||||
|
for (i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||||
|
if (file_ptr[i] != NULL) {
|
||||||
|
// Get the data from the opened component file.
|
||||||
|
offset_table[i] = ftell(output_file);
|
||||||
|
CopyFile(file_ptr[i], output_file, kTessdataFileIsText[i], -1);
|
||||||
|
fclose(file_ptr[i]);
|
||||||
|
} else {
|
||||||
|
// Get this data component from the loaded data file.
|
||||||
|
if (SeekToStart(static_cast<TessdataType>(i))) {
|
||||||
|
offset_table[i] = ftell(output_file);
|
||||||
|
CopyFile(data_file_, output_file, kTessdataFileIsText[i],
|
||||||
|
GetEndOffset(static_cast<TessdataType>(i)) -
|
||||||
|
ftell(data_file_) + 1);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
WriteMetadata(offset_table, output_file);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool TessdataManager::TessdataTypeFromFileSuffix(
|
||||||
|
const char *suffix, TessdataType *type, bool *text_file) {
|
||||||
|
for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
|
||||||
|
if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
|
||||||
|
*type = static_cast<TessdataType>(i);
|
||||||
|
*text_file = kTessdataFileIsText[i];
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
printf("TessdataManager can't determine which tessdata"
|
||||||
|
" component is represented by %s\n", suffix);
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool TessdataManager::TessdataTypeFromFileName(
|
||||||
|
const char *filename, TessdataType *type, bool *text_file) {
|
||||||
|
// Get the file suffix (extension)
|
||||||
|
const char *suffix = strrchr(filename, '.');
|
||||||
|
if (suffix == NULL || *(++suffix) == '\0') return false;
|
||||||
|
return TessdataTypeFromFileSuffix(suffix, type, text_file);
|
||||||
|
}
|
||||||
|
|
||||||
|
bool TessdataManager::ExtractToFile(const char *filename) {
|
||||||
|
TessdataType type;
|
||||||
|
bool text_file;
|
||||||
|
ASSERT_HOST(tesseract::TessdataManager::TessdataTypeFromFileName(
|
||||||
|
filename, &type, &text_file));
|
||||||
|
if (!SeekToStart(type)) return false;
|
||||||
|
|
||||||
|
FILE *output_file = fopen(filename, "wb");
|
||||||
|
if (output_file == NULL) {
|
||||||
|
printf("Error openning %s\n", filename);
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
inT64 begin_offset = ftell(GetDataFilePtr());
|
||||||
|
inT64 end_offset = GetEndOffset(type);
|
||||||
|
tesseract::TessdataManager::CopyFile(
|
||||||
|
GetDataFilePtr(), output_file, text_file,
|
||||||
|
end_offset - begin_offset + 1);
|
||||||
|
fclose(output_file);
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
} // namespace tesseract
|
} // namespace tesseract
|
||||||
|
@ -37,6 +37,8 @@ extern INT_VAR_H(global_tessdata_manager_debug_level, 0,
|
|||||||
|
|
||||||
static const char kTrainedDataSuffix[] = "traineddata";
|
static const char kTrainedDataSuffix[] = "traineddata";
|
||||||
|
|
||||||
|
// When adding new tessdata types and file suffixes, please make sure to
|
||||||
|
// update TessdataType enum, kTessdataFileSuffixes and kTessdataFileIsText.
|
||||||
static const char kLangConfigFileSuffix[] = "config";
|
static const char kLangConfigFileSuffix[] = "config";
|
||||||
static const char kUnicharsetFileSuffix[] = "unicharset";
|
static const char kUnicharsetFileSuffix[] = "unicharset";
|
||||||
static const char kAmbigsFileSuffix[] = "unicharambigs";
|
static const char kAmbigsFileSuffix[] = "unicharambigs";
|
||||||
@ -65,6 +67,36 @@ enum TessdataType {
|
|||||||
TESSDATA_NUM_ENTRIES
|
TESSDATA_NUM_ENTRIES
|
||||||
};
|
};
|
||||||
|
|
||||||
|
// kTessdataFileSuffixes[i] indicates the file suffix for
|
||||||
|
// tessdata of type i (from TessdataType enum).
|
||||||
|
static const char * const kTessdataFileSuffixes[] = {
|
||||||
|
kLangConfigFileSuffix, // 0
|
||||||
|
kUnicharsetFileSuffix, // 1
|
||||||
|
kAmbigsFileSuffix, // 2
|
||||||
|
kBuiltInTemplatesFileSuffix, // 3
|
||||||
|
kBuiltInCutoffsFileSuffix, // 4
|
||||||
|
kNormProtoFileSuffix, // 5
|
||||||
|
kPuncDawgFileSuffix, // 6
|
||||||
|
kSystemDawgFileSuffix, // 7
|
||||||
|
kNumberDawgFileSuffix, // 8
|
||||||
|
kFreqDawgFileSuffix, // 9
|
||||||
|
};
|
||||||
|
|
||||||
|
// Text/binary flag of each tessdata component: kTessdataFileIsText[i] is
// true when the component of type i (a value of the TessdataType enum) is
// stored as text and false when it is binary. The entries run parallel to
// kTessdataFileSuffixes; the suffix constant for each index is noted.
static const bool kTessdataFileIsText[] = {
  true,   // 0  kLangConfigFileSuffix
  true,   // 1  kUnicharsetFileSuffix
  true,   // 2  kAmbigsFileSuffix
  false,  // 3  kBuiltInTemplatesFileSuffix
  true,   // 4  kBuiltInCutoffsFileSuffix
  true,   // 5  kNormProtoFileSuffix
  false,  // 6  kPuncDawgFileSuffix
  false,  // 7  kSystemDawgFileSuffix
  false,  // 8  kNumberDawgFileSuffix
  false,  // 9  kFreqDawgFileSuffix
};
|
||||||
|
|
||||||
// TessdataType could be updated to contain more entries, however
|
// TessdataType could be updated to contain more entries, however
|
||||||
// we do not expect that number to be astronomically high.
|
// we do not expect that number to be astronomically high.
|
||||||
// In order to automatically detect endianness TessdataManager will
|
// In order to automatically detect endianness TessdataManager will
|
||||||
@ -102,7 +134,8 @@ class TessdataManager {
|
|||||||
return false;
|
return false;
|
||||||
} else {
|
} else {
|
||||||
ASSERT_HOST(fseek(data_file_,
|
ASSERT_HOST(fseek(data_file_,
|
||||||
offset_table_[tessdata_type], SEEK_SET) == 0);
|
static_cast<size_t>(offset_table_[tessdata_type]),
|
||||||
|
SEEK_SET) == 0);
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -128,24 +161,55 @@ class TessdataManager {
|
|||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Writes the number of entries and the given offset table to output_file.
|
||||||
|
static void WriteMetadata(inT64 *offset_table, FILE *output_file);
|
||||||
|
|
||||||
// Reads all the standard tesseract config and data files for a language
|
// Reads all the standard tesseract config and data files for a language
|
||||||
// at the given path and bundles them up into one binary data file.
|
// at the given path and bundles them up into one binary data file.
|
||||||
static void CombineDataFiles(const char *language_data_path_prefix,
|
// Returns true if the combined traineddata file was successfully written.
|
||||||
|
static bool CombineDataFiles(const char *language_data_path_prefix,
|
||||||
const char *output_filename);
|
const char *output_filename);
|
||||||
|
|
||||||
|
// Gets the individual components from the data_file_ with which the class was
|
||||||
|
// initialized. Overwrites the components specified by component_filenames.
|
||||||
|
// Writes the updated traineddata file to new_traineddata_filename.
|
||||||
|
bool OverwriteComponents(const char *new_traineddata_filename,
|
||||||
|
char **component_filenames,
|
||||||
|
int num_new_components);
|
||||||
|
|
||||||
|
// Extracts tessdata component implied by the name of the input file from
|
||||||
|
// the combined traineddata loaded into TessdataManager.
|
||||||
|
// Writes the extracted component to the file indicated by the file name.
|
||||||
|
// E.g. if the filename given is somepath/somelang.unicharset, unicharset
|
||||||
|
// will be extracted from the data loaded into the TessdataManager and will
|
||||||
|
// be written to somepath/somelang.unicharset.
|
||||||
|
// Returns true if the component was successfully extracted, false if the
|
||||||
|
// component was not present in the traineddata loaded into TessdataManager.
|
||||||
|
bool ExtractToFile(const char *filename);
|
||||||
|
|
||||||
|
// Copies data from the given input file to the output_file provided.
|
||||||
|
// If num_bytes_to_copy is >= 0, only num_bytes_to_copy is copied from
|
||||||
|
// the input file, otherwise all the data in the input file is copied.
|
||||||
|
static void CopyFile(FILE *input_file, FILE *output_file,
|
||||||
|
bool newline_end, inT64 num_bytes_to_copy);
|
||||||
|
|
||||||
|
// Fills type with TessdataType of the tessdata component represented by the
|
||||||
|
// given file suffix. E.g. unicharset -> TESSDATA_UNICHARSET.
|
||||||
|
// Sets *text_file to true if the component is in text format (e.g.
|
||||||
|
// unicharset, unichar ambigs, config, etc).
|
||||||
|
// Returns true if the tessdata component type could be determined
|
||||||
|
// from the given file suffix.
|
||||||
|
static bool TessdataTypeFromFileSuffix(const char *suffix,
|
||||||
|
TessdataType *type,
|
||||||
|
bool *text_file);
|
||||||
|
|
||||||
|
// Tries to determine tessdata component file suffix from filename,
|
||||||
|
// returns true on success.
|
||||||
|
static bool TessdataTypeFromFileName(const char *filename,
|
||||||
|
TessdataType *type,
|
||||||
|
bool *text_file);
|
||||||
|
|
||||||
private:
|
private:
|
||||||
|
|
||||||
// Opens the file whose name is a concatenation of language_data_path_prefix
|
|
||||||
// and file_suffix. Terminates the program if required_file is set to true,
|
|
||||||
// but the file could not be found or opened for reading.
|
|
||||||
// Returns a file pointer to the opened file.
|
|
||||||
static FILE *GetFilePtr(const char *language_data_path_prefix,
|
|
||||||
const char *file_suffix, bool required_file,
|
|
||||||
bool text_file);
|
|
||||||
|
|
||||||
// Copies all the bytes in the given input file to the output_file provided.
|
|
||||||
static void CopyFile(FILE *input_file, FILE *output_file, bool newline_end);
|
|
||||||
|
|
||||||
// Each offset_table_[i] contains a file offset in the combined data file
|
// Each offset_table_[i] contains a file offset in the combined data file
|
||||||
// where the data of TessdataFileType i is stored.
|
// where the data of TessdataFileType i is stored.
|
||||||
inT64 offset_table_[TESSDATA_NUM_ENTRIES];
|
inT64 offset_table_[TESSDATA_NUM_ENTRIES];
|
||||||
|
@ -20,12 +20,113 @@
|
|||||||
|
|
||||||
#include "tessdatamanager.h"
|
#include "tessdatamanager.h"
|
||||||
|
|
||||||
|
// Main program to combine/extract/overwrite tessdata components
|
||||||
|
// in [lang].traineddata files.
|
||||||
|
//
|
||||||
|
// To combine all the individual tessdata components (unicharset, DAWGs,
|
||||||
|
// classifier templates, ambiguities, language configs) located at, say,
|
||||||
|
// /home/$USER/temp/eng.* run:
|
||||||
|
//
|
||||||
|
// combine_tessdata /home/$USER/temp/eng.
|
||||||
|
//
|
||||||
|
// The result will be a combined tessdata file /home/$USER/temp/eng.traineddata
|
||||||
|
//
|
||||||
|
// Specify option -e if you would like to extract individual components
|
||||||
|
// from a combined traineddata file. For example, to extract language config
|
||||||
|
// file and the unicharset from tessdata/eng.traineddata run:
|
||||||
|
//
|
||||||
|
// combine_tessdata -e tessdata/eng.traineddata
|
||||||
|
// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
|
||||||
|
//
|
||||||
|
// The desired config file and unicharset will be written to
|
||||||
|
// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharset
|
||||||
|
//
|
||||||
|
// Specify option -o to overwrite individual components of the given
|
||||||
|
// [lang].traineddata file. For example, to overwrite language config
|
||||||
|
// and unichar ambiguities files in tessdata/eng.traineddata use:
|
||||||
|
//
|
||||||
|
// combine_tessdata -o tessdata/eng.traineddata
|
||||||
|
// /home/$USER/temp/eng.config /home/$USER/temp/eng.unicharambigs
|
||||||
|
//
|
||||||
|
// As a result, tessdata/eng.traineddata will contain the new language config
|
||||||
|
// and unichar ambigs, plus all the original DAWGs, classifier templates, etc.
|
||||||
|
//
|
||||||
|
// Note: the file names of the files to extract to and to overwrite from should
|
||||||
|
// have the appropriate file suffixes (extensions) indicating their tessdata
|
||||||
|
// component type (.unicharset for the unicharset, .unicharambigs for unichar
|
||||||
|
// ambigs, etc). See k*FileSuffix variable in ccutil/tessdatamanager.h.
|
||||||
|
//
|
||||||
|
// Specify option -u to unpack all the components to the specified path:
|
||||||
|
//
|
||||||
|
// combine_tessdata -u tessdata/eng.traineddata /home/$USER/temp/eng.
|
||||||
|
//
|
||||||
|
// This will create /home/$USER/temp/eng.* files with individual tessdata
|
||||||
|
// components from tessdata/eng.traineddata.
|
||||||
|
//
|
||||||
int main(int argc, char **argv) {
|
int main(int argc, char **argv) {
|
||||||
if (!(argc == 2)) {
|
int i;
|
||||||
printf("Usage: %s language_data_path_prefix (e.g. tessdata/eng.)", argv[0]);
|
if (argc == 2) {
|
||||||
|
printf("Combininig tessdata files\n");
|
||||||
|
STRING output_file = argv[1];
|
||||||
|
output_file += kTrainedDataSuffix;
|
||||||
|
if (!tesseract::TessdataManager::CombineDataFiles(
|
||||||
|
argv[1], output_file.string())) {
|
||||||
|
printf("Error combining tessdata files into %s\n",
|
||||||
|
output_file.string());
|
||||||
|
}
|
||||||
|
} else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 ||
|
||||||
|
strcmp(argv[1], "-u") == 0)) {
|
||||||
|
// Initialize TessdataManager with the data in the given traineddata file.
|
||||||
|
tesseract::TessdataManager tm;
|
||||||
|
tm.Init(argv[2]);
|
||||||
|
printf("Extracting tessdata components from %s\n", argv[2]);
|
||||||
|
if (strcmp(argv[1], "-e") == 0) {
|
||||||
|
for (i = 3; i < argc; ++i) {
|
||||||
|
if (tm.ExtractToFile(argv[i])) {
|
||||||
|
printf("Wrote %s\n", argv[i]);
|
||||||
|
} else {
|
||||||
|
printf("Not extracting %s, since this component"
|
||||||
|
" is not present\n", argv[i]);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
} else { // extract all the components
|
||||||
|
for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
|
||||||
|
STRING filename = argv[3];
|
||||||
|
filename += tesseract::kTessdataFileSuffixes[i];
|
||||||
|
if (tm.ExtractToFile(filename.string())) {
|
||||||
|
printf("Wrote %s\n", filename.string());
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
tm.End();
|
||||||
|
} else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
|
||||||
|
// Rename the current traineddata file to a temporary name.
|
||||||
|
const char *new_traineddata_filename = argv[2];
|
||||||
|
STRING traineddata_filename = new_traineddata_filename;
|
||||||
|
traineddata_filename += ".__tmp__";
|
||||||
|
if (rename(new_traineddata_filename, traineddata_filename.string()) != 0) {
|
||||||
|
tprintf("Failed to create a temporary file %s\n",
|
||||||
|
traineddata_filename.string());
|
||||||
|
exit(1);
|
||||||
|
}
|
||||||
|
|
||||||
|
// Initialize TessdataManager with the data in the given traineddata file.
|
||||||
|
tesseract::TessdataManager tm;
|
||||||
|
tm.Init(traineddata_filename.string());
|
||||||
|
|
||||||
|
// Write the updated traineddata file.
|
||||||
|
tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3);
|
||||||
|
tm.End();
|
||||||
|
} else {
|
||||||
|
printf("Usage for combining tessdata components:\n"
|
||||||
|
"%s language_data_path_prefix (e.g. tessdata/eng.)\n", argv[0]);
|
||||||
|
printf("Usage for extracting tessdata components:\n"
|
||||||
|
"%s -e traineddata_file [output_component_file...]\n", argv[0]);
|
||||||
|
printf("Usage for overwriting tessdata components:\n"
|
||||||
|
"%s -o traineddata_file [input_component_file...]\n", argv[0]);
|
||||||
|
printf("Usage for unpacking all tessdata components:\n"
|
||||||
|
"%s -u traineddata_file output_path_prefix"
|
||||||
|
" (e.g. /tmp/eng.)\n", argv[0]);
|
||||||
return 1;
|
return 1;
|
||||||
}
|
}
|
||||||
STRING output_file = argv[1];
|
|
||||||
output_file += kTrainedDataSuffix;
|
|
||||||
tesseract::TessdataManager::CombineDataFiles(argv[1], output_file.string());
|
|
||||||
}
|
}
|
||||||
|
Loading…
Reference in New Issue
Block a user