tesseract/training/ambiguous_words.cpp

78 lines
2.7 KiB
C++
Raw Normal View History

///////////////////////////////////////////////////////////////////////
// File: ambiguous_words.cpp
// Description: A program that takes a text file with a list of words as
// input (one per line) and outputs a file with the words
// that were found in the dictionary followed by the words
// that are ambiguous to them.
// Author: Rika Antonova
// Created: Fri Oct 21 11:26:43 PDT 2011
//
// (C) Copyright 2011, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////
//
#include <stdio.h>
#include "baseapi.h"
#include "helpers.h"
#include "strngs.h"
#include "dict.h"
#include "tesseractclass.h"
int main(int argc, char** argv) {
// Parse input arguments.
if (argc != 4 && (argc != 6 || strcmp(argv[1], "-l") != 0)) {
printf("Usage: %s [-l lang] tessdata_dir wordlist_file"
" output_ambiguious_wordlist_file\n", argv[0]);
return 1;
}
int argv_offset = 0;
STRING lang;
if (argc == 6) {
lang = argv[2];
argv_offset = 2;
} else {
lang = "eng";
}
const char *tessdata_dir = argv[++argv_offset];
const char *input_file_str = argv[++argv_offset];
const char *output_file_str = argv[++argv_offset];
// Initialize Tesseract.
tesseract::TessBaseAPI api;
GenericVector<STRING> vars_vec;
GenericVector<STRING> vars_values;
vars_vec.push_back("output_ambig_words_file");
vars_values.push_back(output_file_str);
api.Init(tessdata_dir, lang.string(), tesseract::OEM_TESSERACT_ONLY,
nullptr, 0, &vars_vec, &vars_values, false);
tesseract::Dict &dict = api.tesseract()->getDict();
FILE *input_file = fopen(input_file_str, "rb");
if (input_file == nullptr) {
tprintf("Failed to open input wordlist file %s\n", input_file_str);
exit(1);
}
char str[CHARS_PER_LINE];
// Read word list and call Dict::NoDangerousAmbig() for each word
// to record ambiguities in the output file.
while (fgets(str, CHARS_PER_LINE, input_file) != nullptr) {
chomp_string(str); // remove newline
WERD_CHOICE word(str, dict.getUnicharset());
dict.NoDangerousAmbig(&word, nullptr, false, nullptr);
}
// Clean up.
fclose(input_file);
}