List available languages recursively

Tesseract supports hierarchies of languages and has used them since
the new best/*.traineddata files were added.

Now `tesseract --list-langs` also shows any traineddata files in
subdirectories of the tessdata directory.

Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
Stefan Weil 2017-08-10 08:27:43 +02:00
parent efa50daf5a
commit cc0d87c5b8

View File

@ -39,6 +39,9 @@
#include <dirent.h>
#include <libgen.h>
#include <string.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#endif // _WIN32
#include <fstream>
@ -109,6 +112,65 @@ const int kMaxCredibleResolution = 2400;
/** Default resolution. */
const int kDefaultResolution = 300;
/* Add all available languages recursively.
*/
static void addAvailableLanguages(const STRING &datadir, const STRING &base,
GenericVector<STRING>* langs)
{
const STRING base2 = (base.string()[0] == '\0') ? base : base + "/";
const size_t extlen = sizeof(kTrainedDataSuffix);
#ifdef _WIN32
WIN32_FIND_DATA data;
HANDLE handle = FindFirstFile((datadir + base2 + "*").string(), &data);
if (handle != INVALID_HANDLE_VALUE) {
BOOL result = TRUE;
for (; result;) {
char *name = data.cFileName;
// Skip '.', '..', and hidden files
if (name[0] != '.') {
if ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) ==
FILE_ATTRIBUTE_DIRECTORY) {
addAvailableLanguages(datadir, base2 + name, langs);
} else {
size_t len = strlen(name);
if (len > extlen && name[len - extlen] == '.' &&
strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
name[len - extlen] = '\0';
langs->push_back(base2 + name);
}
}
}
result = FindNextFile(handle, &data);
}
FindClose(handle);
}
#else // _WIN32
DIR* dir = opendir((datadir + base).string());
if (dir != NULL) {
dirent *de;
while ((de = readdir(dir))) {
char *name = de->d_name;
// Skip '.', '..', and hidden files
if (name[0] != '.') {
struct stat st;
if (stat((datadir + base2 + name).string(), &st) == 0 &&
(st.st_mode & S_IFDIR) == S_IFDIR) {
addAvailableLanguages(datadir, base2 + name, langs);
} else {
size_t len = strlen(name);
if (len > extlen && name[len - extlen] == '.' &&
strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
name[len - extlen] = '\0';
langs->push_back(base2 + name);
}
}
}
}
closedir(dir);
}
#endif
}
TessBaseAPI::TessBaseAPI()
: tesseract_(nullptr),
osd_tesseract_(nullptr),
@ -393,45 +455,7 @@ void TessBaseAPI::GetAvailableLanguagesAsVector(
GenericVector<STRING>* langs) const {
// Fills *langs with the names of the traineddata files found under
// tesseract_->datadir. Any previous contents of *langs are discarded, and
// the vector is left empty when tesseract_ is NULL.
// NOTE(review): this text looks like a diff rendered without +/- markers;
// the platform-specific scan below is presumably the code the commit removes
// and the final addAvailableLanguages() call its replacement -- confirm
// against the repository before treating both as live code.
langs->clear();
if (tesseract_ != NULL) {
#ifdef _WIN32
// Windows: enumerate the files matching <datadir>/*.<kTrainedDataSuffix>.
STRING pattern = tesseract_->datadir + "/*." + kTrainedDataSuffix;
char fname[_MAX_FNAME];
WIN32_FIND_DATA data;
BOOL result = TRUE;
HANDLE handle = FindFirstFile(pattern.string(), &data);
if (handle != INVALID_HANDLE_VALUE) {
for (; result; result = FindNextFile(handle, &data)) {
// Keep only the file-name component (fname) as the language name.
_splitpath(data.cFileName, NULL, NULL, fname, NULL);
langs->push_back(STRING(fname));
}
FindClose(handle);
}
#else // _WIN32
// Other platforms: walk the entries of datadir (top level only).
DIR *dir;
struct dirent *dirent;
char *dot;
STRING extension = STRING(".") + kTrainedDataSuffix;
dir = opendir(tesseract_->datadir.string());
if (dir != NULL) {
while ((dirent = readdir(dir))) {
// Skip '.', '..', and hidden files
if (dirent->d_name[0] != '.') {
if (strstr(dirent->d_name, extension.string()) != NULL) {
// Compare from the last '.' in the name against the extension.
dot = strrchr(dirent->d_name, '.');
// This ensures that .traineddata is at the end of the file name
if (strncmp(dot, extension.string(),
strlen(extension.string())) == 0) {
// Cut the name at the '.' so only the stem is stored.
*dot = '\0';
langs->push_back(STRING(dirent->d_name));
}
}
}
}
closedir(dir);
}
#endif
// Recursive scan starting at the top level (empty relative path).
addAvailableLanguages(tesseract_->datadir, "", langs);
}
}