mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-19 06:53:36 +08:00
List available languages recursively
Tesseract supports hierarchies of languages and uses them since the new files best/*.traineddata were added. Now `tesseract --list-langs` also shows any traineddata files in subdirectories of the tessdata directory. Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
parent
efa50daf5a
commit
cc0d87c5b8
102
api/baseapi.cpp
102
api/baseapi.cpp
@ -39,6 +39,9 @@
|
||||
#include <dirent.h>
|
||||
#include <libgen.h>
|
||||
#include <string.h>
|
||||
#include <sys/types.h>
|
||||
#include <sys/stat.h>
|
||||
#include <unistd.h>
|
||||
#endif // _WIN32
|
||||
|
||||
#include <fstream>
|
||||
@ -109,6 +112,65 @@ const int kMaxCredibleResolution = 2400;
|
||||
/** Default resolution. */
|
||||
const int kDefaultResolution = 300;
|
||||
|
||||
/* Add all available languages recursively.
|
||||
*/
|
||||
static void addAvailableLanguages(const STRING &datadir, const STRING &base,
|
||||
GenericVector<STRING>* langs)
|
||||
{
|
||||
const STRING base2 = (base.string()[0] == '\0') ? base : base + "/";
|
||||
const size_t extlen = sizeof(kTrainedDataSuffix);
|
||||
#ifdef _WIN32
|
||||
WIN32_FIND_DATA data;
|
||||
HANDLE handle = FindFirstFile((datadir + base2 + "*").string(), &data);
|
||||
if (handle != INVALID_HANDLE_VALUE) {
|
||||
BOOL result = TRUE;
|
||||
for (; result;) {
|
||||
char *name = data.cFileName;
|
||||
// Skip '.', '..', and hidden files
|
||||
if (name[0] != '.') {
|
||||
if ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) ==
|
||||
FILE_ATTRIBUTE_DIRECTORY) {
|
||||
addAvailableLanguages(datadir, base2 + name, langs);
|
||||
} else {
|
||||
size_t len = strlen(name);
|
||||
if (len > extlen && name[len - extlen] == '.' &&
|
||||
strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
|
||||
name[len - extlen] = '\0';
|
||||
langs->push_back(base2 + name);
|
||||
}
|
||||
}
|
||||
}
|
||||
result = FindNextFile(handle, &data);
|
||||
}
|
||||
FindClose(handle);
|
||||
}
|
||||
#else // _WIN32
|
||||
DIR* dir = opendir((datadir + base).string());
|
||||
if (dir != NULL) {
|
||||
dirent *de;
|
||||
while ((de = readdir(dir))) {
|
||||
char *name = de->d_name;
|
||||
// Skip '.', '..', and hidden files
|
||||
if (name[0] != '.') {
|
||||
struct stat st;
|
||||
if (stat((datadir + base2 + name).string(), &st) == 0 &&
|
||||
(st.st_mode & S_IFDIR) == S_IFDIR) {
|
||||
addAvailableLanguages(datadir, base2 + name, langs);
|
||||
} else {
|
||||
size_t len = strlen(name);
|
||||
if (len > extlen && name[len - extlen] == '.' &&
|
||||
strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
|
||||
name[len - extlen] = '\0';
|
||||
langs->push_back(base2 + name);
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
closedir(dir);
|
||||
}
|
||||
#endif
|
||||
}
|
||||
|
||||
TessBaseAPI::TessBaseAPI()
|
||||
: tesseract_(nullptr),
|
||||
osd_tesseract_(nullptr),
|
||||
@ -393,45 +455,7 @@ void TessBaseAPI::GetAvailableLanguagesAsVector(
|
||||
GenericVector<STRING>* langs) const {
|
||||
langs->clear();
|
||||
if (tesseract_ != NULL) {
|
||||
#ifdef _WIN32
|
||||
STRING pattern = tesseract_->datadir + "/*." + kTrainedDataSuffix;
|
||||
char fname[_MAX_FNAME];
|
||||
WIN32_FIND_DATA data;
|
||||
BOOL result = TRUE;
|
||||
HANDLE handle = FindFirstFile(pattern.string(), &data);
|
||||
if (handle != INVALID_HANDLE_VALUE) {
|
||||
for (; result; result = FindNextFile(handle, &data)) {
|
||||
_splitpath(data.cFileName, NULL, NULL, fname, NULL);
|
||||
langs->push_back(STRING(fname));
|
||||
}
|
||||
FindClose(handle);
|
||||
}
|
||||
#else // _WIN32
|
||||
DIR *dir;
|
||||
struct dirent *dirent;
|
||||
char *dot;
|
||||
|
||||
STRING extension = STRING(".") + kTrainedDataSuffix;
|
||||
|
||||
dir = opendir(tesseract_->datadir.string());
|
||||
if (dir != NULL) {
|
||||
while ((dirent = readdir(dir))) {
|
||||
// Skip '.', '..', and hidden files
|
||||
if (dirent->d_name[0] != '.') {
|
||||
if (strstr(dirent->d_name, extension.string()) != NULL) {
|
||||
dot = strrchr(dirent->d_name, '.');
|
||||
// This ensures that .traineddata is at the end of the file name
|
||||
if (strncmp(dot, extension.string(),
|
||||
strlen(extension.string())) == 0) {
|
||||
*dot = '\0';
|
||||
langs->push_back(STRING(dirent->d_name));
|
||||
}
|
||||
}
|
||||
}
|
||||
}
|
||||
closedir(dir);
|
||||
}
|
||||
#endif
|
||||
addAvailableLanguages(tesseract_->datadir, "", langs);
|
||||
}
|
||||
}
|
||||
|
||||
|
Loading…
Reference in New Issue
Block a user