mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-12-01 07:59:05 +08:00
List available languages recursively
Tesseract supports hierarchies of languages and has used them since the new best/*.traineddata files were added. `tesseract --list-langs` now also shows any traineddata files found in subdirectories of the tessdata directory. Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
parent
efa50daf5a
commit
cc0d87c5b8
102
api/baseapi.cpp
102
api/baseapi.cpp
@ -39,6 +39,9 @@
|
|||||||
#include <dirent.h>
|
#include <dirent.h>
|
||||||
#include <libgen.h>
|
#include <libgen.h>
|
||||||
#include <string.h>
|
#include <string.h>
|
||||||
|
#include <sys/types.h>
|
||||||
|
#include <sys/stat.h>
|
||||||
|
#include <unistd.h>
|
||||||
#endif // _WIN32
|
#endif // _WIN32
|
||||||
|
|
||||||
#include <fstream>
|
#include <fstream>
|
||||||
@ -109,6 +112,65 @@ const int kMaxCredibleResolution = 2400;
|
|||||||
/** Default resolution. */
|
/** Default resolution. */
|
||||||
const int kDefaultResolution = 300;
|
const int kDefaultResolution = 300;
|
||||||
|
|
||||||
|
/* Add all available languages recursively.
|
||||||
|
*/
|
||||||
|
static void addAvailableLanguages(const STRING &datadir, const STRING &base,
|
||||||
|
GenericVector<STRING>* langs)
|
||||||
|
{
|
||||||
|
const STRING base2 = (base.string()[0] == '\0') ? base : base + "/";
|
||||||
|
const size_t extlen = sizeof(kTrainedDataSuffix);
|
||||||
|
#ifdef _WIN32
|
||||||
|
WIN32_FIND_DATA data;
|
||||||
|
HANDLE handle = FindFirstFile((datadir + base2 + "*").string(), &data);
|
||||||
|
if (handle != INVALID_HANDLE_VALUE) {
|
||||||
|
BOOL result = TRUE;
|
||||||
|
for (; result;) {
|
||||||
|
char *name = data.cFileName;
|
||||||
|
// Skip '.', '..', and hidden files
|
||||||
|
if (name[0] != '.') {
|
||||||
|
if ((data.dwFileAttributes & FILE_ATTRIBUTE_DIRECTORY) ==
|
||||||
|
FILE_ATTRIBUTE_DIRECTORY) {
|
||||||
|
addAvailableLanguages(datadir, base2 + name, langs);
|
||||||
|
} else {
|
||||||
|
size_t len = strlen(name);
|
||||||
|
if (len > extlen && name[len - extlen] == '.' &&
|
||||||
|
strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
|
||||||
|
name[len - extlen] = '\0';
|
||||||
|
langs->push_back(base2 + name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
result = FindNextFile(handle, &data);
|
||||||
|
}
|
||||||
|
FindClose(handle);
|
||||||
|
}
|
||||||
|
#else // _WIN32
|
||||||
|
DIR* dir = opendir((datadir + base).string());
|
||||||
|
if (dir != NULL) {
|
||||||
|
dirent *de;
|
||||||
|
while ((de = readdir(dir))) {
|
||||||
|
char *name = de->d_name;
|
||||||
|
// Skip '.', '..', and hidden files
|
||||||
|
if (name[0] != '.') {
|
||||||
|
struct stat st;
|
||||||
|
if (stat((datadir + base2 + name).string(), &st) == 0 &&
|
||||||
|
(st.st_mode & S_IFDIR) == S_IFDIR) {
|
||||||
|
addAvailableLanguages(datadir, base2 + name, langs);
|
||||||
|
} else {
|
||||||
|
size_t len = strlen(name);
|
||||||
|
if (len > extlen && name[len - extlen] == '.' &&
|
||||||
|
strcmp(&name[len - extlen + 1], kTrainedDataSuffix) == 0) {
|
||||||
|
name[len - extlen] = '\0';
|
||||||
|
langs->push_back(base2 + name);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
}
|
||||||
|
closedir(dir);
|
||||||
|
}
|
||||||
|
#endif
|
||||||
|
}
|
||||||
|
|
||||||
TessBaseAPI::TessBaseAPI()
|
TessBaseAPI::TessBaseAPI()
|
||||||
: tesseract_(nullptr),
|
: tesseract_(nullptr),
|
||||||
osd_tesseract_(nullptr),
|
osd_tesseract_(nullptr),
|
||||||
@ -393,45 +455,7 @@ void TessBaseAPI::GetAvailableLanguagesAsVector(
|
|||||||
GenericVector<STRING>* langs) const {
|
GenericVector<STRING>* langs) const {
|
||||||
langs->clear();
|
langs->clear();
|
||||||
if (tesseract_ != NULL) {
|
if (tesseract_ != NULL) {
|
||||||
#ifdef _WIN32
|
addAvailableLanguages(tesseract_->datadir, "", langs);
|
||||||
STRING pattern = tesseract_->datadir + "/*." + kTrainedDataSuffix;
|
|
||||||
char fname[_MAX_FNAME];
|
|
||||||
WIN32_FIND_DATA data;
|
|
||||||
BOOL result = TRUE;
|
|
||||||
HANDLE handle = FindFirstFile(pattern.string(), &data);
|
|
||||||
if (handle != INVALID_HANDLE_VALUE) {
|
|
||||||
for (; result; result = FindNextFile(handle, &data)) {
|
|
||||||
_splitpath(data.cFileName, NULL, NULL, fname, NULL);
|
|
||||||
langs->push_back(STRING(fname));
|
|
||||||
}
|
|
||||||
FindClose(handle);
|
|
||||||
}
|
|
||||||
#else // _WIN32
|
|
||||||
DIR *dir;
|
|
||||||
struct dirent *dirent;
|
|
||||||
char *dot;
|
|
||||||
|
|
||||||
STRING extension = STRING(".") + kTrainedDataSuffix;
|
|
||||||
|
|
||||||
dir = opendir(tesseract_->datadir.string());
|
|
||||||
if (dir != NULL) {
|
|
||||||
while ((dirent = readdir(dir))) {
|
|
||||||
// Skip '.', '..', and hidden files
|
|
||||||
if (dirent->d_name[0] != '.') {
|
|
||||||
if (strstr(dirent->d_name, extension.string()) != NULL) {
|
|
||||||
dot = strrchr(dirent->d_name, '.');
|
|
||||||
// This ensures that .traineddata is at the end of the file name
|
|
||||||
if (strncmp(dot, extension.string(),
|
|
||||||
strlen(extension.string())) == 0) {
|
|
||||||
*dot = '\0';
|
|
||||||
langs->push_back(STRING(dirent->d_name));
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
|
||||||
closedir(dir);
|
|
||||||
}
|
|
||||||
#endif
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
Loading…
Reference in New Issue
Block a user