2018-08-24 21:07:48 +08:00
|
|
|
|
|
|
|
#include <string>
|
|
|
|
#include "leptonica/include/allheaders.h"
|
|
|
|
#include "tesseract/api/baseapi.h"
|
|
|
|
#include "tesseract/ccutil/helpers.h"
|
|
|
|
|
|
|
|
namespace {
|
|
|
|
|
|
|
|
// The fixture for testing Tesseract.
|
|
|
|
class PageSegModeTest : public testing::Test {
|
|
|
|
protected:
|
|
|
|
string TestDataNameToPath(const string& name) {
|
2018-09-29 15:19:13 +08:00
|
|
|
return file::JoinPath(FLAGS_test_srcdir, "testdata/" + name);
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
|
|
|
string TessdataPath() {
|
2018-09-29 15:19:13 +08:00
|
|
|
return file::JoinPath(FLAGS_test_srcdir, "tessdata");
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
|
|
|
|
2018-09-29 15:27:12 +08:00
|
|
|
PageSegModeTest() { src_pix_ = nullptr; }
|
2018-09-29 15:19:13 +08:00
|
|
|
~PageSegModeTest() { pixDestroy(&src_pix_); }
|
2018-08-24 21:07:48 +08:00
|
|
|
|
|
|
|
void SetImage(const char* filename) {
|
|
|
|
pixDestroy(&src_pix_);
|
|
|
|
src_pix_ = pixRead(TestDataNameToPath(filename).c_str());
|
|
|
|
api_.Init(TessdataPath().c_str(), "eng", tesseract::OEM_TESSERACT_ONLY);
|
|
|
|
api_.SetImage(src_pix_);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tests that the given rectangle produces exactly the given text in the
|
|
|
|
// given segmentation mode (after chopping off the last 2 newlines.)
|
2018-09-29 15:19:13 +08:00
|
|
|
void VerifyRectText(tesseract::PageSegMode mode, const char* str, int left,
|
|
|
|
int top, int width, int height) {
|
2018-08-24 21:07:48 +08:00
|
|
|
api_.SetPageSegMode(mode);
|
|
|
|
api_.SetRectangle(left, top, width, height);
|
|
|
|
char* result = api_.GetUTF8Text();
|
|
|
|
chomp_string(result);
|
|
|
|
chomp_string(result);
|
|
|
|
EXPECT_STREQ(str, result);
|
2018-09-29 15:19:13 +08:00
|
|
|
delete[] result;
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Tests that the given rectangle does NOT produce the given text in the
|
|
|
|
// given segmentation mode.
|
2018-09-29 15:19:13 +08:00
|
|
|
void NotRectText(tesseract::PageSegMode mode, const char* str, int left,
|
|
|
|
int top, int width, int height) {
|
2018-08-24 21:07:48 +08:00
|
|
|
api_.SetPageSegMode(mode);
|
|
|
|
api_.SetRectangle(left, top, width, height);
|
|
|
|
char* result = api_.GetUTF8Text();
|
|
|
|
EXPECT_STRNE(str, result);
|
2018-09-29 15:19:13 +08:00
|
|
|
delete[] result;
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
Pix* src_pix_;
|
|
|
|
string ocr_text_;
|
|
|
|
tesseract::TessBaseAPI api_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Tests the single-word segmentation mode, and that it performs correctly
|
|
|
|
// and differently to line and block mode.
|
|
|
|
TEST_F(PageSegModeTest, WordTest) {
  // Rectangle passed to the fixture helpers as left, top, width, height.
  struct Rect {
    int left;
    int top;
    int width;
    int height;
  };

  SetImage("segmodeimg.tif");

  // Progressively larger rectangles around the inverse page number; each one
  // must read as the same single word.
  const Rect page_number_rects[] = {
      {1482, 146, 72, 44},
      {1474, 134, 82, 72},
      {1459, 116, 118, 112},
  };
  for (const Rect& rect : page_number_rects) {
    VerifyRectText(tesseract::PSM_SINGLE_WORD, "183", rect.left, rect.top,
                   rect.width, rect.height);
  }

  // A random pair of words: read as a line the space is kept, read as a
  // single word the two run together.
  const Rect word_pair = {1119, 621, 245, 54};
  VerifyRectText(tesseract::PSM_SINGLE_LINE, "What should", word_pair.left,
                 word_pair.top, word_pair.width, word_pair.height);
  VerifyRectText(tesseract::PSM_SINGLE_WORD, "Whatshould", word_pair.left,
                 word_pair.top, word_pair.width, word_pair.height);

  // A two-line region is expected to match in single block mode...
  const Rect two_lines = {181, 676, 179, 104};
  const char* const two_line_text = "both the\nfrom the";
  VerifyRectText(tesseract::PSM_SINGLE_BLOCK, two_line_text, two_lines.left,
                 two_lines.top, two_lines.width, two_lines.height);

  // ...but must not produce that text in line or word mode.
  NotRectText(tesseract::PSM_SINGLE_LINE, two_line_text, two_lines.left,
              two_lines.top, two_lines.width, two_lines.height);
  NotRectText(tesseract::PSM_SINGLE_WORD, two_line_text, two_lines.left,
              two_lines.top, two_lines.width, two_lines.height);
}
|
|
|
|
|
|
|
|
} // namespace
|