2019-06-14 22:00:20 +08:00
|
|
|
// (C) Copyright 2017, Google Inc.
|
|
|
|
// Licensed under the Apache License, Version 2.0 (the "License");
|
|
|
|
// you may not use this file except in compliance with the License.
|
|
|
|
// You may obtain a copy of the License at
|
|
|
|
// http://www.apache.org/licenses/LICENSE-2.0
|
|
|
|
// Unless required by applicable law or agreed to in writing, software
|
|
|
|
// distributed under the License is distributed on an "AS IS" BASIS,
|
|
|
|
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
|
|
|
// See the License for the specific language governing permissions and
|
|
|
|
// limitations under the License.
|
2018-08-24 21:07:48 +08:00
|
|
|
|
2021-02-13 17:17:20 +08:00
|
|
|
#include <allheaders.h>
|
2019-10-30 01:01:18 +08:00
|
|
|
#include <tesseract/baseapi.h>
|
2024-08-26 00:57:22 +08:00
|
|
|
#include <filesystem>
|
2021-03-13 05:06:34 +08:00
|
|
|
#include <string>
|
2020-12-31 16:03:56 +08:00
|
|
|
#include "helpers.h"
|
2019-06-14 22:00:20 +08:00
|
|
|
#include "include_gunit.h"
|
2021-04-01 03:39:43 +08:00
|
|
|
#include "image.h"
|
2021-03-13 05:06:34 +08:00
|
|
|
#include "log.h"
|
2018-08-24 21:07:48 +08:00
|
|
|
|
2020-12-27 17:41:48 +08:00
|
|
|
namespace tesseract {
|
2018-08-24 21:07:48 +08:00
|
|
|
|
|
|
|
// The fixture for testing Tesseract.
|
|
|
|
class PageSegModeTest : public testing::Test {
|
2021-03-13 05:06:34 +08:00
|
|
|
protected:
|
2019-06-14 22:00:20 +08:00
|
|
|
PageSegModeTest() = default;
|
2021-03-22 15:26:05 +08:00
|
|
|
~PageSegModeTest() override {
|
2021-04-01 03:39:43 +08:00
|
|
|
src_pix_.destroy();
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
|
|
|
|
2019-06-14 22:00:20 +08:00
|
|
|
void SetUp() override {
|
|
|
|
static std::locale system_locale("");
|
|
|
|
std::locale::global(system_locale);
|
|
|
|
}
|
2018-08-24 21:07:48 +08:00
|
|
|
|
2021-03-13 05:06:34 +08:00
|
|
|
void SetImage(const char *filename) {
|
2021-04-01 03:39:43 +08:00
|
|
|
src_pix_.destroy();
|
2019-06-14 22:00:20 +08:00
|
|
|
src_pix_ = pixRead(filename);
|
|
|
|
api_.Init(TESSDATA_DIR, "eng", tesseract::OEM_TESSERACT_ONLY);
|
2018-08-24 21:07:48 +08:00
|
|
|
api_.SetImage(src_pix_);
|
|
|
|
}
|
|
|
|
|
|
|
|
// Tests that the given rectangle produces exactly the given text in the
|
|
|
|
// given segmentation mode (after chopping off the last 2 newlines.)
|
2021-03-13 05:06:34 +08:00
|
|
|
void VerifyRectText(tesseract::PageSegMode mode, const char *str, int left, int top, int width,
|
|
|
|
int height) {
|
2018-08-24 21:07:48 +08:00
|
|
|
api_.SetPageSegMode(mode);
|
|
|
|
api_.SetRectangle(left, top, width, height);
|
2021-03-13 05:06:34 +08:00
|
|
|
char *result = api_.GetUTF8Text();
|
2018-08-24 21:07:48 +08:00
|
|
|
chomp_string(result);
|
|
|
|
chomp_string(result);
|
|
|
|
EXPECT_STREQ(str, result);
|
2018-09-29 15:19:13 +08:00
|
|
|
delete[] result;
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
// Tests that the given rectangle does NOT produce the given text in the
|
|
|
|
// given segmentation mode.
|
2021-03-13 05:06:34 +08:00
|
|
|
void NotRectText(tesseract::PageSegMode mode, const char *str, int left, int top, int width,
|
|
|
|
int height) {
|
2018-08-24 21:07:48 +08:00
|
|
|
api_.SetPageSegMode(mode);
|
|
|
|
api_.SetRectangle(left, top, width, height);
|
2021-03-13 05:06:34 +08:00
|
|
|
char *result = api_.GetUTF8Text();
|
2018-08-24 21:07:48 +08:00
|
|
|
EXPECT_STRNE(str, result);
|
2018-09-29 15:19:13 +08:00
|
|
|
delete[] result;
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
|
|
|
|
2021-04-01 03:39:43 +08:00
|
|
|
Image src_pix_ = nullptr;
|
2019-06-14 22:00:20 +08:00
|
|
|
std::string ocr_text_;
|
2018-08-24 21:07:48 +08:00
|
|
|
tesseract::TessBaseAPI api_;
|
|
|
|
};
|
|
|
|
|
|
|
|
// Tests the single-word segmentation mode, and that it performs correctly
|
|
|
|
// and differently to line and block mode.
|
|
|
|
TEST_F(PageSegModeTest, WordTest) {
|
2019-06-14 22:00:20 +08:00
|
|
|
std::string filename = file::JoinPath(TESTING_DIR, "segmodeimg.tif");
|
2024-08-26 00:57:22 +08:00
|
|
|
if (!std::filesystem::exists(filename)) {
|
2019-06-14 22:00:20 +08:00
|
|
|
LOG(INFO) << "Skip test because of missing " << filename << '\n';
|
|
|
|
GTEST_SKIP();
|
|
|
|
} else {
|
|
|
|
SetImage(filename.c_str());
|
|
|
|
// Test various rectangles around the inverse page number.
|
2019-07-09 17:48:56 +08:00
|
|
|
VerifyRectText(tesseract::PSM_SINGLE_WORD, "183", 1419, 264, 69, 34);
|
|
|
|
VerifyRectText(tesseract::PSM_SINGLE_WORD, "183", 1411, 252, 78, 62);
|
|
|
|
VerifyRectText(tesseract::PSM_SINGLE_WORD, "183", 1396, 218, 114, 102);
|
2019-06-14 22:00:20 +08:00
|
|
|
// Test a random pair of words as a line
|
2021-03-13 05:06:34 +08:00
|
|
|
VerifyRectText(tesseract::PSM_SINGLE_LINE, "What should", 237, 393, 256, 36);
|
2021-11-04 19:49:32 +08:00
|
|
|
#ifdef DISABLED_LEGACY_ENGINE
|
|
|
|
// Skip check as LSTM mode adds a space.
|
|
|
|
LOG(INFO) << "Skip `Whatshould` test in LSTM Mode\n";
|
|
|
|
#else
|
2019-06-14 22:00:20 +08:00
|
|
|
// Test a random pair of words as a word
|
2021-03-13 05:06:34 +08:00
|
|
|
VerifyRectText(tesseract::PSM_SINGLE_WORD, "Whatshould", 237, 393, 256, 36);
|
2021-11-04 19:49:32 +08:00
|
|
|
#endif
|
2019-06-14 22:00:20 +08:00
|
|
|
// Test single block mode.
|
2021-03-13 05:06:34 +08:00
|
|
|
VerifyRectText(tesseract::PSM_SINGLE_BLOCK, "both the\nfrom the", 237, 450, 172, 94);
|
2019-06-14 22:00:20 +08:00
|
|
|
// But doesn't work in line or word mode.
|
2021-03-13 05:06:34 +08:00
|
|
|
NotRectText(tesseract::PSM_SINGLE_LINE, "both the\nfrom the", 237, 450, 172, 94);
|
|
|
|
NotRectText(tesseract::PSM_SINGLE_WORD, "both the\nfrom the", 237, 450, 172, 94);
|
2019-06-14 22:00:20 +08:00
|
|
|
}
|
2018-08-24 21:07:48 +08:00
|
|
|
}
|
|
|
|
|
2021-03-13 05:06:34 +08:00
|
|
|
} // namespace tesseract
|