mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-07 18:02:40 +08:00
Fixed issue 979
git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1034 d0cd1f9f-072b-0410-8dd7-cf729c803f20
This commit is contained in:
parent
d1e4f27acb
commit
2ad63776e5
@ -326,7 +326,7 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
|
||||
if (to_block->line_size >= 2) {
|
||||
finder = new ColumnFinder(static_cast<int>(to_block->line_size),
|
||||
blkbox.botleft(), blkbox.topright(),
|
||||
source_resolution_,
|
||||
source_resolution_, textord_use_cjk_fp_model,
|
||||
&v_lines, &h_lines, vertical_x, vertical_y);
|
||||
|
||||
finder->SetupAndFilterNoise(*photo_mask_pix, to_block);
|
||||
@ -357,13 +357,20 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation(
|
||||
osd_margin = osd_score - osr->orientations[i];
|
||||
}
|
||||
}
|
||||
int best_script_id = osr->best_result.script_id;
|
||||
const char* best_script_str =
|
||||
osd_tess->unicharset.get_script_from_script_id(best_script_id);
|
||||
bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
|
||||
best_script_id == osd_tess->unicharset.hiragana_sid() ||
|
||||
best_script_id == osd_tess->unicharset.katakana_sid() ||
|
||||
strcmp("Japanese", best_script_str) == 0 ||
|
||||
strcmp("Korean", best_script_str) == 0 ||
|
||||
strcmp("Hangul", best_script_str) == 0;
|
||||
if (cjk) {
|
||||
finder->set_cjk_script(true);
|
||||
}
|
||||
if (osd_margin < min_orientation_margin) {
|
||||
// The margin is weak.
|
||||
int best_script_id = osr->best_result.script_id;
|
||||
bool cjk = (best_script_id == osd_tess->unicharset.han_sid()) ||
|
||||
(best_script_id == osd_tess->unicharset.hiragana_sid()) ||
|
||||
(best_script_id == osd_tess->unicharset.katakana_sid());
|
||||
|
||||
if (!cjk && !vertical_text && osd_orientation == 2) {
|
||||
// upside down latin text is improbable with such a weak margin.
|
||||
tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
|
||||
|
@ -83,11 +83,12 @@ ScrollView* ColumnFinder::blocks_win_ = NULL;
|
||||
// the sum logical vertical vector produced by LineFinder::FindVerticalLines.
|
||||
ColumnFinder::ColumnFinder(int gridsize,
|
||||
const ICOORD& bleft, const ICOORD& tright,
|
||||
int resolution,
|
||||
int resolution, bool cjk_script,
|
||||
TabVector_LIST* vlines, TabVector_LIST* hlines,
|
||||
int vertical_x, int vertical_y)
|
||||
: TabFind(gridsize, bleft, tright, vlines, vertical_x, vertical_y,
|
||||
resolution),
|
||||
cjk_script_(cjk_script),
|
||||
min_gutter_width_(static_cast<int>(kMinGutterWidthGrid * gridsize)),
|
||||
mean_column_gap_(tright.x() - bleft.x()),
|
||||
reskew_(1.0f, 0.0f), rotation_(1.0f, 0.0f), rerotate_(1.0f, 0.0f),
|
||||
@ -169,10 +170,7 @@ void ColumnFinder::SetupAndFilterNoise(Pix* photo_mask_pix,
|
||||
// Remove obvious noise and make the initial non-text map.
|
||||
nontext_map_ = nontext_detect.ComputeNonTextMask(textord_debug_tabfind,
|
||||
photo_mask_pix, input_block);
|
||||
// TODO(rays) experiment with making broken CJK fixing dependent on the
|
||||
// language, and keeping the merged blobs on output instead of exploding at
|
||||
// ColPartition::MakeBlock.
|
||||
stroke_width_->FindTextlineDirectionAndFixBrokenCJK(true, input_block);
|
||||
stroke_width_->FindTextlineDirectionAndFixBrokenCJK(cjk_script_, input_block);
|
||||
// Clear the strokewidth grid ready for rotation or leader finding.
|
||||
stroke_width_->Clear();
|
||||
}
|
||||
@ -297,7 +295,7 @@ int ColumnFinder::FindBlocks(PageSegMode pageseg_mode,
|
||||
FindInitialTabVectors(NULL, min_gutter_width_, input_block);
|
||||
SetBlockRuleEdges(input_block);
|
||||
stroke_width_->GradeBlobsIntoPartitions(rerotate_, input_block, nontext_map_,
|
||||
denorm_, &projection_,
|
||||
denorm_, cjk_script_, &projection_,
|
||||
&part_grid_, &big_parts_);
|
||||
if (!PSM_SPARSE(pageseg_mode)) {
|
||||
ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_,
|
||||
|
@ -58,8 +58,8 @@ class ColumnFinder : public TabFind {
|
||||
// vlines is a (possibly empty) list of TabVector and vertical_x and y are
|
||||
// the sum logical vertical vector produced by LineFinder::FindVerticalLines.
|
||||
ColumnFinder(int gridsize, const ICOORD& bleft, const ICOORD& tright,
|
||||
int resolution, TabVector_LIST* vlines, TabVector_LIST* hlines,
|
||||
int vertical_x, int vertical_y);
|
||||
int resolution, bool cjk_script, TabVector_LIST* vlines,
|
||||
TabVector_LIST* hlines, int vertical_x, int vertical_y);
|
||||
virtual ~ColumnFinder();
|
||||
|
||||
// Accessors for testing
|
||||
@ -69,6 +69,9 @@ class ColumnFinder : public TabFind {
|
||||
// Read-only accessor: returns the address of the projection_ member.
const TextlineProjection* projection() const {
|
||||
return &projection_;
|
||||
}
|
||||
// Overwrites cjk_script_ (the "most likely a cjk page" flag) with is_cjk.
void set_cjk_script(bool is_cjk) {
|
||||
cjk_script_ = is_cjk;
|
||||
}
|
||||
|
||||
// ======================================================================
|
||||
// The main function of ColumnFinder is broken into pieces to facilitate
|
||||
@ -284,6 +287,8 @@ class ColumnFinder : public TabFind {
|
||||
// them sit in the rotated block.
|
||||
FCOORD ComputeBlockAndClassifyRotation(BLOCK* block);
|
||||
|
||||
// True if this is most likely a cjk page with rectangular characters.
|
||||
bool cjk_script_;
|
||||
// The minimum gutter width to apply for finding columns.
|
||||
// Modified when vertical text is detected to prevent detection of
|
||||
// vertical text lines as columns.
|
||||
|
@ -356,6 +356,7 @@ void StrokeWidth::GradeBlobsIntoPartitions(const FCOORD& rerotation,
|
||||
TO_BLOCK* block,
|
||||
Pix* nontext_pix,
|
||||
const DENORM* denorm,
|
||||
bool cjk_script,
|
||||
TextlineProjection* projection,
|
||||
ColPartitionGrid* part_grid,
|
||||
ColPartition_LIST* big_parts) {
|
||||
@ -367,10 +368,8 @@ void StrokeWidth::GradeBlobsIntoPartitions(const FCOORD& rerotation,
|
||||
// Setup the strokewidth grid with the remaining non-noise, non-leader blobs.
|
||||
InsertBlobs(block);
|
||||
|
||||
// Run FixBrokenCJK() again if the page is rotated and the blobs
|
||||
// lists are reset and re-filtered, because we may have some new
|
||||
// blobs in the medium blob list.
|
||||
if (rerotation_.x() != 1.0f || rerotation_.y() != 0.0f) {
|
||||
// Run FixBrokenCJK() again if the page is CJK.
|
||||
if (cjk_script) {
|
||||
FixBrokenCJK(block);
|
||||
}
|
||||
FindTextlineFlowDirection(true);
|
||||
|
@ -112,6 +112,7 @@ class StrokeWidth : public BlobGrid {
|
||||
TO_BLOCK* block,
|
||||
Pix* nontext_pix,
|
||||
const DENORM* denorm,
|
||||
bool cjk_script,
|
||||
TextlineProjection* projection,
|
||||
ColPartitionGrid* part_grid,
|
||||
ColPartition_LIST* big_parts);
|
||||
|
Loading…
Reference in New Issue
Block a user