From 2ad63776e55e4bd5d1ca85a1f17d69aa12d7227c Mon Sep 17 00:00:00 2001 From: "theraysmith@gmail.com" Date: Thu, 30 Jan 2014 02:20:59 +0000 Subject: [PATCH] Fixed issue 979 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@1034 d0cd1f9f-072b-0410-8dd7-cf729c803f20 --- ccmain/pagesegmain.cpp | 19 +++++++++++++------ textord/colfind.cpp | 10 ++++------ textord/colfind.h | 9 +++++++-- textord/strokewidth.cpp | 7 +++---- textord/strokewidth.h | 1 + 5 files changed, 28 insertions(+), 18 deletions(-) diff --git a/ccmain/pagesegmain.cpp b/ccmain/pagesegmain.cpp index 72b1a55b5..aaa78de83 100644 --- a/ccmain/pagesegmain.cpp +++ b/ccmain/pagesegmain.cpp @@ -326,7 +326,7 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation( if (to_block->line_size >= 2) { finder = new ColumnFinder(static_cast(to_block->line_size), blkbox.botleft(), blkbox.topright(), - source_resolution_, + source_resolution_, textord_use_cjk_fp_model, &v_lines, &h_lines, vertical_x, vertical_y); finder->SetupAndFilterNoise(*photo_mask_pix, to_block); @@ -357,13 +357,20 @@ ColumnFinder* Tesseract::SetupPageSegAndDetectOrientation( osd_margin = osd_score - osr->orientations[i]; } } + int best_script_id = osr->best_result.script_id; + const char* best_script_str = + osd_tess->unicharset.get_script_from_script_id(best_script_id); + bool cjk = best_script_id == osd_tess->unicharset.han_sid() || + best_script_id == osd_tess->unicharset.hiragana_sid() || + best_script_id == osd_tess->unicharset.katakana_sid() || + strcmp("Japanese", best_script_str) == 0 || + strcmp("Korean", best_script_str) == 0 || + strcmp("Hangul", best_script_str) == 0; + if (cjk) { + finder->set_cjk_script(true); + } if (osd_margin < min_orientation_margin) { // The margin is weak. - int best_script_id = osr->best_result.script_id; - bool cjk = (best_script_id == osd_tess->unicharset.han_sid()) || - (best_script_id == osd_tess->unicharset.hiragana_sid()) || - (best_script_id == osd_tess->unicharset.katakana_sid()); - if (!cjk && !vertical_text && osd_orientation == 2) { // upside down latin text is improbable with such a weak margin. tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: " diff --git a/textord/colfind.cpp b/textord/colfind.cpp index d3806c419..b012164ba 100644 --- a/textord/colfind.cpp +++ b/textord/colfind.cpp @@ -83,11 +83,12 @@ ScrollView* ColumnFinder::blocks_win_ = NULL; // the sum logical vertical vector produced by LineFinder::FindVerticalLines. ColumnFinder::ColumnFinder(int gridsize, const ICOORD& bleft, const ICOORD& tright, - int resolution, + int resolution, bool cjk_script, TabVector_LIST* vlines, TabVector_LIST* hlines, int vertical_x, int vertical_y) : TabFind(gridsize, bleft, tright, vlines, vertical_x, vertical_y, resolution), + cjk_script_(cjk_script), min_gutter_width_(static_cast(kMinGutterWidthGrid * gridsize)), mean_column_gap_(tright.x() - bleft.x()), reskew_(1.0f, 0.0f), rotation_(1.0f, 0.0f), rerotate_(1.0f, 0.0f), @@ -169,10 +170,7 @@ void ColumnFinder::SetupAndFilterNoise(Pix* photo_mask_pix, // Remove obvious noise and make the initial non-text map. nontext_map_ = nontext_detect.ComputeNonTextMask(textord_debug_tabfind, photo_mask_pix, input_block); - // TODO(rays) experiment with making broken CJK fixing dependent on the - // language, and keeping the merged blobs on output instead of exploding at - // ColPartition::MakeBlock. - stroke_width_->FindTextlineDirectionAndFixBrokenCJK(true, input_block); + stroke_width_->FindTextlineDirectionAndFixBrokenCJK(cjk_script_, input_block); // Clear the strokewidth grid ready for rotation or leader finding. stroke_width_->Clear(); } @@ -297,7 +295,7 @@ int ColumnFinder::FindBlocks(PageSegMode pageseg_mode, FindInitialTabVectors(NULL, min_gutter_width_, input_block); SetBlockRuleEdges(input_block); stroke_width_->GradeBlobsIntoPartitions(rerotate_, input_block, nontext_map_, - denorm_, &projection_, + denorm_, cjk_script_, &projection_, &part_grid_, &big_parts_); if (!PSM_SPARSE(pageseg_mode)) { ImageFind::FindImagePartitions(photo_mask_pix, rotation_, rerotate_, diff --git a/textord/colfind.h b/textord/colfind.h index 17daa38e0..b532bee96 100644 --- a/textord/colfind.h +++ b/textord/colfind.h @@ -58,8 +58,8 @@ class ColumnFinder : public TabFind { // vlines is a (possibly empty) list of TabVector and vertical_x and y are // the sum logical vertical vector produced by LineFinder::FindVerticalLines. ColumnFinder(int gridsize, const ICOORD& bleft, const ICOORD& tright, - int resolution, TabVector_LIST* vlines, TabVector_LIST* hlines, - int vertical_x, int vertical_y); + int resolution, bool cjk_script, TabVector_LIST* vlines, + TabVector_LIST* hlines, int vertical_x, int vertical_y); virtual ~ColumnFinder(); // Accessors for testing @@ -69,6 +69,9 @@ class ColumnFinder : public TabFind { const TextlineProjection* projection() const { return &projection_; } + void set_cjk_script(bool is_cjk) { + cjk_script_ = is_cjk; + } // ====================================================================== // The main function of ColumnFinder is broken into pieces to facilitate @@ -284,6 +287,8 @@ class ColumnFinder : public TabFind { // them sit in the rotated block. FCOORD ComputeBlockAndClassifyRotation(BLOCK* block); + // True if this is most likely a cjk page with rectangular characters. + bool cjk_script_; // The minimum gutter width to apply for finding columns. // Modified when vertical text is detected to prevent detection of // vertical text lines as columns. diff --git a/textord/strokewidth.cpp b/textord/strokewidth.cpp index 40b3283de..d60f5caeb 100644 --- a/textord/strokewidth.cpp +++ b/textord/strokewidth.cpp @@ -356,6 +356,7 @@ void StrokeWidth::GradeBlobsIntoPartitions(const FCOORD& rerotation, TO_BLOCK* block, Pix* nontext_pix, const DENORM* denorm, + bool cjk_script, TextlineProjection* projection, ColPartitionGrid* part_grid, ColPartition_LIST* big_parts) { @@ -367,10 +368,8 @@ void StrokeWidth::GradeBlobsIntoPartitions(const FCOORD& rerotation, // Setup the strokewidth grid with the remaining non-noise, non-leader blobs. InsertBlobs(block); - // Run FixBrokenCJK() again if the page is rotated and the blobs - // lists are reset and re-flitered, because we may have some new - // blobs in the medium blob list. - if (rerotation_.x() != 1.0f || rerotation_.y() != 0.0f) { + // Run FixBrokenCJK() again if the page is CJK. + if (cjk_script) { FixBrokenCJK(block); } FindTextlineFlowDirection(true); diff --git a/textord/strokewidth.h b/textord/strokewidth.h index 4d06be5c5..552e23a52 100644 --- a/textord/strokewidth.h +++ b/textord/strokewidth.h @@ -112,6 +112,7 @@ class StrokeWidth : public BlobGrid { TO_BLOCK* block, Pix* nontext_pix, const DENORM* denorm, + bool cjk_script, TextlineProjection* projection, ColPartitionGrid* part_grid, ColPartition_LIST* big_parts);