///////////////////////////////////////////////////////////////////////
// File:        textord.h
// Description: The Textord class definition gathers text line and word
//              finding functionality.
// Author:      Ray Smith
// Created:     Fri Mar 13 14:29:01 PDT 2009
//
// (C) Copyright 2009, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
///////////////////////////////////////////////////////////////////////

#ifndef TESSERACT_TEXTORD_TEXTORD_H_
#define TESSERACT_TEXTORD_TEXTORD_H_

#include "ccstruct.h"
#include "bbgrid.h"
#include "blobbox.h"
#include "gap_map.h"
#include "publictypes.h"  // For PageSegMode.

class FCOORD;
class BLOCK_LIST;
class PAGE_RES;
class TO_BLOCK;
class TO_BLOCK_LIST;
class ScrollView;

namespace tesseract {

// A simple class that can be used by BBGrid to hold a word together with an
// expanded bounding box, which makes it easy to find the words that
// diacritics should be attached to.
class WordWithBox {
 public:
  // Builds an empty holder that refers to no word.
  WordWithBox() : word_(NULL) {}
  // Wraps word and caches its bounding box, padded in both directions by the
  // height of that box.
  explicit WordWithBox(WERD *word)
      : word_(word), bounding_box_(word->bounding_box()) {
    const int pad_size = bounding_box_.height();
    bounding_box_.pad(pad_size, pad_size);
  }

  // Read-only access to the wrapped word.
  const WERD *word() const { return word_; }
  // The cached, padded bounding box.
  const TBOX &bounding_box() const { return bounding_box_; }
  // Forwards to the word: the bounding box of only the good blobs.
  TBOX true_bounding_box() const { return word_->true_bounding_box(); }
  // Forwards to the word's rej_cblob_list().
  C_BLOB_LIST *RejBlobs() const { return word_->rej_cblob_list(); }

 private:
  // Non-owning pointer to a real word that lives elsewhere; the word must
  // outlive this object.
  WERD *word_;
  // Copy of the word's bounding box, expanded on all sides by its height at
  // construction time.
  TBOX bounding_box_;
};

// Make WordWithBox usable by BBGrid: CLISTIZEH is invoked on the class, and
// the WordWithBox_CLIST / WordWithBox_C_IT names used below are presumably
// the list and iterator types that the macro declares (macro definition is
// not visible in this file -- confirm).
CLISTIZEH(WordWithBox)
// Grid of WordWithBox elements.
typedef BBGrid<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT> WordGrid;
// Search object over WordWithBox elements, parameterised like WordGrid.
typedef GridSearch<WordWithBox, WordWithBox_CLIST, WordWithBox_C_IT> WordSearch;

// Textord gathers the text line and word finding functionality.
// The method declarations are grouped under banner comments that name a .cpp
// file (presumably the file that implements each group -- confirm).
// NOTE(review): member declaration order is left exactly as-is; the *_VAR_H
// entries at the bottom appear to declare data members, so their order may
// matter to the constructor defined elsewhere -- confirm before reordering.
class Textord {
 public:
  // NOTE(review): ccstruct is presumably retained as ccstruct_ (see the
  // private member below) -- confirm in the implementation.
  explicit Textord(CCStruct* ccstruct);
  ~Textord();

  // Make the textlines and words inside each block.
  // binary_pix is mandatory and is the binarized input after line removal.
  // grey_pix is optional, but if present must match the binary_pix in size,
  // and must be a *real* grey image instead of binary_pix * 255.
  // thresholds_pix is expected to be present iff grey_pix is present and
  // can be an integer factor reduction of the grey_pix. It represents the
  // thresholds that were used to create the binary_pix from the grey_pix.
  // diacritic_blobs contain small confusing components that should be added
  // to the appropriate word(s) in case they are really diacritics.
  void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width,
                   int height, Pix *binary_pix, Pix *thresholds_pix,
                   Pix *grey_pix, bool use_box_bottoms,
                   BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks,
                   TO_BLOCK_LIST *to_blocks);

  // If we were supposed to return only a single textline, and there is more
  // than one, clean up and leave only the best.
  void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES* page_res);

  // Getter for the use_cjk_fp_model_ flag.
  bool use_cjk_fp_model() const {
    return use_cjk_fp_model_;
  }
  // Setter for the use_cjk_fp_model_ flag.
  void set_use_cjk_fp_model(bool flag) {
    use_cjk_fp_model_ = flag;
  }

  // tospace.cpp ///////////////////////////////////////////
  void to_spacing(
      ICOORD page_tr,        // top right of page
      TO_BLOCK_LIST *blocks  // blocks on page
                                         );
  ROW *make_prop_words(TO_ROW *row,     // row to make
                       FCOORD rotation  // for drawing
                       );
  ROW *make_blob_words(TO_ROW *row,     // row to make
                       FCOORD rotation  // for drawing
                       );
  // tordmain.cpp ///////////////////////////////////////////
  void find_components(Pix* pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks);
  void filter_blobs(ICOORD page_tr, TO_BLOCK_LIST *blocks, BOOL8 testing_on);

 private:
  // For underlying memory management and other utilities.
  CCStruct* ccstruct_;

  // The size of the input image.
  ICOORD page_tr_;

  // Flag read and written through use_cjk_fp_model() and
  // set_use_cjk_fp_model().
  bool use_cjk_fp_model_;

  // makerow.cpp ///////////////////////////////////////////
  // Make the textlines inside each block.
  void MakeRows(PageSegMode pageseg_mode, const FCOORD& skew,
                int width, int height, TO_BLOCK_LIST* to_blocks);
  // Make the textlines inside a single block.
  void MakeBlockRows(int min_spacing, int max_spacing,
                     const FCOORD& skew, TO_BLOCK* block,
                     ScrollView* win);

  // The next three methods sit under the makerow.cpp banner but are
  // declared public.
 public:
  void compute_block_xheight(TO_BLOCK *block, float gradient);
  void compute_row_xheight(TO_ROW *row,          // row to do
                           const FCOORD& rotation,
                           float gradient,       // global skew
                           int block_line_size);
  void make_spline_rows(TO_BLOCK *block,   // block to do
                        float gradient,    // gradient to fit
                        BOOL8 testing_on);
 private:
  //// oldbasel.cpp ////////////////////////////////////////
  void make_old_baselines(TO_BLOCK *block,   // block to do
                          BOOL8 testing_on,  // correct orientation
                          float gradient);
  void correlate_lines(TO_BLOCK *block, float gradient);
  void correlate_neighbours(TO_BLOCK *block,  // block rows are in.
                            TO_ROW **rows,    // rows of block.
                            int rowcount);    // no of rows to do.
  int correlate_with_stats(TO_ROW **rows,  // rows of block.
                           int rowcount,   // no of rows to do.
                           TO_BLOCK* block);
  void find_textlines(TO_BLOCK *block,  // block row is in
                      TO_ROW *row,      // row to do
                      int degree,       // required approximation
                      QSPLINE *spline);  // starting spline
  // tospace.cpp ///////////////////////////////////////////
  // DEBUG USE ONLY
  void block_spacing_stats(TO_BLOCK *block,
                           GAPMAP *gapmap,
                           BOOL8 &old_text_ord_proportional,
                           // resulting estimate
                           inT16 &block_space_gap_width,
                           // resulting estimate
                           inT16 &block_non_space_gap_width
                           );
  void row_spacing_stats(TO_ROW *row,
                         GAPMAP *gapmap,
                         inT16 block_idx,
                         inT16 row_idx,
                         // estimate for block
                         inT16 block_space_gap_width,
                         // estimate for block
                         inT16 block_non_space_gap_width
                         );
  void old_to_method(TO_ROW *row,
                     STATS *all_gap_stats,
                     STATS *space_gap_stats,
                     STATS *small_gap_stats,
                     inT16 block_space_gap_width,
                     // estimate for block
                     inT16 block_non_space_gap_width
                     );
  BOOL8 isolated_row_stats(TO_ROW *row,
                           GAPMAP *gapmap,
                           STATS *all_gap_stats,
                           BOOL8 suspected_table,
                           inT16 block_idx,
                           inT16 row_idx);
  inT16 stats_count_under(STATS *stats, inT16 threshold);
  void improve_row_threshold(TO_ROW *row, STATS *all_gap_stats);
  // The five trailing reference parameters (blanks, fuzzy_sp, fuzzy_non,
  // prev_gap_was_a_space, break_at_next_gap) are non-const, so they can be
  // written by the callee.
  BOOL8 make_a_word_break(TO_ROW *row,   // row being made
                          TBOX blob_box, // for next_blob // how many blanks?
                          inT16 prev_gap,
                          TBOX prev_blob_box,
                          inT16 real_current_gap,
                          inT16 within_xht_current_gap,
                          TBOX next_blob_box,
                          inT16 next_gap,
                          uinT8 &blanks,
                          BOOL8 &fuzzy_sp,
                          BOOL8 &fuzzy_non,
                          BOOL8& prev_gap_was_a_space,
                          BOOL8& break_at_next_gap);
  BOOL8 narrow_blob(TO_ROW *row, TBOX blob_box);
  BOOL8 wide_blob(TO_ROW *row, TBOX blob_box);
  BOOL8 suspected_punct_blob(TO_ROW *row, TBOX box);
  // box_it is taken by value, so the caller's iterator object is not
  // modified; the three reference parameters are non-const outputs.
  void peek_at_next_gap(TO_ROW *row,
                        BLOBNBOX_IT box_it,
                        TBOX &next_blob_box,
                        inT16 &next_gap,
                        inT16 &next_within_xht_gap);
  void mark_gap(TBOX blob,    // blob following gap
                inT16 rule,  // heuristic id
                inT16 prev_gap,
                inT16 prev_blob_width,
                inT16 current_gap,
                inT16 next_blob_width,
                inT16 next_gap);
  float find_mean_blob_spacing(WERD *word);
  BOOL8 ignore_big_gap(TO_ROW *row,
                       inT32 row_length,
                       GAPMAP *gapmap,
                       inT16 left,
                       inT16 right);
  // get bounding box
  TBOX reduced_box_next(TO_ROW *row,     // current row
                        BLOBNBOX_IT *it  // iterator to blobs
                        );
  TBOX reduced_box_for_blob(BLOBNBOX *blob, TO_ROW *row, inT16 *left_above_xht);
  // tordmain.cpp ///////////////////////////////////////////
  float filter_noise_blobs(BLOBNBOX_LIST *src_list,
                           BLOBNBOX_LIST *noise_list,
                           BLOBNBOX_LIST *small_list,
                           BLOBNBOX_LIST *large_list);
  // Fixes the block so it obeys all the rules:
  // Must have at least one ROW.
  // Must have at least one WERD.
  // WERDs contain a fake blob.
  void cleanup_nontext_block(BLOCK* block);
  void cleanup_blocks(bool clean_noise, BLOCK_LIST *blocks);
  BOOL8 clean_noise_from_row(ROW *row);
  void clean_noise_from_words(ROW *row);
  // Remove outlines that are a tiny fraction in either width or height
  // of the word height.
  void clean_small_noise_from_words(ROW *row);
  // Groups blocks by rotation, then, for each group, makes a WordGrid and calls
  // TransferDiacriticsToWords to copy the diacritic blobs to the most
  // appropriate words in the group of blocks. Source blobs are not touched.
  void TransferDiacriticsToBlockGroups(BLOBNBOX_LIST* diacritic_blobs,
                                       BLOCK_LIST* blocks);
  // Places a copy of blobs that are near a word (after applying rotation to the
  // blob) in the most appropriate word, unless there is doubt, in which case a
  // blob can end up in two words. Source blobs are not touched.
  void TransferDiacriticsToWords(BLOBNBOX_LIST *diacritic_blobs,
                                 const FCOORD &rotation, WordGrid *word_grid);

 public:
  // Parameter declarations. Each *_VAR_H entry lists a name, a value and a
  // description string, grouped by the same .cpp banners as the methods.
  // NOTE(review): the *_VAR_H macro definitions are not visible in this file;
  // presumably each entry declares a member whose real default and help text
  // are set where the class is implemented -- confirm that the values and
  // strings here stay in sync with those definitions.
  // makerow.cpp ///////////////////////////////////////////
  BOOL_VAR_H(textord_single_height_mode, false,
             "Script has no xheight, so use a single mode for horizontal text");
  // tospace.cpp ///////////////////////////////////////////
  // NOTE(review): the description string on tosp_old_to_method is identical
  // to the one on tosp_use_pre_chopping below -- possibly a copy-paste;
  // confirm the intended wording.
  BOOL_VAR_H(tosp_old_to_method, false, "Space stats use prechopping?");
  BOOL_VAR_H(tosp_old_to_constrain_sp_kn, false,
             "Constrain relative values of inter and intra-word gaps for "
             "old_to_method.");
  BOOL_VAR_H(tosp_only_use_prop_rows, true,
             "Block stats to use fixed pitch rows?");
  BOOL_VAR_H(tosp_force_wordbreak_on_punct, false,
             "Force word breaks on punct to break long lines in non-space "
             "delimited langs");
  BOOL_VAR_H(tosp_use_pre_chopping, false,
             "Space stats use prechopping?");
  BOOL_VAR_H(tosp_old_to_bug_fix, false,
             "Fix suspected bug in old code");
  // NOTE(review): the next four entries all share the same description
  // string, so the text does not distinguish them.
  BOOL_VAR_H(tosp_block_use_cert_spaces, true,
             "Only stat OBVIOUS spaces");
  BOOL_VAR_H(tosp_row_use_cert_spaces, true,
             "Only stat OBVIOUS spaces");
  BOOL_VAR_H(tosp_narrow_blobs_not_cert, true,
             "Only stat OBVIOUS spaces");
  BOOL_VAR_H(tosp_row_use_cert_spaces1, true,
             "Only stat OBVIOUS spaces");
  BOOL_VAR_H(tosp_recovery_isolated_row_stats, true,
             "Use row alone when inadequate cert spaces");
  BOOL_VAR_H(tosp_only_small_gaps_for_kern, false, "Better guess");
  BOOL_VAR_H(tosp_all_flips_fuzzy, false, "Pass ANY flip to context?");
  BOOL_VAR_H(tosp_fuzzy_limit_all, true,
             "Don't restrict kn->sp fuzzy limit to tables");
  BOOL_VAR_H(tosp_stats_use_xht_gaps, true,
             "Use within xht gap for wd breaks");
  BOOL_VAR_H(tosp_use_xht_gaps, true,
             "Use within xht gap for wd breaks");
  BOOL_VAR_H(tosp_only_use_xht_gaps, false,
             "Only use within xht gap for wd breaks");
  BOOL_VAR_H(tosp_rule_9_test_punct, false,
             "Don't chng kn to space next to punct");
  BOOL_VAR_H(tosp_flip_fuzz_kn_to_sp, true, "Default flip");
  BOOL_VAR_H(tosp_flip_fuzz_sp_to_kn, true, "Default flip");
  BOOL_VAR_H(tosp_improve_thresh, false,
             "Enable improvement heuristic");
  INT_VAR_H(tosp_debug_level, 0, "Debug data");
  INT_VAR_H(tosp_enough_space_samples_for_median, 3,
            "or should we use mean");
  INT_VAR_H(tosp_redo_kern_limit, 10,
            "No.samples reqd to reestimate for row");
  INT_VAR_H(tosp_few_samples, 40,
            "No.gaps reqd with 1 large gap to treat as a table");
  INT_VAR_H(tosp_short_row, 20,
            "No.gaps reqd with few cert spaces to use certs");
  INT_VAR_H(tosp_sanity_method, 1, "How to avoid being silly");
  double_VAR_H(tosp_old_sp_kn_th_factor, 2.0,
               "Factor for defining space threshold in terms of space and "
               "kern sizes");
  double_VAR_H(tosp_threshold_bias1, 0,
               "how far between kern and space?");
  double_VAR_H(tosp_threshold_bias2, 0,
               "how far between kern and space?");
  double_VAR_H(tosp_narrow_fraction, 0.3,
               "Fract of xheight for narrow");
  double_VAR_H(tosp_narrow_aspect_ratio, 0.48,
               "narrow if w/h less than this");
  double_VAR_H(tosp_wide_fraction, 0.52, "Fract of xheight for wide");
  double_VAR_H(tosp_wide_aspect_ratio, 0.0,
               "wide if w/h less than this");
  double_VAR_H(tosp_fuzzy_space_factor, 0.6,
               "Fract of xheight for fuzz sp");
  double_VAR_H(tosp_fuzzy_space_factor1, 0.5,
               "Fract of xheight for fuzz sp");
  double_VAR_H(tosp_fuzzy_space_factor2, 0.72,
               "Fract of xheight for fuzz sp");
  double_VAR_H(tosp_gap_factor, 0.83, "gap ratio to flip sp->kern");
  double_VAR_H(tosp_kern_gap_factor1, 2.0,
               "gap ratio to flip kern->sp");
  double_VAR_H(tosp_kern_gap_factor2, 1.3,
               "gap ratio to flip kern->sp");
  double_VAR_H(tosp_kern_gap_factor3, 2.5,
               "gap ratio to flip kern->sp");
  double_VAR_H(tosp_ignore_big_gaps, -1, "xht multiplier");
  double_VAR_H(tosp_ignore_very_big_gaps, 3.5, "xht multiplier");
  double_VAR_H(tosp_rep_space, 1.6, "rep gap multiplier for space");
  double_VAR_H(tosp_enough_small_gaps, 0.65,
               "Fract of kerns reqd for isolated row stats");
  double_VAR_H(tosp_table_kn_sp_ratio, 2.25,
               "Min difference of kn & sp in table");
  double_VAR_H(tosp_table_xht_sp_ratio, 0.33,
               "Expect spaces bigger than this");
  double_VAR_H(tosp_table_fuzzy_kn_sp_ratio, 3.0,
               "Fuzzy if less than this");
  double_VAR_H(tosp_fuzzy_kn_fraction, 0.5, "New fuzzy kn alg");
  double_VAR_H(tosp_fuzzy_sp_fraction, 0.5, "New fuzzy sp alg");
  double_VAR_H(tosp_min_sane_kn_sp, 1.5,
               "Don't trust spaces less than this time kn");
  double_VAR_H(tosp_init_guess_kn_mult, 2.2,
               "Thresh guess - mult kn by this");
  double_VAR_H(tosp_init_guess_xht_mult, 0.28,
               "Thresh guess - mult xht by this");
  double_VAR_H(tosp_max_sane_kn_thresh, 5.0,
               "Multiplier on kn to limit thresh");
  double_VAR_H(tosp_flip_caution, 0.0,
               "Don't autoflip kn to sp when large separation");
  double_VAR_H(tosp_large_kerning, 0.19,
               "Limit use of xht gap with large kns");
  double_VAR_H(tosp_dont_fool_with_small_kerns, -1,
               "Limit use of xht gap with odd small kns");
  double_VAR_H(tosp_near_lh_edge, 0,
               "Don't reduce box if the top left is non blank");
  double_VAR_H(tosp_silly_kn_sp_gap, 0.2,
               "Don't let sp minus kn get too small");
  double_VAR_H(tosp_pass_wide_fuzz_sp_to_context, 0.75,
               "How wide fuzzies need context");
  // tordmain.cpp ///////////////////////////////////////////
  BOOL_VAR_H(textord_no_rejects, false, "Don't remove noise blobs");
  BOOL_VAR_H(textord_show_blobs, false, "Display unsorted blobs");
  BOOL_VAR_H(textord_show_boxes, false, "Display boxes");
  INT_VAR_H(textord_max_noise_size, 7, "Pixel size of noise");
  INT_VAR_H(textord_baseline_debug, 0, "Baseline debug level");
  double_VAR_H(textord_blob_size_bigile, 95, "Percentile for large blobs");
  double_VAR_H(textord_noise_area_ratio, 0.7,
               "Fraction of bounding box for noise");
  double_VAR_H(textord_blob_size_smallile, 20, "Percentile for small blobs");
  double_VAR_H(textord_initialx_ile, 0.75, "Ile of sizes for xheight guess");
  double_VAR_H(textord_initialasc_ile, 0.90, "Ile of sizes for xheight guess");
  INT_VAR_H(textord_noise_sizefraction, 10, "Fraction of size for maxima");
  double_VAR_H(textord_noise_sizelimit, 0.5, "Fraction of x for big t count");
  INT_VAR_H(textord_noise_translimit, 16, "Transitions for normal blob");
  double_VAR_H(textord_noise_normratio, 2.0, "Dot to norm ratio for deletion");
  BOOL_VAR_H(textord_noise_rejwords, true, "Reject noise-like words");
  BOOL_VAR_H(textord_noise_rejrows, true, "Reject noise-like rows");
  double_VAR_H(textord_noise_syfract, 0.2, "xh fract error for norm blobs");
  double_VAR_H(textord_noise_sxfract, 0.4,
               "xh fract width error for norm blobs");
  double_VAR_H(textord_noise_hfract, 1.0/64,
               "Height fraction to discard outlines as speckle noise");
  INT_VAR_H(textord_noise_sncount, 1, "super norm blobs to save row");
  double_VAR_H(textord_noise_rowratio, 6.0, "Dot to norm ratio for deletion");
  // NOTE(review): this entry uses FALSE where every other BOOL_VAR_H entry
  // in this class uses the lowercase false/true literals.
  BOOL_VAR_H(textord_noise_debug, FALSE, "Debug row garbage detector");
  double_VAR_H(textord_blshift_maxshift, 0.00, "Max baseline shift");
  double_VAR_H(textord_blshift_xfraction, 9.99, "Min size of baseline shift");
};
}  // namespace tesseract.

#endif  // TESSERACT_TEXTORD_TEXTORD_H_