tesseract/dict/context.cpp

/* -*-C-*-
 ********************************************************************************
 *
 * File:        context.c  (Formerly context.c)
 * Description:  Context checking functions
 * Author:       Mark Seaman, OCR Technology
 * Created:      Thu Feb 15 11:18:24 1990
 * Modified:     Tue Jul  9 17:38:16 1991 (Mark Seaman) marks@hpgrlt
 * Language:     C
 * Package:      N/A
 * Status:       Experimental (Do Not Distribute)
 *
 * (c) Copyright 1990, Hewlett-Packard Company.
 ** Licensed under the Apache License, Version 2.0 (the "License");
 ** you may not use this file except in compliance with the License.
 ** You may obtain a copy of the License at
 ** http://www.apache.org/licenses/LICENSE-2.0
 ** Unless required by applicable law or agreed to in writing, software
 ** distributed under the License is distributed on an "AS IS" BASIS,
 ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 ** See the License for the specific language governing permissions and
 ** limitations under the License.
 *
 *********************************************************************************/

#include "dict.h"
#include "tprintf.h"
#include "unicharset.h"

namespace tesseract {

// Words with fewer unichars than this are never reported as garbage by
// Dict::absolute_garbage().
static const int kMinAbsoluteGarbageWordLength = 10;
// Dict::absolute_garbage() reports a word as garbage when the fraction of its
// unichars that are alphabetic or digits is strictly below this value.
static const float kMinAbsoluteGarbageAlphanumFrac = 0.5f;

// Transition table of the capitalisation state machine walked by
// Dict::case_ok().
//   Row    = current state (0..5, described per row below).
//   Column = class of the next character:
//            P = neither upper, lower nor digit, U = upper case,
//            L = lower case, D = digit.
//   Entry  = next state, or -1 for an error on case.
const int case_state_table[6][4] = {
    /*  P   U   L   D */
    {0, 1, 5, 4},    /* 0. Beginning of word       */
    {0, 3, 2, 4},    /* 1. After initial capital   */
    {0, -1, 2, -1},  /* 2. After lower case        */
    {0, 3, -1, 4},   /* 3. After upper case        */
    {0, -1, -1, 4},  /* 4. After a digit           */
    {5, -1, 2, -1},  /* 5. After initial lower case */
};

int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) {
  int state = 0;
  int x;
  for (x = 0; x < word.length(); ++x) {
    UNICHAR_ID ch_id = word.unichar_id(x);
    if (unicharset.get_isupper(ch_id))
      state = case_state_table[state][1];
    else if (unicharset.get_islower(ch_id))
      state = case_state_table[state][2];
    else if (unicharset.get_isdigit(ch_id))
      state = case_state_table[state][3];
    else
      state = case_state_table[state][0];
    if (state == -1) return false;
  }
  return state != 5; // single lower is bad
}

// Decides whether a word is made up mostly of non-alphanumeric unichars.
//
// word:       the word whose unichar ids are examined.
// unicharset: supplies the alpha/digit properties of each unichar id.
//
// Returns false for any word shorter than kMinAbsoluteGarbageWordLength.
// Otherwise returns true when the fraction of unichars that are alphabetic
// or digits is strictly less than kMinAbsoluteGarbageAlphanumFrac.
bool Dict::absolute_garbage(const WERD_CHOICE &word,
                            const UNICHARSET &unicharset) {
  // Short words are never classified as garbage here.
  if (word.length() < kMinAbsoluteGarbageWordLength) {
    return false;
  }

  int alnum_count = 0;
  for (int i = 0; i < word.length(); ++i) {
    if (unicharset.get_isalpha(word.unichar_id(i)) ||
        unicharset.get_isdigit(word.unichar_id(i))) {
      ++alnum_count;
    }
  }

  const float alnum_fraction = static_cast<float>(alnum_count) /
                               static_cast<float>(word.length());
  return alnum_fraction < kMinAbsoluteGarbageAlphanumFrac;
}

}  // namespace tesseract
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`/* --C--`
			`********************************************************************************`
			`*`
			`* File: context.c (Formerly context.c)`
			`* Description: Context checking functions`
			`* Author: Mark Seaman, OCR Technology`
			`* Created: Thu Feb 15 11:18:24 1990`
			`* Modified: Tue Jul 9 17:38:16 1991 (Mark Seaman) marks@hpgrlt`
			`* Language: C`
			`* Package: N/A`
			`* Status: Experimental (Do Not Distribute)`
			`*`
			`* (c) Copyright 1990, Hewlett-Packard Company.`
			`** Licensed under the Apache License, Version 2.0 (the "License");`
			`** you may not use this file except in compliance with the License.`
			`** You may obtain a copy of the License at`
			`** http://www.apache.org/licenses/LICENSE-2.0`
			`** Unless required by applicable law or agreed to in writing, software`
			`** distributed under the License is distributed on an "AS IS" BASIS,`
			`** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.`
			`** See the License for the specific language governing permissions and`
			`** limitations under the License.`
			`*`
			`*********************************************************************************/`
Changes to dict for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@293 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:20:33 +08:00
			`#include "dict.h"`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`#include "tprintf.h"`
Changes to dict for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@293 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:20:33 +08:00			`#include "unicharset.h"`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00
Changes to dict for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@293 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:20:33 +08:00			`namespace tesseract {`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`static const int kMinAbsoluteGarbageWordLength = 10;`
			`static const float kMinAbsoluteGarbageAlphanumFrac = 0.5f;`

Result of clang tidy on recent merge 2016-11-08 02:46:33 +08:00			`const int case_state_table[6][4] = {`
			`{/* 0. Beginning of word */`
			`/* P U L D */`
			`/* -1. Error on case */`
			`0, 1, 5, 4},`
			`{/* 1. After initial capital */`
			`0, 3, 2, 4},`
			`{/* 2. After lower case */`
			`0, -1, 2, -1},`
			`{/* 3. After upper case */`
			`0, 3, -1, 4},`
			`{/* 4. After a digit */`
			`0, -1, -1, 4},`
			`{/* 5. After initial lower case */`
			`5, -1, 2, -1},`
			`};`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`int Dict::case_ok(const WERD_CHOICE &word, const UNICHARSET &unicharset) {`
			`int state = 0;`
			`int x;`
Changes to dict for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@293 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:20:33 +08:00			`for (x = 0; x < word.length(); ++x) {`
			`UNICHAR_ID ch_id = word.unichar_id(x);`
Fixed name collision with jpeg library git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@164 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2008-04-22 08:42:51 +08:00			`if (unicharset.get_isupper(ch_id))`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`state = case_state_table[state][1];`
Changes to dict for 3.00 git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@293 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2009-07-11 10:20:33 +08:00			`else if (unicharset.get_islower(ch_id))`
Fixed name collision with jpeg library git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@164 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2008-04-22 08:42:51 +08:00			`state = case_state_table[state][2];`
			`else if (unicharset.get_isdigit(ch_id))`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`state = case_state_table[state][3];`
			`else`
			`state = case_state_table[state][0];`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`if (state == -1) return false;`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`}`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`return state != 5; // single lower is bad`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`}`

3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`bool Dict::absolute_garbage(const WERD_CHOICE &word,`
			`const UNICHARSET &unicharset) {`
			`if (word.length() < kMinAbsoluteGarbageWordLength) return false;`
			`int num_alphanum = 0;`
			`for (int x = 0; x < word.length(); ++x) {`
			`num_alphanum += (unicharset.get_isalpha(word.unichar_id(x)) \|\|`
			`unicharset.get_isdigit(word.unichar_id(x)));`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`}`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00			`return (static_cast<float>(num_alphanum) /`
			`static_cast<float>(word.length()) < kMinAbsoluteGarbageAlphanumFrac);`
top-skimming import from sf.net git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk/trunk@2 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2007-03-08 04:03:40 +08:00			`}`
3.01 code from http://github.com/jimregan/tesseract-ocr with addaptions related to Linux and Windows (VC2008) compile process git-svn-id: https://tesseract-ocr.googlecode.com/svn/trunk@526 d0cd1f9f-072b-0410-8dd7-cf729c803f20 2010-11-24 02:34:14 +08:00
			`} // namespace tesseract`