mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 20:59:36 +08:00
Merge pull request #2305 from stweil/fuzz
Fix Heap-buffer-overflow in GenericVector<int>::size (issue #2298)
This commit is contained in:
commit
0e72733121
@@ -107,8 +107,10 @@ class Dict {
 
   // Returns true if unichar_id is a word compounding character like - or /.
   inline bool compound_marker(UNICHAR_ID unichar_id) {
+    const UNICHARSET& unicharset = getUnicharset();
+    ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
     const GenericVector<UNICHAR_ID>& normed_ids =
-        getUnicharset().normed_ids(unichar_id);
+        unicharset.normed_ids(unichar_id);
     return normed_ids.size() == 1 &&
         (normed_ids[0] == hyphen_unichar_id_ ||
          normed_ids[0] == slash_unichar_id_);
@@ -116,8 +118,10 @@ class Dict {
   // Returns true if unichar_id is an apostrophe-like character that may
   // separate prefix/suffix words from a main body word.
   inline bool is_apostrophe(UNICHAR_ID unichar_id) {
+    const UNICHARSET& unicharset = getUnicharset();
+    ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
     const GenericVector<UNICHAR_ID>& normed_ids =
-        getUnicharset().normed_ids(unichar_id);
+        unicharset.normed_ids(unichar_id);
     return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
   }
 
@@ -141,17 +145,20 @@ class Dict {
     }
   }
   /// Check whether the word has a hyphen at the end.
-  inline bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const {
+  inline bool has_hyphen_end(const UNICHARSET* unicharset,
+                             UNICHAR_ID unichar_id, bool first_pos) const {
     if (!last_word_on_line_ || first_pos)
       return false;
+    ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
     const GenericVector<UNICHAR_ID>& normed_ids =
-        getUnicharset().normed_ids(unichar_id);
+        unicharset->normed_ids(unichar_id);
     return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
   }
   /// Same as above, but check the unichar at the end of the word.
   inline bool has_hyphen_end(const WERD_CHOICE &word) const {
     int word_index = word.length() - 1;
-    return has_hyphen_end(word.unichar_id(word_index), word_index == 0);
+    return has_hyphen_end(word.unicharset(), word.unichar_id(word_index),
+                          word_index == 0);
   }
   /// Unless the previous word was the last one on the line, and the current
   /// one is not (thus it is the first one on the line), erase hyphen_word_,
|
@ -3,7 +3,6 @@
|
||||
// Description: Functions that utilize the knowledge about the properties,
|
||||
// structure and statistics of the language to help recognition.
|
||||
// Author: Daria Antonova
|
||||
// Created: Mon Nov 11 11:26:43 PST 2009
|
||||
//
|
||||
// (C) Copyright 2009, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@@ -803,7 +802,8 @@ LanguageModelDawgInfo *LanguageModel::GenerateDawgInfo(
   }
 
   // Deal with hyphenated words.
-  if (word_end && dict_->has_hyphen_end(b.unichar_id(), curr_col == 0)) {
+  if (word_end && dict_->has_hyphen_end(&dict_->getUnicharset(),
+                                        b.unichar_id(), curr_col == 0)) {
     if (language_model_debug_level > 0) tprintf("Hyphenated word found\n");
     return new LanguageModelDawgInfo(dawg_args_.active_dawgs, COMPOUND_PERM);
   }
|
Loading…
Reference in New Issue
Block a user