mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-06-22 20:50:40 +08:00
trying to add tessedit_char_whitelist etc. again:
- ignore matrix outputs in ComputeTopN if they belong to a disabled unichar_id
- pass UNICHARSET refs to check that
- in SetBlackAndWhitelist, also update the unicharset of the lstm_recognizer_ instance, if any
This commit is contained in:
parent
fe5c82fd24
commit
6ac2ff083e
@ -619,6 +619,12 @@ void Tesseract::SetBlackAndWhitelist() {
|
|||||||
unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
|
unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
|
||||||
tessedit_char_whitelist.string(),
|
tessedit_char_whitelist.string(),
|
||||||
tessedit_char_unblacklist.string());
|
tessedit_char_unblacklist.string());
|
||||||
|
if (lstm_recognizer_) {
|
||||||
|
UNICHARSET& lstm_unicharset = const_cast<UNICHARSET&> (lstm_recognizer_->GetUnicharset());
|
||||||
|
lstm_unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
|
||||||
|
tessedit_char_whitelist.string(),
|
||||||
|
tessedit_char_unblacklist.string());
|
||||||
|
}
|
||||||
// Black and white lists should apply to all loaded classifiers.
|
// Black and white lists should apply to all loaded classifiers.
|
||||||
for (int i = 0; i < sub_langs_.size(); ++i) {
|
for (int i = 0; i < sub_langs_.size(); ++i) {
|
||||||
sub_langs_[i]->unicharset.set_black_and_whitelist(
|
sub_langs_[i]->unicharset.set_black_and_whitelist(
|
||||||
|
@ -87,7 +87,7 @@ void RecodeBeamSearch::Decode(const NetworkIO& output, double dict_ratio,
|
|||||||
if (lstm_choice_mode)
|
if (lstm_choice_mode)
|
||||||
timesteps.clear();
|
timesteps.clear();
|
||||||
for (int t = 0; t < width; ++t) {
|
for (int t = 0; t < width; ++t) {
|
||||||
ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0]);
|
ComputeTopN(output.f(t), output.NumFeatures(), kBeamWidths[0], charset);
|
||||||
DecodeStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert,
|
DecodeStep(output.f(t), t, dict_ratio, cert_offset, worst_dict_cert,
|
||||||
charset);
|
charset);
|
||||||
if (lstm_choice_mode) {
|
if (lstm_choice_mode) {
|
||||||
@ -102,7 +102,7 @@ void RecodeBeamSearch::Decode(const GENERIC_2D_ARRAY<float>& output,
|
|||||||
beam_size_ = 0;
|
beam_size_ = 0;
|
||||||
int width = output.dim1();
|
int width = output.dim1();
|
||||||
for (int t = 0; t < width; ++t) {
|
for (int t = 0; t < width; ++t) {
|
||||||
ComputeTopN(output[t], output.dim2(), kBeamWidths[0]);
|
ComputeTopN(output[t], output.dim2(), kBeamWidths[0], charset);
|
||||||
DecodeStep(output[t], t, dict_ratio, cert_offset, worst_dict_cert, charset);
|
DecodeStep(output[t], t, dict_ratio, cert_offset, worst_dict_cert, charset);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
@ -456,12 +456,19 @@ WERD_RES* RecodeBeamSearch::InitializeWord(bool leading_space,
|
|||||||
// Fills top_n_flags_ with bools that are true iff the corresponding output
|
// Fills top_n_flags_ with bools that are true iff the corresponding output
|
||||||
// is one of the top_n.
|
// is one of the top_n.
|
||||||
void RecodeBeamSearch::ComputeTopN(const float* outputs, int num_outputs,
|
void RecodeBeamSearch::ComputeTopN(const float* outputs, int num_outputs,
|
||||||
int top_n) {
|
int top_n, const UNICHARSET* charset) {
|
||||||
top_n_flags_.init_to_size(num_outputs, TN_ALSO_RAN);
|
top_n_flags_.init_to_size(num_outputs, TN_ALSO_RAN);
|
||||||
top_code_ = -1;
|
top_code_ = -1;
|
||||||
second_code_ = -1;
|
second_code_ = -1;
|
||||||
top_heap_.clear();
|
top_heap_.clear();
|
||||||
for (int i = 0; i < num_outputs; ++i) {
|
for (int i = 0; i < num_outputs; ++i) {
|
||||||
|
// Decode label via recoder_.
|
||||||
|
RecodedCharID code;
|
||||||
|
code.Set(0, i);
|
||||||
|
int label = recoder_.DecodeUnichar(code);
|
||||||
|
if (label != INVALID_UNICHAR_ID && // not part of a bigger code.
|
||||||
|
!charset->get_enabled(label)) // disabled
|
||||||
|
continue;
|
||||||
if (top_heap_.size() < top_n || outputs[i] > top_heap_.PeekTop().key) {
|
if (top_heap_.size() < top_n || outputs[i] > top_heap_.PeekTop().key) {
|
||||||
TopPair entry(outputs[i], i);
|
TopPair entry(outputs[i], i);
|
||||||
top_heap_.Push(&entry);
|
top_heap_.Push(&entry);
|
||||||
|
@ -293,7 +293,7 @@ class RecodeBeamSearch {
|
|||||||
|
|
||||||
// Fills top_n_flags_ with bools that are true iff the corresponding output
|
// Fills top_n_flags_ with bools that are true iff the corresponding output
|
||||||
// is one of the top_n.
|
// is one of the top_n.
|
||||||
void ComputeTopN(const float* outputs, int num_outputs, int top_n);
|
void ComputeTopN(const float* outputs, int num_outputs, int top_n, const UNICHARSET* unicharset);
|
||||||
|
|
||||||
// Adds the computation for the current time-step to the beam. Call at each
|
// Adds the computation for the current time-step to the beam. Call at each
|
||||||
// time-step in sequence from left to right. outputs is the activation vector
|
// time-step in sequence from left to right. outputs is the activation vector
|
||||||
|
Loading…
Reference in New Issue
Block a user