mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2024-11-27 20:59:36 +08:00
unittest: Add paragraphs_test
Signed-off-by: Stefan Weil <sw@weilnetz.de>
This commit is contained in:
parent
53f0e7658f
commit
de6a759744
@ -2,7 +2,6 @@
|
||||
* File: paragraphs_internal.h
|
||||
* Description: Paragraph Detection internal data structures.
|
||||
* Author: David Eger
|
||||
* Created: 11 March 2011
|
||||
*
|
||||
* (C) Copyright 2011, Google Inc.
|
||||
** Licensed under the Apache License, Version 2.0 (the "License");
|
||||
@ -21,10 +20,12 @@
|
||||
#define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
|
||||
|
||||
#include "paragraphs.h"
|
||||
#include "publictypes.h" // for ParagraphJustification
|
||||
|
||||
// NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
|
||||
// DATA STRUCTURES OR FUNCTIONS IN THIS FILE.
|
||||
|
||||
class UNICHARSET;
|
||||
class WERD_CHOICE;
|
||||
|
||||
namespace tesseract {
|
||||
@ -299,4 +300,5 @@ void CanonicalizeDetectionResults(
|
||||
PARA_LIST *paragraphs);
|
||||
|
||||
} // namespace
|
||||
|
||||
#endif // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
|
||||
|
@ -106,6 +106,7 @@ check_PROGRAMS = \
|
||||
matrix_test \
|
||||
nthitem_test \
|
||||
osd_test \
|
||||
paragraphs_test \
|
||||
progress_test \
|
||||
qrsequence_test \
|
||||
rect_test \
|
||||
@ -183,6 +184,12 @@ matrix_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
|
||||
nthitem_test_SOURCES = nthitem_test.cc
|
||||
nthitem_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
|
||||
|
||||
pango_font_info_test_SOURCES = pango_font_info_test.cc
|
||||
pango_font_info_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS)
|
||||
|
||||
paragraphs_test_SOURCES = paragraphs_test.cc
|
||||
paragraphs_test_LDADD = $(ABSEIL_LIBS) $(GTEST_LIBS) $(TESS_LIBS)
|
||||
|
||||
osd_test_SOURCES = osd_test.cc
|
||||
osd_test_LDADD = $(GTEST_LIBS) $(TESS_LIBS) $(LEPTONICA_LIBS)
|
||||
|
||||
|
@ -1,9 +1,29 @@
|
||||
// (C) Copyright 2017, Google Inc.
|
||||
// Licensed under the Apache License, Version 2.0 (the "License");
|
||||
// you may not use this file except in compliance with the License.
|
||||
// You may obtain a copy of the License at
|
||||
// http://www.apache.org/licenses/LICENSE-2.0
|
||||
// Unless required by applicable law or agreed to in writing, software
|
||||
// distributed under the License is distributed on an "AS IS" BASIS,
|
||||
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
// See the License for the specific language governing permissions and
|
||||
// limitations under the License.
|
||||
|
||||
#include <string>
|
||||
#include <string> // for std::string
|
||||
|
||||
#include "tesseract/ccmain/paragraphs.h"
|
||||
#include "tesseract/ccmain/paragraphs_internal.h"
|
||||
#include "tesseract/ccstruct/ocrpara.h"
|
||||
#include "absl/strings/str_cat.h" // for absl::StrCat
|
||||
#include "absl/strings/str_join.h" // for absl::StrJoin
|
||||
#include "absl/strings/str_split.h" // for absl::StrSplit
|
||||
|
||||
#include "include_gunit.h" // for TEST
|
||||
#include "log.h" // for LOG
|
||||
|
||||
#include "genericvector.h"
|
||||
// ccmain
|
||||
#include "paragraphs.h"
|
||||
#include "paragraphs_internal.h"
|
||||
// ccstruct
|
||||
#include "ocrpara.h"
|
||||
|
||||
namespace { // anonymous namespace
|
||||
|
||||
@ -47,7 +67,7 @@ void AsciiToRowInfo(const char* text, int row_number,
|
||||
info->lword_text = info->rword_text = "";
|
||||
info->ltr = true;
|
||||
|
||||
std::vector<string> words = absl::StrSplit(text, ' ', absl::SkipEmpty());
|
||||
std::vector<std::string> words = absl::StrSplit(text, ' ', absl::SkipEmpty());
|
||||
info->num_words = words.size();
|
||||
if (info->num_words < 1) return;
|
||||
|
||||
@ -135,7 +155,7 @@ void EvaluateParagraphDetection(const TextAndModel* correct, int n,
|
||||
EXPECT_EQ(bad_crowns, 0);
|
||||
if (incorrect_breaks || missed_breaks || poorly_matched_models ||
|
||||
bad_list_items || bad_crowns) {
|
||||
std::vector<string> dbg_lines;
|
||||
std::vector<std::string> dbg_lines;
|
||||
dbg_lines.push_back("# ==========================");
|
||||
dbg_lines.push_back("# Correct paragraph breaks:");
|
||||
dbg_lines.push_back("# ==========================");
|
||||
@ -154,7 +174,7 @@ void EvaluateParagraphDetection(const TextAndModel* correct, int n,
|
||||
dbg_lines.push_back("# Paragraph detector output:");
|
||||
dbg_lines.push_back("# ==========================");
|
||||
for (int i = 0; i < n; i++) {
|
||||
string annotation;
|
||||
std::string annotation;
|
||||
if (i == 0 || (detector_output[i - 1] != detector_output[i])) {
|
||||
if (detector_output[i] && detector_output[i]->model) {
|
||||
annotation += absl::StrCat(
|
||||
@ -211,15 +231,15 @@ TEST(ParagraphsTest, ListItemsIdentified) {
|
||||
typedef ParagraphModel PModel;
|
||||
|
||||
const TextAndModel kTwoSimpleParagraphs[] = {
|
||||
{" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"This paragraph starts at the top"},
|
||||
{"of the page and takes 3 lines. "},
|
||||
{" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"which indicates that the first "},
|
||||
{"paragraph is not a continuation "},
|
||||
{"from a previous page, as it is "},
|
||||
{"indented just like this second "},
|
||||
{"paragraph. "},
|
||||
{" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"This paragraph starts at the top", PCONT, PModel(), false, false},
|
||||
{"of the page and takes 3 lines. ", PCONT, PModel(), false, false},
|
||||
{" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"which indicates that the first ", PCONT, PModel(), false, false},
|
||||
{"paragraph is not a continuation ", PCONT, PModel(), false, false},
|
||||
{"from a previous page, as it is ", PCONT, PModel(), false, false},
|
||||
{"indented just like this second ", PCONT, PModel(), false, false},
|
||||
{"paragraph. ", PCONT, PModel(), false, false},
|
||||
};
|
||||
|
||||
TEST(ParagraphsTest, TestSimpleParagraphDetection) {
|
||||
@ -228,15 +248,15 @@ TEST(ParagraphsTest, TestSimpleParagraphDetection) {
|
||||
}
|
||||
|
||||
const TextAndModel kFewCluesWithCrown[] = {
|
||||
{"This paragraph starts at the top", PSTART, PModel(kLeft, 0, 20, 0, 0),
|
||||
true},
|
||||
{"of the page and takes two lines."},
|
||||
{" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"which indicates that the first "},
|
||||
{"paragraph is a continuation from"},
|
||||
{"a previous page, as it is "},
|
||||
{"indented just like this second "},
|
||||
{"paragraph. "},
|
||||
{"This paragraph starts at the top", PSTART, PModel(kLeft, 0, 20, 0, 0),
|
||||
true, false},
|
||||
{"of the page and takes two lines.", PCONT, PModel(), false, false},
|
||||
{" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"which indicates that the first ", PCONT, PModel(), false, false},
|
||||
{"paragraph is a continuation from", PCONT, PModel(), false, false},
|
||||
{"a previous page, as it is ", PCONT, PModel(), false, false},
|
||||
{"indented just like this second ", PCONT, PModel(), false, false},
|
||||
{"paragraph. ", PCONT, PModel(), false, false},
|
||||
};
|
||||
|
||||
TEST(ParagraphsTest, TestFewCluesWithCrown) {
|
||||
@ -245,17 +265,17 @@ TEST(ParagraphsTest, TestFewCluesWithCrown) {
|
||||
}
|
||||
|
||||
const TextAndModel kCrownedParagraph[] = {
|
||||
{"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0),
|
||||
true},
|
||||
{"often not indented as the rest "},
|
||||
{"of the paragraphs are. Nonethe-"},
|
||||
{"less it should be counted as the"},
|
||||
{"same type of paragraph. "},
|
||||
{" The second and third para- ", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"graphs are both indented two "},
|
||||
{"spaces. "},
|
||||
{" The first paragraph has what ", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"fmt refers to as a 'crown.' "},
|
||||
{"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0),
|
||||
true, false},
|
||||
{"often not indented as the rest ", PCONT, PModel(), false, false},
|
||||
{"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false},
|
||||
{"less it should be counted as the", PCONT, PModel(), false, false},
|
||||
{"same type of paragraph. ", PCONT, PModel(), false, false},
|
||||
{" The second and third para- ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"graphs are both indented two ", PCONT, PModel(), false, false},
|
||||
{"spaces. ", PCONT, PModel(), false, false},
|
||||
{" The first paragraph has what ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"fmt refers to as a 'crown.' ", PCONT, PModel(), false, false},
|
||||
};
|
||||
|
||||
TEST(ParagraphsTest, TestCrownParagraphDetection) {
|
||||
@ -263,40 +283,40 @@ TEST(ParagraphsTest, TestCrownParagraphDetection) {
|
||||
}
|
||||
|
||||
const TextAndModel kFlushLeftParagraphs[] = {
|
||||
{"It is sometimes the case that", PSTART, PModel(kLeft, 0, 0, 0, 0)},
|
||||
{"flush left paragraphs (those"},
|
||||
{"with no body indent) are not"},
|
||||
{"actually crowns. "},
|
||||
{"Instead, further paragraphs are", PSTART, PModel(kLeft, 0, 0, 0, 0)},
|
||||
{"also flush left aligned. Usual-"},
|
||||
{"ly, these paragraphs are set"},
|
||||
{"apart vertically by some white-"},
|
||||
{"space, but you can also detect"},
|
||||
{"them by observing the big empty"},
|
||||
{"space at the ends of the para-"},
|
||||
{"graphs. "},
|
||||
{"It is sometimes the case that", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false},
|
||||
{"flush left paragraphs (those", PCONT, PModel(), false, false},
|
||||
{"with no body indent) are not", PCONT, PModel(), false, false},
|
||||
{"actually crowns. ", PCONT, PModel(), false, false},
|
||||
{"Instead, further paragraphs are", PSTART, PModel(kLeft, 0, 0, 0, 0), false, false},
|
||||
{"also flush left aligned. Usual-", PCONT, PModel(), false, false},
|
||||
{"ly, these paragraphs are set", PCONT, PModel(), false, false},
|
||||
{"apart vertically by some white-", PCONT, PModel(), false, false},
|
||||
{"space, but you can also detect", PCONT, PModel(), false, false},
|
||||
{"them by observing the big empty", PCONT, PModel(), false, false},
|
||||
{"space at the ends of the para-", PCONT, PModel(), false, false},
|
||||
{"graphs. ", PCONT, PModel(), false, false},
|
||||
};
|
||||
|
||||
TEST(ParagraphsText, TestRealFlushLeftParagraphs) {
|
||||
TestParagraphDetection(kFlushLeftParagraphs,
|
||||
ABSL_ARRAYSIZE(kFlushLeftParagraphs));
|
||||
};
|
||||
}
|
||||
|
||||
const TextAndModel kSingleFullPageContinuation[] = {
|
||||
{"sometimes a page is one giant", PSTART, PModel(kLeft, 0, 20, 0, 0), true},
|
||||
{"continuation. It flows from"},
|
||||
{"line to line, using the full"},
|
||||
{"column width with no clear"},
|
||||
{"paragraph break, because it"},
|
||||
{"actually doesn't have one. It"},
|
||||
{"is the middle of one monster"},
|
||||
{"paragraph continued from the"},
|
||||
{"previous page and continuing"},
|
||||
{"onto the next page. There-"},
|
||||
{"fore, it ends up getting"},
|
||||
{"marked as a crown and then"},
|
||||
{"getting re-marked as any ex-"},
|
||||
{"isting model. Not great, but"},
|
||||
{"sometimes a page is one giant", PSTART, PModel(kLeft, 0, 20, 0, 0), true, false},
|
||||
{"continuation. It flows from", PCONT, PModel(), false, false},
|
||||
{"line to line, using the full", PCONT, PModel(), false, false},
|
||||
{"column width with no clear", PCONT, PModel(), false, false},
|
||||
{"paragraph break, because it", PCONT, PModel(), false, false},
|
||||
{"actually doesn't have one. It", PCONT, PModel(), false, false},
|
||||
{"is the middle of one monster", PCONT, PModel(), false, false},
|
||||
{"paragraph continued from the", PCONT, PModel(), false, false},
|
||||
{"previous page and continuing", PCONT, PModel(), false, false},
|
||||
{"onto the next page. There-", PCONT, PModel(), false, false},
|
||||
{"fore, it ends up getting", PCONT, PModel(), false, false},
|
||||
{"marked as a crown and then", PCONT, PModel(), false, false},
|
||||
{"getting re-marked as any ex-", PCONT, PModel(), false, false},
|
||||
{"isting model. Not great, but", PCONT, PModel(), false, false},
|
||||
};
|
||||
|
||||
TEST(ParagraphsTest, TestSingleFullPageContinuation) {
|
||||
@ -314,14 +334,14 @@ TEST(ParagraphsTest, TestSingleFullPageContinuation) {
|
||||
}
|
||||
|
||||
const TextAndModel kRightAligned[] = {
|
||||
{"Right-aligned paragraphs are", PSTART, PModel(kRight, 0, 0, 0, 0)},
|
||||
{" uncommon in Left-to-Right"},
|
||||
{" languages, but they do"},
|
||||
{" exist."},
|
||||
{" Mostly, however, they're", PSTART, PModel(kRight, 0, 0, 0, 0)},
|
||||
{" horribly tiny paragraphs in"},
|
||||
{" tables on which we have no"},
|
||||
{" chance anyways."},
|
||||
{"Right-aligned paragraphs are", PSTART, PModel(kRight, 0, 0, 0, 0), false, false},
|
||||
{" uncommon in Left-to-Right", PCONT, PModel(), false, false},
|
||||
{" languages, but they do", PCONT, PModel(), false, false},
|
||||
{" exist.", PCONT, PModel(), false, false},
|
||||
{" Mostly, however, they're", PSTART, PModel(kRight, 0, 0, 0, 0), false, false},
|
||||
{" horribly tiny paragraphs in", PCONT, PModel(), false, false},
|
||||
{" tables on which we have no", PCONT, PModel(), false, false},
|
||||
{" chance anyways.", PCONT, PModel(), false, false},
|
||||
};
|
||||
|
||||
TEST(ParagraphsTest, TestRightAlignedParagraph) {
|
||||
@ -329,21 +349,21 @@ TEST(ParagraphsTest, TestRightAlignedParagraph) {
|
||||
}
|
||||
|
||||
const TextAndModel kTinyParagraphs[] = {
|
||||
{" Occasionally, interspersed with", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"obvious paragraph text, you might"},
|
||||
{"find short exchanges of dialogue "},
|
||||
{"between characters. "},
|
||||
{" 'Oh?' ", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{" 'Don't be confused!' ", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{" 'Not me!' ", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{" One naive approach would be to ", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"mark a new paragraph whenever one"},
|
||||
{"of the statistics (left, right or"},
|
||||
{"center) changes from one text-"},
|
||||
{"line to the next. Such an"},
|
||||
{"approach would misclassify the"},
|
||||
{"tiny paragraphs above as a single"},
|
||||
{"paragraph. "},
|
||||
{" Occasionally, interspersed with", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"obvious paragraph text, you might", PCONT, PModel(), false, false},
|
||||
{"find short exchanges of dialogue ", PCONT, PModel(), false, false},
|
||||
{"between characters. ", PCONT, PModel(), false, false},
|
||||
{" 'Oh?' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{" 'Don't be confused!' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{" 'Not me!' ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{" One naive approach would be to ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"mark a new paragraph whenever one", PCONT, PModel(), false, false},
|
||||
{"of the statistics (left, right or", PCONT, PModel(), false, false},
|
||||
{"center) changes from one text-", PCONT, PModel(), false, false},
|
||||
{"line to the next. Such an", PCONT, PModel(), false, false},
|
||||
{"approach would misclassify the", PCONT, PModel(), false, false},
|
||||
{"tiny paragraphs above as a single", PCONT, PModel(), false, false},
|
||||
{"paragraph. ", PCONT, PModel(), false, false},
|
||||
};
|
||||
|
||||
TEST(ParagraphsTest, TestTinyParagraphs) {
|
||||
@ -351,49 +371,49 @@ TEST(ParagraphsTest, TestTinyParagraphs) {
|
||||
}
|
||||
|
||||
const TextAndModel kComplexPage1[] = {
|
||||
{" Awesome ", PSTART, PModel(kCenter, 0, 0, 0, 0)},
|
||||
{" Centered Title "},
|
||||
{" Paragraph Detection "},
|
||||
{" OCR TEAM "},
|
||||
{" 10 November 2010 "},
|
||||
{" ", PNONE},
|
||||
{" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"This paragraph starts at the top"},
|
||||
{"of the page and takes 3 lines. "},
|
||||
{" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"which indicates that the first "},
|
||||
{"paragraph is not a continuation "},
|
||||
{"from a previous page, as it is "},
|
||||
{"indented just like this second "},
|
||||
{"paragraph. "},
|
||||
{" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0),
|
||||
true},
|
||||
{" looks like the prior text "},
|
||||
{" but it is indented more "},
|
||||
{" and is fully justified. "},
|
||||
{" So how does one deal with ", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"centered text, block quotes, "},
|
||||
{"normal paragraphs, and lists "},
|
||||
{"like what follows? "},
|
||||
{"1. Make a plan. ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{"2. Use a heuristic, for example,", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{" looking for lines where the "},
|
||||
{" first word of the next line "},
|
||||
{" would fit on the previous "},
|
||||
{" line. "},
|
||||
{"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{" Python and try it out. "},
|
||||
{"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{" mistakes. "},
|
||||
{"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{" For extra painful penalty work", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"you can try to identify source "},
|
||||
{"code. Ouch! "},
|
||||
{" Awesome ", PSTART, PModel(kCenter, 0, 0, 0, 0), false, false},
|
||||
{" Centered Title ", PCONT, PModel(), false, false},
|
||||
{" Paragraph Detection ", PCONT, PModel(), false, false},
|
||||
{" OCR TEAM ", PCONT, PModel(), false, false},
|
||||
{" 10 November 2010 ", PCONT, PModel(), false, false},
|
||||
{" ", PNONE, PModel(), false, false},
|
||||
{" Look here, I have a paragraph.", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"This paragraph starts at the top", PCONT, PModel(), false, false},
|
||||
{"of the page and takes 3 lines. ", PCONT, PModel(), false, false},
|
||||
{" Here I have a second paragraph", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"which indicates that the first ", PCONT, PModel(), false, false},
|
||||
{"paragraph is not a continuation ", PCONT, PModel(), false, false},
|
||||
{"from a previous page, as it is ", PCONT, PModel(), false, false},
|
||||
{"indented just like this second ", PCONT, PModel(), false, false},
|
||||
{"paragraph. ", PCONT, PModel(), false, false},
|
||||
{" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0),
|
||||
true, false},
|
||||
{" looks like the prior text ", PCONT, PModel(), false, false},
|
||||
{" but it is indented more ", PCONT, PModel(), false, false},
|
||||
{" and is fully justified. ", PCONT, PModel(), false, false},
|
||||
{" So how does one deal with ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"centered text, block quotes, ", PCONT, PModel(), false, false},
|
||||
{"normal paragraphs, and lists ", PCONT, PModel(), false, false},
|
||||
{"like what follows? ", PCONT, PModel(), false, false},
|
||||
{"1. Make a plan. ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{"2. Use a heuristic, for example,", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{" looking for lines where the ", PCONT, PModel(), false, false},
|
||||
{" first word of the next line ", PCONT, PModel(), false, false},
|
||||
{" would fit on the previous ", PCONT, PModel(), false, false},
|
||||
{" line. ", PCONT, PModel(), false, false},
|
||||
{"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{" Python and try it out. ", PCONT, PModel(), false, false},
|
||||
{"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{" mistakes. ", PCONT, PModel(), false, false},
|
||||
{"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{" For extra painful penalty work", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"you can try to identify source ", PCONT, PModel(), false, false},
|
||||
{"code. Ouch! ", PCONT, PModel(), false, false},
|
||||
};
|
||||
|
||||
TEST(ParagraphsTest, TestComplexPage1) {
|
||||
@ -402,47 +422,47 @@ TEST(ParagraphsTest, TestComplexPage1) {
|
||||
|
||||
// The same as above, but wider.
|
||||
const TextAndModel kComplexPage2[] = {
|
||||
{" Awesome ", PSTART,
|
||||
PModel(kCenter, 0, 0, 0, 0)},
|
||||
{" Centered Title "},
|
||||
{" Paragraph Detection "},
|
||||
{" OCR TEAM "},
|
||||
{" 10 November 2010 "},
|
||||
{" ", PNONE},
|
||||
{" Look here, I have a paragraph. ", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"This paragraph starts at the top of"},
|
||||
{"the page and takes 3 lines. "},
|
||||
{" Here I have a second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"which indicates that the first "},
|
||||
{"paragraph is not a continuation "},
|
||||
{"from a previous page, as it is in- "},
|
||||
{"dented just like this second para- "},
|
||||
{"graph. "},
|
||||
{" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0),
|
||||
true},
|
||||
{" looks like the prior text "},
|
||||
{" but it is indented more "},
|
||||
{" and is fully justified. "},
|
||||
{" So how does one deal with center-", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"ed text, block quotes, normal para-"},
|
||||
{"graphs, and lists like what follow?"},
|
||||
{"1. Make a plan. "}, // BUG!!
|
||||
{"2. Use a heuristic, for example, ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{" looking for lines where the "},
|
||||
{" first word of the next line "},
|
||||
{" would fit on the previous line. "},
|
||||
{"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{" Python and try it out. "},
|
||||
{"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{" mistakes. "},
|
||||
{"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{" For extra painful penalty work ", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"you can try to identify source "},
|
||||
{"code. Ouch! "},
|
||||
{" Awesome ", PSTART,
|
||||
PModel(kCenter, 0, 0, 0, 0), false, false},
|
||||
{" Centered Title ", PCONT, PModel(), false, false},
|
||||
{" Paragraph Detection ", PCONT, PModel(), false, false},
|
||||
{" OCR TEAM ", PCONT, PModel(), false, false},
|
||||
{" 10 November 2010 ", PCONT, PModel(), false, false},
|
||||
{" ", PNONE, PModel(), false, false},
|
||||
{" Look here, I have a paragraph. ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"This paragraph starts at the top of", PCONT, PModel(), false, false},
|
||||
{"the page and takes 3 lines. ", PCONT, PModel(), false, false},
|
||||
{" Here I have a second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"which indicates that the first ", PCONT, PModel(), false, false},
|
||||
{"paragraph is not a continuation ", PCONT, PModel(), false, false},
|
||||
{"from a previous page, as it is in- ", PCONT, PModel(), false, false},
|
||||
{"dented just like this second para- ", PCONT, PModel(), false, false},
|
||||
{"graph. ", PCONT, PModel(), false, false},
|
||||
{" Here is a block quote. It ", PSTART, PModel(kLeft, 30, 0, 0, 0),
|
||||
true, false},
|
||||
{" looks like the prior text ", PCONT, PModel(), false, false},
|
||||
{" but it is indented more ", PCONT, PModel(), false, false},
|
||||
{" and is fully justified. ", PCONT, PModel(), false, false},
|
||||
{" So how does one deal with center-", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"ed text, block quotes, normal para-", PCONT, PModel(), false, false},
|
||||
{"graphs, and lists like what follow?", PCONT, PModel(), false, false},
|
||||
{"1. Make a plan. ", PCONT, PModel(), false, false}, // BUG!!
|
||||
{"2. Use a heuristic, for example, ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{" looking for lines where the ", PCONT, PModel(), false, false},
|
||||
{" first word of the next line ", PCONT, PModel(), false, false},
|
||||
{" would fit on the previous line. ", PCONT, PModel(), false, false},
|
||||
{"8. Try to implement the plan in ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{" Python and try it out. ", PCONT, PModel(), false, false},
|
||||
{"4. Determine how to fix the ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{" mistakes. ", PCONT, PModel(), false, false},
|
||||
{"5. Repeat. ", PSTART, PModel(kLeft, 0, 0, 30, 0),
|
||||
false, true},
|
||||
{" For extra painful penalty work ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"you can try to identify source ", PCONT, PModel(), false, false},
|
||||
{"code. Ouch! ", PCONT, PModel(), false, false},
|
||||
};
|
||||
|
||||
TEST(ParagraphsTest, TestComplexPage2) {
|
||||
@ -450,15 +470,15 @@ TEST(ParagraphsTest, TestComplexPage2) {
|
||||
}
|
||||
|
||||
const TextAndModel kSubtleCrown[] = {
|
||||
{"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0),
|
||||
true},
|
||||
{"often not indented as the rest "},
|
||||
{"of the paragraphs are. Nonethe-"},
|
||||
{"less it should be counted as the"},
|
||||
{"same type of paragraph. "},
|
||||
{" Even a short second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"should suffice. "},
|
||||
{" 1235 ", PNONE},
|
||||
{"The first paragraph on a page is", PSTART, PModel(kLeft, 0, 20, 0, 0),
|
||||
true, false},
|
||||
{"often not indented as the rest ", PCONT, PModel(), false, false},
|
||||
{"of the paragraphs are. Nonethe-", PCONT, PModel(), false, false},
|
||||
{"less it should be counted as the", PCONT, PModel(), false, false},
|
||||
{"same type of paragraph. ", PCONT, PModel(), false, false},
|
||||
{" Even a short second paragraph ", PSTART, PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"should suffice. ", PCONT, PModel(), false, false},
|
||||
{" 1235 ", PNONE, PModel(), false, false},
|
||||
};
|
||||
|
||||
TEST(ParagraphsTest, TestSubtleCrown) {
|
||||
@ -470,43 +490,43 @@ TEST(ParagraphsTest, TestStrayLineInBlock) {
|
||||
}
|
||||
|
||||
const TextAndModel kUnlvRep3AO[] = {
|
||||
{" Defined contribution plans cover employees in Australia, New", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0)},
|
||||
{"Zealand, Spain, the United Kingdom and some U.S. subsidiaries. "},
|
||||
{"In addition, employees in the U.S. are eligible to participate in "},
|
||||
{"defined contribution plans (Employee Savings Plans) by contribut-"},
|
||||
{"ing a portion of their compensation. The Company matches com- "},
|
||||
{"pensation, depending on Company profit levels. Contributions "},
|
||||
{"charged to income for defined contribution plans were $92 in "},
|
||||
{"1993, $98 in 1992 and $89 in 1991. "},
|
||||
{" In addition to providing pension benefits, the Company pro- ", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0)},
|
||||
{"vides certain health care and life insurance benefits to retired "},
|
||||
{"employees. As discussed in Note A, the Company adopted FASB "},
|
||||
{"Statement No. 106 effective January 1, 1992. Previously, the "},
|
||||
{"Company recognized the cost of providing these benefits as the "},
|
||||
{"benefits were paid. These pretax costs amounted to $53 in 1991. "},
|
||||
{"The Company continues to fund most of the cost of these medical "},
|
||||
{"and life insurance benefits in the year incurred. "},
|
||||
{" The U.S. plan covering the parent company is the largest plan.",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{"It provides medical and life insurance benefits including hospital, "},
|
||||
{"physicians’ services and major medical expense benefits and life "},
|
||||
{"insurance benefits. The plan provides benefits supplemental to "},
|
||||
{"Medicare after retirees are eligible for these benefits. The cost of "},
|
||||
{"these benefits are shared by the Company and the retiree, with the "},
|
||||
{"Company portion increasing as the retiree has increased years of "},
|
||||
{"credited service. The Company has the ability to change these "},
|
||||
{"benefits at any time. "},
|
||||
{" Effective October 1993, the Company amended its health ", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0)},
|
||||
{"benefits plan in the U.S. to cap the cost absorbed by the Company "},
|
||||
{"at approximately twice the 1993 cost per person for employees who"},
|
||||
{"retire after December 31, 1993. The effect of this amendment was "},
|
||||
{"to reduce the December 31, 1993 accumulated postretirement "},
|
||||
{"benefit obligation by $327. It also reduced the net periodic postre- "},
|
||||
{"tirement cost by $21 for 1993 and is estimated to reduce this cost "},
|
||||
{"for 1994 by approximately $83. "},
|
||||
{" Defined contribution plans cover employees in Australia, New", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"Zealand, Spain, the United Kingdom and some U.S. subsidiaries. ", PCONT, PModel(), false, false},
|
||||
{"In addition, employees in the U.S. are eligible to participate in ", PCONT, PModel(), false, false},
|
||||
{"defined contribution plans (Employee Savings Plans) by contribut-", PCONT, PModel(), false, false},
|
||||
{"ing a portion of their compensation. The Company matches com- ", PCONT, PModel(), false, false},
|
||||
{"pensation, depending on Company profit levels. Contributions ", PCONT, PModel(), false, false},
|
||||
{"charged to income for defined contribution plans were $92 in ", PCONT, PModel(), false, false},
|
||||
{"1993, $98 in 1992 and $89 in 1991. ", PCONT, PModel(), false, false},
|
||||
{" In addition to providing pension benefits, the Company pro- ", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"vides certain health care and life insurance benefits to retired ", PCONT, PModel(), false, false},
|
||||
{"employees. As discussed in Note A, the Company adopted FASB ", PCONT, PModel(), false, false},
|
||||
{"Statement No. 106 effective January 1, 1992. Previously, the ", PCONT, PModel(), false, false},
|
||||
{"Company recognized the cost of providing these benefits as the ", PCONT, PModel(), false, false},
|
||||
{"benefits were paid. These pretax costs amounted to $53 in 1991. ", PCONT, PModel(), false, false},
|
||||
{"The Company continues to fund most of the cost of these medical ", PCONT, PModel(), false, false},
|
||||
{"and life insurance benefits in the year incurred. ", PCONT, PModel(), false, false},
|
||||
{" The U.S. plan covering the parent company is the largest plan.",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"It provides medical and life insurance benefits including hospital, ", PCONT, PModel(), false, false},
|
||||
{"physicians’ services and major medical expense benefits and life ", PCONT, PModel(), false, false},
|
||||
{"insurance benefits. The plan provides benefits supplemental to ", PCONT, PModel(), false, false},
|
||||
{"Medicare after retirees are eligible for these benefits. The cost of ", PCONT, PModel(), false, false},
|
||||
{"these benefits are shared by the Company and the retiree, with the ", PCONT, PModel(), false, false},
|
||||
{"Company portion increasing as the retiree has increased years of ", PCONT, PModel(), false, false},
|
||||
{"credited service. The Company has the ability to change these ", PCONT, PModel(), false, false},
|
||||
{"benefits at any time. ", PCONT, PModel(), false, false},
|
||||
{" Effective October 1993, the Company amended its health ", PSTART,
|
||||
PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"benefits plan in the U.S. to cap the cost absorbed by the Company ", PCONT, PModel(), false, false},
|
||||
{"at approximately twice the 1993 cost per person for employees who", PCONT, PModel(), false, false},
|
||||
{"retire after December 31, 1993. The effect of this amendment was ", PCONT, PModel(), false, false},
|
||||
{"to reduce the December 31, 1993 accumulated postretirement ", PCONT, PModel(), false, false},
|
||||
{"benefit obligation by $327. It also reduced the net periodic postre- ", PCONT, PModel(), false, false},
|
||||
{"tirement cost by $21 for 1993 and is estimated to reduce this cost ", PCONT, PModel(), false, false},
|
||||
{"for 1994 by approximately $83. ", PCONT, PModel(), false, false},
|
||||
};
|
||||
|
||||
TEST(ParagraphsTest, TestUnlvInsurance) {
|
||||
@ -519,19 +539,19 @@ TEST(ParagraphsTest, TestUnlvInsurance) {
|
||||
// paragraph or two.
|
||||
// This example comes from Volume 9886293, Page 5
|
||||
const TextAndModel kTableOfContents[] = {
|
||||
{"1 Hmong People ........... 1", PSTART, PModel(kUnknown, 0, 0, 0, 0)},
|
||||
{" Hmong Origins . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0)},
|
||||
{" Language . . . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0)},
|
||||
{" Proverbs . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0)},
|
||||
{" Discussion . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0)},
|
||||
{" Riddles . . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0)},
|
||||
{" Discussion . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0)},
|
||||
{" Appearance . . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0)},
|
||||
{" Hmong History . . . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0)},
|
||||
{" Hmong in SE Asia . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0)},
|
||||
{" Hmong in the West . . .5", PSTART, PModel(kUnknown, 0, 0, 0, 0)},
|
||||
{" Hmong in the USA . . . 5", PSTART, PModel(kUnknown, 0, 0, 0, 0)},
|
||||
{" Discussion . . . . 6", PSTART, PModel(kUnknown, 0, 0, 0, 0)},
|
||||
{"1 Hmong People ........... 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
|
||||
{" Hmong Origins . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
|
||||
{" Language . . . . . . . 1", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
|
||||
{" Proverbs . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
|
||||
{" Discussion . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
|
||||
{" Riddles . . . . . . . 2", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
|
||||
{" Discussion . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
|
||||
{" Appearance . . . . . 3", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
|
||||
{" Hmong History . . . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
|
||||
{" Hmong in SE Asia . . . 4", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
|
||||
{" Hmong in the West . . .5", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
|
||||
{" Hmong in the USA . . . 5", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
|
||||
{" Discussion . . . . 6", PSTART, PModel(kUnknown, 0, 0, 0, 0), false, false},
|
||||
};
|
||||
|
||||
TEST(ParagraphsTest, TestSplitsOutLeaderLines) {
|
||||
@ -539,34 +559,34 @@ TEST(ParagraphsTest, TestSplitsOutLeaderLines) {
|
||||
}
|
||||
|
||||
const TextAndModel kTextWithSourceCode[] = {
|
||||
{" A typical page of a programming book may contain", PSTART,
|
||||
PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"examples of source code to exemplify an algorithm "},
|
||||
{"being described in prose. Such examples should be"},
|
||||
{"rendered as lineated text, meaning text with "},
|
||||
{"explicit line breaks but without extra inter-line "},
|
||||
{"spacing. Accidentally finding stray paragraphs in"},
|
||||
{"source code would lead to a bad reading experience"},
|
||||
{"when the text is re-flowed. "},
|
||||
{" Let's show this by describing the function fact-", PSTART,
|
||||
PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"orial. Factorial is a simple recursive function "},
|
||||
{"which grows very quickly. So quickly, in fact, "},
|
||||
{"that the typical C implementation will only work "},
|
||||
{"for values less than about 12: "},
|
||||
{" ", PNONE},
|
||||
{" # Naive implementation in C "},
|
||||
{" int factorial(int n) { "},
|
||||
{" if (n < 2) "},
|
||||
{" return 1; "},
|
||||
{" return n * factorial(n - 1); "},
|
||||
{" } "},
|
||||
{" "},
|
||||
{" The C programming language does not have built- ", PSTART,
|
||||
PModel(kLeft, 0, 20, 0, 0)},
|
||||
{"in support for detecting integer overflow, so this"},
|
||||
{"naive implementation simply returns random values "},
|
||||
{"if even a moderate sized n is provided. "},
|
||||
{" A typical page of a programming book may contain", PSTART,
|
||||
PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"examples of source code to exemplify an algorithm ", PCONT, PModel(), false, false},
|
||||
{"being described in prose. Such examples should be", PCONT, PModel(), false, false},
|
||||
{"rendered as lineated text, meaning text with ", PCONT, PModel(), false, false},
|
||||
{"explicit line breaks but without extra inter-line ", PCONT, PModel(), false, false},
|
||||
{"spacing. Accidentally finding stray paragraphs in", PCONT, PModel(), false, false},
|
||||
{"source code would lead to a bad reading experience", PCONT, PModel(), false, false},
|
||||
{"when the text is re-flowed. ", PCONT, PModel(), false, false},
|
||||
{" Let's show this by describing the function fact-", PSTART,
|
||||
PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"orial. Factorial is a simple recursive function ", PCONT, PModel(), false, false},
|
||||
{"which grows very quickly. So quickly, in fact, ", PCONT, PModel(), false, false},
|
||||
{"that the typical C implementation will only work ", PCONT, PModel(), false, false},
|
||||
{"for values less than about 12: ", PCONT, PModel(), false, false},
|
||||
{" ", PNONE, PModel(), false, false},
|
||||
{" # Naive implementation in C ", PCONT, PModel(), false, false},
|
||||
{" int factorial(int n) { ", PCONT, PModel(), false, false},
|
||||
{" if (n < 2) ", PCONT, PModel(), false, false},
|
||||
{" return 1; ", PCONT, PModel(), false, false},
|
||||
{" return n * factorial(n - 1); ", PCONT, PModel(), false, false},
|
||||
{" } ", PCONT, PModel(), false, false},
|
||||
{" ", PCONT, PModel(), false, false},
|
||||
{" The C programming language does not have built- ", PSTART,
|
||||
PModel(kLeft, 0, 20, 0, 0), false, false},
|
||||
{"in support for detecting integer overflow, so this", PCONT, PModel(), false, false},
|
||||
{"naive implementation simply returns random values ", PCONT, PModel(), false, false},
|
||||
{"if even a moderate sized n is provided. ", PCONT, PModel(), false, false},
|
||||
};
|
||||
|
||||
TEST(ParagraphsTest, NotDistractedBySourceCode) {
|
||||
@ -575,103 +595,105 @@ TEST(ParagraphsTest, NotDistractedBySourceCode) {
|
||||
}
|
||||
|
||||
const TextAndModel kOldManAndSea[] = {
|
||||
{"royal palm which are called guano and in it there was a bed, a",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{"table, one chair, and a place on the dirt floor to cook with charcoal."},
|
||||
{"On the brown walls of the flattened, overlapping leaves of the"},
|
||||
{"sturdy fibered guano there was a picture in color of the Sacred"},
|
||||
{"Heart of Jesus and another of the Virgin of Cobre. These were"},
|
||||
{"relics of his wife. Once there had been a tinted photograph of his"},
|
||||
{"wife on the wall but he had taken it down because it made him too"},
|
||||
{"lonely to see it and it was on the shelf in the corner under his clean"},
|
||||
{"shirt. "},
|
||||
{" \"What do you have to eat?\" the boy asked. ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{" \"A pot of yellow rice with fish. Do you want some?\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{" \"No. I will eat at home. Do you want me to make the fire?\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{" \"No. I will make it later on. Or I may eat the rice cold.\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{" \"May I take the cast net?\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{" \"Of course.\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{" There was no cast net and the boy remembered when they had",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{"sold it. But they went through this fiction every day. There was no"},
|
||||
{"pot of yellow rice and fish and the boy knew this too. "
|
||||
" "},
|
||||
{" \"Eighty-five is a lucky number,\" the old man said. \"How",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{"would you like to see me bring one in that dressed out over a "
|
||||
"thou-"},
|
||||
{"sand pounds? "
|
||||
" "},
|
||||
{" \"I'll get the cast net and go for sardines. Will you sit in the "
|
||||
"sun",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{"in the doorway?\" "
|
||||
" "},
|
||||
{" \"Yes. I have yesterday's paper and I will read the baseball.\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{" The boy did not know whether yesterday's paper was a fiction",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{"too. But the old man brought it out from under the bed. "},
|
||||
{" \"Pedrico gave it to me at the bodega,\" he explained. "
|
||||
" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{" \"I'll be back when I have the sardines. I'll keep yours and mine",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{"together on ice and we can share them in the morning. When I"},
|
||||
{"come back you can tell me about the baseball.\" "},
|
||||
{" \"The Yankees cannot lose.\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{" \"But I fear the Indians of Cleveland.\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{" \"Have faith in the Yankees my son. Think of the great Di-",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{"Maggio.\" "},
|
||||
{" \"I fear both the Tigers of Detroit and the Indians of Cleve-",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0)},
|
||||
{"land.\" "}};
|
||||
{"royal palm which are called guano and in it there was a bed, a",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"table, one chair, and a place on the dirt floor to cook with charcoal.", PCONT, PModel(), false, false},
|
||||
{"On the brown walls of the flattened, overlapping leaves of the", PCONT, PModel(), false, false},
|
||||
{"sturdy fibered guano there was a picture in color of the Sacred", PCONT, PModel(), false, false},
|
||||
{"Heart of Jesus and another of the Virgin of Cobre. These were", PCONT, PModel(), false, false},
|
||||
{"relics of his wife. Once there had been a tinted photograph of his", PCONT, PModel(), false, false},
|
||||
{"wife on the wall but he had taken it down because it made him too", PCONT, PModel(), false, false},
|
||||
{"lonely to see it and it was on the shelf in the corner under his clean", PCONT, PModel(), false, false},
|
||||
{"shirt. ", PCONT, PModel(), false, false},
|
||||
{" \"What do you have to eat?\" the boy asked. ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"A pot of yellow rice with fish. Do you want some?\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"No. I will eat at home. Do you want me to make the fire?\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"No. I will make it later on. Or I may eat the rice cold.\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"May I take the cast net?\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"Of course.\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" There was no cast net and the boy remembered when they had",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"sold it. But they went through this fiction every day. There was no", PCONT, PModel(), false, false},
|
||||
{"pot of yellow rice and fish and the boy knew this too. "
|
||||
" ", PCONT, PModel(), false, false},
|
||||
{" \"Eighty-five is a lucky number,\" the old man said. \"How",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"would you like to see me bring one in that dressed out over a "
|
||||
"thou-", PCONT, PModel(), false, false},
|
||||
{"sand pounds? "
|
||||
" ", PCONT, PModel(), false, false},
|
||||
{" \"I'll get the cast net and go for sardines. Will you sit in the "
|
||||
"sun",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"in the doorway?\" "
|
||||
" ", PCONT, PModel(), false, false},
|
||||
{" \"Yes. I have yesterday's paper and I will read the baseball.\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" The boy did not know whether yesterday's paper was a fiction",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"too. But the old man brought it out from under the bed. ", PCONT, PModel(), false, false},
|
||||
{" \"Pedrico gave it to me at the bodega,\" he explained. "
|
||||
" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"I'll be back when I have the sardines. I'll keep yours and mine",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"together on ice and we can share them in the morning. When I", PCONT, PModel(), false, false},
|
||||
{"come back you can tell me about the baseball.\" ", PCONT, PModel(), false, false},
|
||||
{" \"The Yankees cannot lose.\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"But I fear the Indians of Cleveland.\" ",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{" \"Have faith in the Yankees my son. Think of the great Di-",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"Maggio.\" ", PCONT, PModel(), false, false},
|
||||
{" \"I fear both the Tigers of Detroit and the Indians of Cleve-",
|
||||
PSTART, PModel(kLeft, 0, 50, 0, 0), false, false},
|
||||
{"land.\" ", PCONT, PModel(), false, false}
|
||||
};
|
||||
|
||||
TEST(ParagraphsTest, NotOverlyAggressiveWithBlockQuotes) {
|
||||
TestParagraphDetection(kOldManAndSea, ABSL_ARRAYSIZE(kOldManAndSea));
|
||||
}
|
||||
|
||||
const TextAndModel kNewZealandIndex[] = {
|
||||
{"Oats, 51 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"O'Brien, Gregory, 175 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Occupational composition, 110,", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{" 138 "},
|
||||
{"OECD rankings, 155, 172 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Okiato (original capital), 47 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Oil shock: 1974, xxx, 143; 1979,", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{" 145 "},
|
||||
{"Old Age Pensions, xxii, 89-90 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Old World evils, 77 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Oliver, W. H., 39, 77, 89 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Olssen, Erik, 45, 64, 84 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Olympic Games, 1924, 111, 144 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Once on Chunuk Bair, 149 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Once Were Warriors, xxxiii, 170", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"On—shore whaling, xvi ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Opotiki, xix ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Orakau battle of, xviii, 57 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"O’Regan, Tipene, 170, 198-99 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Organic agriculture, 177 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Orwell, George, 151 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Otago, xvii, 45, 49-50, 70 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Otago block, xvii ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Otago Daily Times, 67 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Otago Girls’ High School, xix, 61,", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{" 85 "},
|
||||
{"Otago gold rushes, 61-63 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Otago Peninsula, xx ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Otago Provincial Council, 68 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Otaki, 33 ", PSTART, PModel(kLeft, 0, 0, 30, 0)},
|
||||
{"Owls Do Cry, 139 ", PSTART, PModel(kLeft, 0, 0, 30, 0)}};
|
||||
{"Oats, 51 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"O'Brien, Gregory, 175 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Occupational composition, 110,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{" 138 ", PCONT, PModel(), false, false},
|
||||
{"OECD rankings, 155, 172 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Okiato (original capital), 47 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Oil shock: 1974, xxx, 143; 1979,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{" 145 ", PCONT, PModel(), false, false},
|
||||
{"Old Age Pensions, xxii, 89-90 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Old World evils, 77 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Oliver, W. H., 39, 77, 89 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Olssen, Erik, 45, 64, 84 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Olympic Games, 1924, 111, 144 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Once on Chunuk Bair, 149 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Once Were Warriors, xxxiii, 170", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"On—shore whaling, xvi ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Opotiki, xix ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Orakau battle of, xviii, 57 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"O’Regan, Tipene, 170, 198-99 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Organic agriculture, 177 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Orwell, George, 151 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Otago, xvii, 45, 49-50, 70 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Otago block, xvii ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Otago Daily Times, 67 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Otago Girls’ High School, xix, 61,", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{" 85 ", PCONT, PModel(), false, false},
|
||||
{"Otago gold rushes, 61-63 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Otago Peninsula, xx ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Otago Provincial Council, 68 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Otaki, 33 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false},
|
||||
{"Owls Do Cry, 139 ", PSTART, PModel(kLeft, 0, 0, 30, 0), false, false}
|
||||
};
|
||||
|
||||
TEST(ParagraphsTest, IndexPageTest) {
|
||||
TestParagraphDetection(kNewZealandIndex, ABSL_ARRAYSIZE(kNewZealandIndex));
|
||||
|
Loading…
Reference in New Issue
Block a user