mirror of
https://github.com/tesseract-ocr/tesseract.git
synced 2025-01-18 14:41:36 +08:00
commit
c53add706e
@ -1402,12 +1402,25 @@ static void AddBoxTohOCR(const PageIterator *it,
|
||||
* STL removed from original patch submission and refactored by rays.
|
||||
*/
|
||||
char* TessBaseAPI::GetHOCRText(int page_number) {
|
||||
return GetHOCRText(NULL,page_number);
|
||||
}
|
||||
|
||||
/**
|
||||
* Make a HTML-formatted string with hOCR markup from the internal
|
||||
* data structures.
|
||||
* page_number is 0-based but will appear in the output as 1-based.
|
||||
* Image name/input_file_ can be set by SetInputName before calling
|
||||
* GetHOCRText
|
||||
* STL removed from original patch submission and refactored by rays.
|
||||
*/
|
||||
char* TessBaseAPI::GetHOCRText(struct ETEXT_DESC* monitor, int page_number) {
|
||||
if (tesseract_ == NULL ||
|
||||
(page_res_ == NULL && Recognize(NULL) < 0))
|
||||
(page_res_ == NULL && Recognize(monitor) < 0))
|
||||
return NULL;
|
||||
|
||||
int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
|
||||
int page_id = page_number + 1; // hOCR uses 1-based page numbers.
|
||||
float row_height, descenders, ascenders; // row attributes
|
||||
bool font_info = false;
|
||||
GetBoolVariable("hocr_font_info", &font_info);
|
||||
|
||||
@ -1473,7 +1486,12 @@ char* TessBaseAPI::GetHOCRText(int page_number) {
|
||||
AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
|
||||
}
|
||||
if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
|
||||
int fontsize;
|
||||
hocr_str.add_str_int("\n <span class='ocr_line' id='line_", page_id);
|
||||
res_it->RowAttributes(&row_height, &descenders, &ascenders);
|
||||
hocr_str.add_str_int("' size='", row_height);
|
||||
hocr_str.add_str_int("' descenders='", descenders * -1);
|
||||
hocr_str.add_str_int("' ascenders='", ascenders);
|
||||
hocr_str.add_str_int("_", lcnt);
|
||||
AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
|
||||
}
|
||||
|
@ -585,6 +585,16 @@ class TESS_API TessBaseAPI {
|
||||
*/
|
||||
char* GetUTF8Text();
|
||||
|
||||
/**
|
||||
* Make a HTML-formatted string with hOCR markup from the internal
|
||||
* data structures.
|
||||
* page_number is 0-based but will appear in the output as 1-based.
|
||||
* monitor may be NULL; when provided, it can be used to:
|
||||
*   - cancel the recognition
|
||||
*   - receive progress callbacks
|
||||
*/
|
||||
char* GetHOCRText(struct ETEXT_DESC* monitor, int page_number);
|
||||
|
||||
/**
|
||||
* Make a HTML-formatted string with hOCR markup from the internal
|
||||
* data structures.
|
||||
|
@ -456,7 +456,7 @@ TESS_API char* TESS_CALL TessBaseAPIGetUTF8Text(TessBaseAPI* handle)
|
||||
|
||||
/**
 * C API wrapper around TessBaseAPI::GetHOCRText for the given handle.
 *
 * No progress/cancel monitor is available through this entry point, so the
 * monitor-aware overload is called with a NULL monitor.
 *
 * @param handle       API instance to query; dereferenced without a check.
 * @param page_number  Page index forwarded unchanged to GetHOCRText.
 * @return The pointer returned by TessBaseAPI::GetHOCRText.
 */
TESS_API char* TESS_CALL TessBaseAPIGetHOCRText(TessBaseAPI* handle, int page_number)
{
    // Exactly one return: a second, unreachable return statement after the
    // first one was dead code and has been dropped.
    return handle->GetHOCRText(NULL, page_number);
}
|
||||
|
||||
TESS_API char* TESS_CALL TessBaseAPIGetBoxText(TessBaseAPI* handle, int page_number)
|
||||
|
@ -218,10 +218,22 @@ bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
|
||||
if (w > 0) word->prev_word = &(*words)[w - 1];
|
||||
if (monitor != NULL) {
|
||||
monitor->ocr_alive = TRUE;
|
||||
if (pass_n == 1)
|
||||
monitor->progress = 30 + 50 * w / words->size();
|
||||
else
|
||||
monitor->progress = 80 + 10 * w / words->size();
|
||||
if (pass_n == 1) {
|
||||
// monitor->progress = 30 + 50 * w / words->size();
|
||||
monitor->progress = 70 * w / words->size();
|
||||
if (monitor->progress_callback != NULL) {
|
||||
TBOX box = pr_it->word()->word->bounding_box();
|
||||
(*monitor->progress_callback)(monitor->progress,
|
||||
box.left(), box.right(),
|
||||
box.top(), box.bottom());
|
||||
}
|
||||
} else {
|
||||
monitor->progress = 70 + 30 * w / words->size();
|
||||
if (monitor->progress_callback!=NULL) {
|
||||
(*monitor->progress_callback)(monitor->progress,
|
||||
0, 0, 0, 0);
|
||||
}
|
||||
}
|
||||
if (monitor->deadline_exceeded() ||
|
||||
(monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
|
||||
words->size()))) {
|
||||
|
@ -145,6 +145,15 @@ float LTRResultIterator::Confidence(PageIteratorLevel level) const {
|
||||
return 0.0f;
|
||||
}
|
||||
|
||||
void LTRResultIterator::RowAttributes(float* row_height,
|
||||
float* descenders,
|
||||
float* ascenders) const {
|
||||
*row_height = it_->row()->row->x_height() + it_->row()-> row->ascenders()
|
||||
- it_->row()->row->descenders();
|
||||
*descenders = it_->row()->row->descenders();
|
||||
*ascenders = it_->row()->row->ascenders();
|
||||
}
|
||||
|
||||
// Returns the font attributes of the current word. If iterating at a higher
|
||||
// level object than words, eg textlines, then this will return the
|
||||
// attributes of the first word in that textline.
|
||||
|
@ -91,6 +91,11 @@ class TESS_API LTRResultIterator : public PageIterator {
|
||||
// The number should be interpreted as a percent probability. (0.0f-100.0f)
|
||||
float Confidence(PageIteratorLevel level) const;
|
||||
|
||||
// Returns the attributes of the current row.
|
||||
void RowAttributes(float* row_height,
|
||||
float* descenders,
|
||||
float* ascenders) const;
|
||||
|
||||
// ============= Functions that refer to words only ============.
|
||||
|
||||
// Returns the font attributes of the current word. If iterating at a higher
|
||||
|
@ -101,6 +101,8 @@ typedef struct { /*single character */
|
||||
* the OCR engine is storing its output to shared memory.
|
||||
* During progress, all the buffer info is -1.
|
||||
* Progress starts at 0 and increases to 100 during OCR. No other constraint.
|
||||
* Additionally the progress callback contains the bounding box of the word that
|
||||
* is currently being processed.
|
||||
* Every progress callback, the OCR engine must set ocr_alive to 1.
|
||||
* The HP side will set ocr_alive to 0. Repeated failure to reset
|
||||
* to 1 indicates that the OCR engine is dead.
|
||||
@ -108,6 +110,8 @@ typedef struct { /*single character */
|
||||
* user words found. If it returns true then operation is cancelled.
|
||||
**********************************************************************/
|
||||
typedef bool (*CANCEL_FUNC)(void* cancel_this, int words);
|
||||
// Progress callback type. The call sites visible in this change pass the
// monitor's current progress value followed by the left, right, top and
// bottom edges of the current word's bounding box in the first recognition
// pass, and four zeros in place of the box otherwise. Those call sites do not
// inspect the bool return value.
typedef bool (*PROGRESS_FUNC)(int progress,
|
||||
int left, int right, int top, int bottom);
|
||||
|
||||
class ETEXT_DESC { // output header
|
||||
public:
|
||||
@ -117,6 +121,7 @@ class ETEXT_DESC { // output header
|
||||
volatile inT8 ocr_alive; // ocr sets to 1, HP 0
|
||||
inT8 err_code; // for errcode use
|
||||
CANCEL_FUNC cancel; // returns true to cancel
|
||||
PROGRESS_FUNC progress_callback; // called whenever progress increases
|
||||
void* cancel_this; // this or other data for cancel
|
||||
struct timeval end_time; // time to stop. expected to be set only by call
|
||||
// to set_deadline_msecs()
|
||||
|
Loading…
Reference in New Issue
Block a user