Skip to content

Commit

Permalink
Merge pull request tesseract-ocr#2231 from Shreeshrii/wordstr
Browse files Browse the repository at this point in the history
Add renderer to create WordStr box files from images
  • Loading branch information
zdenop committed Feb 16, 2019
2 parents c78e947 + f3362a4 commit 15f2a4b
Show file tree
Hide file tree
Showing 9 changed files with 141 additions and 1 deletion.
1 change: 1 addition & 0 deletions CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -266,6 +266,7 @@ set(tesseract_src ${tesseract_src}
src/api/hocrrenderer.cpp
src/api/lstmboxrenderer.cpp
src/api/pdfrenderer.cpp
src/api/wordstrboxrenderer.cpp
)

if (WIN32)
Expand Down
1 change: 1 addition & 0 deletions src/api/Makefile.am
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ libtesseract_api_la_SOURCES += altorenderer.cpp
libtesseract_api_la_SOURCES += hocrrenderer.cpp
libtesseract_api_la_SOURCES += lstmboxrenderer.cpp
libtesseract_api_la_SOURCES += pdfrenderer.cpp
libtesseract_api_la_SOURCES += wordstrboxrenderer.cpp
libtesseract_api_la_SOURCES += renderer.cpp

lib_LTLIBRARIES += libtesseract.la
Expand Down
10 changes: 9 additions & 1 deletion src/api/baseapi.h
Original file line number Diff line number Diff line change
Expand Up @@ -630,7 +630,15 @@ class TESS_API TessBaseAPI {
* Returned string must be freed with the delete [] operator.
*/
char* GetBoxText(int page_number);


/**
 * The recognized text is returned as a char* which is coded in the same
 * format as a WordStr box file used in training.
 * Each text line yields one row starting with "WordStr", followed by the
 * line's bounding box, the page number and the line's words after a '#',
 * and then a second row starting with a tab that marks the end of the line.
 * page_number is a 0-based page index that will appear in the box file.
 * Returns nullptr if the API is not initialized or recognition fails.
 * Returned string must be freed with the delete [] operator.
 */
char* GetWordStrBoxText(int page_number);

/**
* The recognized text is returned as a char* which is coded
* as UNLV format Latin-1 with specific reject and suspect codes.
Expand Down
11 changes: 11 additions & 0 deletions src/api/renderer.h
Original file line number Diff line number Diff line change
Expand Up @@ -269,6 +269,17 @@ class TESS_API TessBoxTextRenderer : public TessResultRenderer {
virtual bool AddImageHandler(TessBaseAPI* api);
};

/**
 * Renders tesseract output into a plain UTF-8 text string in WordStr format
 * (the WordStr box file text obtained from TessBaseAPI::GetWordStrBoxText).
 */
class TESS_API TessWordStrBoxRenderer : public TessResultRenderer {
public:
// outputbase is forwarded unchanged to the TessResultRenderer base class.
explicit TessWordStrBoxRenderer(const char* outputbase);

protected:
// Per-image hook: appends the WordStr box text of the current image to the
// output. Returns false if no box text could be obtained from the API.
virtual bool AddImageHandler(TessBaseAPI* api);
};

#ifndef DISABLED_LEGACY_ENGINE

/**
Expand Down
14 changes: 14 additions & 0 deletions src/api/tesseractmain.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -524,6 +524,20 @@ static void PreloadRenderers(
}
}

api->GetBoolVariable("tessedit_create_wordstrbox", &b);
if (b) {
tesseract::TessWordStrBoxRenderer* renderer =
new tesseract::TessWordStrBoxRenderer(outputbase);
if (renderer->happy()) {
renderers->push_back(renderer);
} else {
delete renderer;
tprintf("Error, could not create WordStr BOX output file: %s\n",
strerror(errno));
error = true;
}
}

api->GetBoolVariable("tessedit_create_txt", &b);
if (b || (!error && renderers->empty())) {
tesseract::TessTextRenderer* renderer =
Expand Down
101 changes: 101 additions & 0 deletions src/api/wordstrboxrenderer.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,101 @@
/**********************************************************************
* File: wordstrboxrenderer.cpp
* Description: Renderer for creating box file with WordStr strings.
* based on the tsv renderer.
*
* (C) Copyright 2006, Google Inc.
** Licensed under the Apache License, Version 2.0 (the "License");
** you may not use this file except in compliance with the License.
** You may obtain a copy of the License at
** http://www.apache.org/licenses/LICENSE-2.0
** Unless required by applicable law or agreed to in writing, software
** distributed under the License is distributed on an "AS IS" BASIS,
** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
** See the License for the specific language governing permissions and
** limitations under the License.
*
**********************************************************************/

#include "baseapi.h" // for TessBaseAPI
#include "renderer.h"
#include "tesseractclass.h" // for Tesseract

namespace tesseract {

/**
* Create a UTF8 box file with WordStr strings from the internal data structures.
* page_number is a 0-base page index that will appear in the box file.
* Returned string must be freed with the delete [] operator.
*/

char* TessBaseAPI::GetWordStrBoxText(int page_number) {
if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
return nullptr;

STRING wordstr_box_str("");
int left, top, right, bottom;
int page_num = page_number;
bool first_line = true;

LTRResultIterator* res_it = GetLTRIterator();
while (!res_it->Empty(RIL_BLOCK)) {
if (res_it->Empty(RIL_WORD)) {
res_it->Next(RIL_WORD);
continue;
}

if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
if (!first_line) {
wordstr_box_str.add_str_int("\n\t ", right + 1);
wordstr_box_str.add_str_int(" ", image_height_ - bottom);
wordstr_box_str.add_str_int(" ", right + 5);
wordstr_box_str.add_str_int(" ", image_height_ - top);
wordstr_box_str.add_str_int(" ", page_num); // row for tab for EOL
wordstr_box_str += "\n";
} else {
first_line = false;
}
// Use bounding box for whole line for WordStr
res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
wordstr_box_str.add_str_int("WordStr ", left);
wordstr_box_str.add_str_int(" ", image_height_ - bottom);
wordstr_box_str.add_str_int(" ", right);
wordstr_box_str.add_str_int(" ", image_height_ - top);
wordstr_box_str.add_str_int(" ", page_num); // word
wordstr_box_str += " #";
}
do { wordstr_box_str +=
std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
wordstr_box_str += " ";
res_it->Next(RIL_WORD);
} while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
}
wordstr_box_str.add_str_int("\n\t ", right + 1);
wordstr_box_str.add_str_int(" ", image_height_ - bottom);
wordstr_box_str.add_str_int(" ", right + 5);
wordstr_box_str.add_str_int(" ", image_height_ - top);
wordstr_box_str.add_str_int(" ", page_num); // row for tab for EOL
wordstr_box_str += "\n";
char* ret = new char[wordstr_box_str.length() + 1];
strcpy(ret, wordstr_box_str.string());
delete res_it;
return ret;
}

/**********************************************************************
* WordStrBox Renderer interface implementation
**********************************************************************/
// Constructs the renderer by handing outputbase and the "box" suffix to the
// TessResultRenderer base class; no further state is set up here.
TessWordStrBoxRenderer::TessWordStrBoxRenderer(const char* outputbase)
    : TessResultRenderer(outputbase, "box") {}

// Fetches the WordStr box text for the current image from the API and
// appends it to the output. Returns false if the API returned nullptr,
// true otherwise.
bool TessWordStrBoxRenderer::AddImageHandler(TessBaseAPI* api) {
  // GetWordStrBoxText returns a new[]-allocated buffer; the unique_ptr
  // releases it with delete[] when this function returns.
  const std::unique_ptr<const char[]> box_text(
      api->GetWordStrBoxText(imagenum()));
  if (box_text != nullptr) {
    AppendString(box_text.get());
    return true;
  }
  return false;
}

} // namespace tesseract.
2 changes: 2 additions & 0 deletions src/ccmain/tesseractclass.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -395,6 +395,8 @@ Tesseract::Tesseract()
this->params()),
BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
this->params()),
BOOL_MEMBER(tessedit_create_wordstrbox, false, "Write WordStr format .box output file",
this->params()),
BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
this->params()),
BOOL_MEMBER(textonly_pdf, false,
Expand Down
1 change: 1 addition & 0 deletions src/ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -1042,6 +1042,7 @@ class Tesseract : public Wordrec {
BOOL_VAR_H(tessedit_create_alto, false, "Write .xml ALTO output file");
BOOL_VAR_H(tessedit_create_lstmbox, false, "Write .box file for LSTM training");
BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
BOOL_VAR_H(tessedit_create_wordstrbox, false, "Write WordStr format .box output file");
BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
BOOL_VAR_H(textonly_pdf, false,
"Create PDF with only one invisible text layer");
Expand Down
1 change: 1 addition & 0 deletions tessdata/configs/wordstrbox
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
tessedit_create_wordstrbox 1

0 comments on commit 15f2a4b

Please sign in to comment.