From f3362a4b5b3b3d1545fafd11f3c62cf8ff6c5b52 Mon Sep 17 00:00:00 2001 From: Shree Devi Kumar Date: Sun, 10 Feb 2019 18:31:31 +0000 Subject: [PATCH] Add renderer to create WordStr box files from images --- CMakeLists.txt | 1 + src/api/Makefile.am | 1 + src/api/baseapi.h | 10 +++- src/api/renderer.h | 11 ++++ src/api/tesseractmain.cpp | 14 +++++ src/api/wordstrboxrenderer.cpp | 101 +++++++++++++++++++++++++++++++++ src/ccmain/tesseractclass.cpp | 2 + src/ccmain/tesseractclass.h | 1 + tessdata/configs/wordstrbox | 1 + 9 files changed, 141 insertions(+), 1 deletion(-) create mode 100644 src/api/wordstrboxrenderer.cpp create mode 100644 tessdata/configs/wordstrbox diff --git a/CMakeLists.txt b/CMakeLists.txt index 62a6f5a363..73d4ed8091 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -254,6 +254,7 @@ set(tesseract_src ${tesseract_src} src/api/hocrrenderer.cpp src/api/lstmboxrenderer.cpp src/api/pdfrenderer.cpp + src/api/wordstrboxrenderer.cpp ) if (WIN32) diff --git a/src/api/Makefile.am b/src/api/Makefile.am index 894d957e56..2ab9118447 100644 --- a/src/api/Makefile.am +++ b/src/api/Makefile.am @@ -37,6 +37,7 @@ libtesseract_api_la_SOURCES += altorenderer.cpp libtesseract_api_la_SOURCES += hocrrenderer.cpp libtesseract_api_la_SOURCES += lstmboxrenderer.cpp libtesseract_api_la_SOURCES += pdfrenderer.cpp +libtesseract_api_la_SOURCES += wordstrboxrenderer.cpp libtesseract_api_la_SOURCES += renderer.cpp lib_LTLIBRARIES += libtesseract.la diff --git a/src/api/baseapi.h b/src/api/baseapi.h index d32ded8161..31c43b1d9a 100644 --- a/src/api/baseapi.h +++ b/src/api/baseapi.h @@ -630,7 +630,15 @@ class TESS_API TessBaseAPI { * Returned string must be freed with the delete [] operator. */ char* GetBoxText(int page_number); - + + /** + * The recognized text is returned as a char* which is coded in the same + * format as a WordStr box file used in training. + * page_number is a 0-based page index that will appear in the box file. 
+ * Returned string must be freed with the delete [] operator. + */ + char* GetWordStrBoxText(int page_number); + /** * The recognized text is returned as a char* which is coded * as UNLV format Latin-1 with specific reject and suspect codes. diff --git a/src/api/renderer.h b/src/api/renderer.h index 95998f5a8a..6d5e0a5310 100644 --- a/src/api/renderer.h +++ b/src/api/renderer.h @@ -269,6 +269,17 @@ class TESS_API TessBoxTextRenderer : public TessResultRenderer { virtual bool AddImageHandler(TessBaseAPI* api); }; +/** + * Renders tesseract output into a plain UTF-8 text string in WordStr format + */ +class TESS_API TessWordStrBoxRenderer : public TessResultRenderer { + public: + explicit TessWordStrBoxRenderer(const char* outputbase); + + protected: + virtual bool AddImageHandler(TessBaseAPI* api); +}; + #ifndef DISABLED_LEGACY_ENGINE /** diff --git a/src/api/tesseractmain.cpp b/src/api/tesseractmain.cpp index bc3084c0a4..ef565934f9 100644 --- a/src/api/tesseractmain.cpp +++ b/src/api/tesseractmain.cpp @@ -524,6 +524,20 @@ static void PreloadRenderers( } } + api->GetBoolVariable("tessedit_create_wordstrbox", &b); + if (b) { + tesseract::TessWordStrBoxRenderer* renderer = + new tesseract::TessWordStrBoxRenderer(outputbase); + if (renderer->happy()) { + renderers->push_back(renderer); + } else { + delete renderer; + tprintf("Error, could not create WordStr BOX output file: %s\n", + strerror(errno)); + error = true; + } + } + api->GetBoolVariable("tessedit_create_txt", &b); if (b || (!error && renderers->empty())) { tesseract::TessTextRenderer* renderer = diff --git a/src/api/wordstrboxrenderer.cpp b/src/api/wordstrboxrenderer.cpp new file mode 100644 index 0000000000..fc0881fe6d --- /dev/null +++ b/src/api/wordstrboxrenderer.cpp @@ -0,0 +1,101 @@ +/********************************************************************** + * File: wordstrboxrenderer.cpp + * Description: Renderer for creating box file with WordStr strings. + * based on the tsv renderer. 
+ * + * (C) Copyright 2006, Google Inc. + ** Licensed under the Apache License, Version 2.0 (the "License"); + ** you may not use this file except in compliance with the License. + ** You may obtain a copy of the License at + ** http://www.apache.org/licenses/LICENSE-2.0 + ** Unless required by applicable law or agreed to in writing, software + ** distributed under the License is distributed on an "AS IS" BASIS, + ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + ** See the License for the specific language governing permissions and + ** limitations under the License. + * + **********************************************************************/ + +#include "baseapi.h" // for TessBaseAPI +#include "renderer.h" +#include "tesseractclass.h" // for Tesseract + +namespace tesseract { + +/** + * Create a UTF8 box file with WordStr strings from the internal data structures. + * page_number is a 0-base page index that will appear in the box file. + * Returned string must be freed with the delete [] operator. 
+ * Returns nullptr if no results can be produced, "" if there are no words.
+ */
+char* TessBaseAPI::GetWordStrBoxText(int page_number) {
+  if (tesseract_ == nullptr || (page_res_ == nullptr && Recognize(nullptr) < 0))
+    return nullptr;
+
+  // Owning the iterator releases it on every return path.
+  const std::unique_ptr<LTRResultIterator> res_it(GetLTRIterator());
+  if (res_it == nullptr) return nullptr;
+
+  STRING wordstr_box_str("");
+  // Current text line box; zero-initialised so no read is indeterminate.
+  int left = 0, top = 0, right = 0, bottom = 0;
+  bool first_line = true;
+  // Appends prefix, x0, image_height_ - bottom, x1, image_height_ - top, page.
+  const auto append_box = [&](const char* prefix, int x0, int x1) {
+    wordstr_box_str.add_str_int(prefix, x0);
+    wordstr_box_str.add_str_int(" ", image_height_ - bottom);
+    wordstr_box_str.add_str_int(" ", x1);
+    wordstr_box_str.add_str_int(" ", image_height_ - top);
+    wordstr_box_str.add_str_int(" ", page_number);
+  };
+  // Ends a line with a tab entry placed just right of the line box.
+  const auto append_end_of_line = [&]() {
+    append_box("\n\t ", right + 1, right + 5);
+    wordstr_box_str += "\n";
+  };
+
+  while (!res_it->Empty(RIL_BLOCK)) {
+    if (res_it->Empty(RIL_WORD)) {
+      res_it->Next(RIL_WORD);
+      continue;
+    }
+    if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
+      // Close the previous line before starting the next one.
+      if (!first_line) append_end_of_line();
+      first_line = false;
+      // Use bounding box for whole line for WordStr
+      res_it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom);
+      append_box("WordStr ", left, right);
+      wordstr_box_str += " #";
+    }
+    do {
+      wordstr_box_str +=
+          std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_WORD)).get();
+      wordstr_box_str += " ";
+      res_it->Next(RIL_WORD);
+    } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
+  }
+  // An empty page starts no line, so there is nothing to terminate.
+  if (!first_line) append_end_of_line();
+  char* ret = new char[wordstr_box_str.length() + 1];
+  strcpy(ret, wordstr_box_str.string());
+  return ret;
+}
+
+/**********************************************************************
+ * WordStrBox Renderer interface implementation
+ **********************************************************************/
+TessWordStrBoxRenderer::TessWordStrBoxRenderer(const char* outputbase)
+    : TessResultRenderer(outputbase, "box") {}  // written to a .box file
+
+// Appends the WordStr box text of the current image; false if there is none.
+bool TessWordStrBoxRenderer::AddImageHandler(TessBaseAPI* api) {
+  // Array form: the buffer comes from new[] and must go through delete[].
+  const std::unique_ptr<const char[]> wordstrbox(
+      api->GetWordStrBoxText(imagenum()));
+  if (wordstrbox == nullptr) return false;
+  AppendString(wordstrbox.get());
+  return true;
+}
+
+}  // namespace tesseract.
diff --git a/src/ccmain/tesseractclass.cpp b/src/ccmain/tesseractclass.cpp
index 83ecbc2a4f..128114d3f2 100644
--- a/src/ccmain/tesseractclass.cpp
+++ b/src/ccmain/tesseractclass.cpp
@@ -395,6 +395,8 @@ Tesseract::Tesseract()
                   this->params()),
       BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
                   this->params()),
+      BOOL_MEMBER(tessedit_create_wordstrbox, false, "Write WordStr format .box output file",
+                  this->params()),
       BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
                   this->params()),
       BOOL_MEMBER(textonly_pdf, false,
diff --git a/src/ccmain/tesseractclass.h b/src/ccmain/tesseractclass.h
index 0fc1d04aeb..a9b1f8a85e 100644
--- a/src/ccmain/tesseractclass.h
+++ b/src/ccmain/tesseractclass.h
@@ -1042,6 +1042,7 @@ class Tesseract : public Wordrec {
   BOOL_VAR_H(tessedit_create_alto, false, "Write .xml ALTO output file");
   BOOL_VAR_H(tessedit_create_lstmbox, false, "Write .box file for LSTM training");
   BOOL_VAR_H(tessedit_create_tsv, false, "Write .tsv output file");
+  BOOL_VAR_H(tessedit_create_wordstrbox, false, "Write WordStr format .box output file");
   BOOL_VAR_H(tessedit_create_pdf, false, "Write .pdf output file");
   BOOL_VAR_H(textonly_pdf, false,
              "Create PDF with only one invisible text layer");
diff --git a/tessdata/configs/wordstrbox b/tessdata/configs/wordstrbox
new file mode 100644
index 0000000000..38cd41cd60
--- /dev/null
+++ b/tessdata/configs/wordstrbox
@@ -0,0 +1 @@
+tessedit_create_wordstrbox 1