Skip to content

Commit

Permalink
Fail if no valid lstmf file was written (fix issue tesseract-ocr#2741)
Browse files (browse the repository at this point in the history)
Signed-off-by: Stefan Weil <sw@weilnetz.de>
  • Loading branch information
stweil committed Nov 1, 2019
1 parent 94d0f77 commit a306cd7
Show file tree
Hide file tree
Showing 3 changed files with 12 additions and 6 deletions.
4 changes: 3 additions & 1 deletion src/api/baseapi.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -855,7 +855,9 @@ int TessBaseAPI::Recognize(ETEXT_DESC* monitor) {
}

if (tesseract_->tessedit_train_line_recognizer) {
- tesseract_->TrainLineRecognizer(*input_file_, *output_file_, block_list_);
+ if (!tesseract_->TrainLineRecognizer(*input_file_, *output_file_, block_list_)) {
+ return -1;
+ }
tesseract_->CorrectClassifyWords(page_res_);
return 0;
}
Expand Down
11 changes: 7 additions & 4 deletions src/ccmain/linerec.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -39,7 +39,8 @@ const float kWorstDictCertainty = -25.0f;
// Generates training data for training a line recognizer, eg LSTM.
// Breaks the page into lines, according to the boxes, and writes them to a
// serialized DocumentData based on output_basename.
- void Tesseract::TrainLineRecognizer(const STRING& input_imagename,
+ // Return true if successful, false if an error occurred.
+ bool Tesseract::TrainLineRecognizer(const STRING& input_imagename,
const STRING& output_basename,
BLOCK_LIST *block_list) {
STRING lstmf_name = output_basename + ".lstmf";
Expand All @@ -48,7 +49,7 @@ void Tesseract::TrainLineRecognizer(const STRING& input_imagename,
// Load existing document for the previous pages.
if (!images.LoadDocument(lstmf_name.c_str(), 0, 0, nullptr)) {
tprintf("Failed to read training data from %s!\n", lstmf_name.c_str());
- return;
+ return false;
}
}
GenericVector<TBOX> boxes;
Expand All @@ -58,17 +59,19 @@ void Tesseract::TrainLineRecognizer(const STRING& input_imagename,
nullptr) ||
boxes.empty()) {
tprintf("Failed to read boxes from %s\n", input_imagename.c_str());
- return;
+ return false;
}
TrainFromBoxes(boxes, texts, block_list, &images);
if (images.NumPages() <= 0) {
tprintf("Failed to read pages from %s\n", input_imagename.c_str());
- return;
+ return false;
}
images.Shuffle();
if (!images.SaveDocument(lstmf_name.c_str(), nullptr)) {
tprintf("Failed to write training data to %s!\n", lstmf_name.c_str());
+ return false;
}
+ return true;
}

// Generates training data for training a line recognizer, eg LSTM.
Expand Down
3 changes: 2 additions & 1 deletion src/ccmain/tesseractclass.h
Original file line number Diff line number Diff line change
Expand Up @@ -334,7 +334,8 @@ class Tesseract : public Wordrec {
// Generates training data for training a line recognizer, eg LSTM.
// Breaks the page into lines, according to the boxes, and writes them to a
// serialized DocumentData based on output_basename.
- void TrainLineRecognizer(const STRING& input_imagename,
+ // Return true if successful, false if an error occurred.
+ bool TrainLineRecognizer(const STRING& input_imagename,
const STRING& output_basename,
BLOCK_LIST* block_list);
// Generates training data for training a line recognizer, eg LSTM.
Expand Down

0 comments on commit a306cd7

Please sign in to comment.