Skip to content

Commit

Permalink
Replace remaining STRING by std::string in src/dict
Browse files Browse the repository at this point in the history
Signed-off-by: Stefan Weil <sw@weilnetz.de>
  • Loading branch information
stweil committed Mar 15, 2021
1 parent 21d9aad commit 21cf7cf
Show file tree
Hide file tree
Showing 17 changed files with 65 additions and 74 deletions.
18 changes: 8 additions & 10 deletions src/ccutil/unicharcompress.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -47,11 +47,10 @@ using RSMap = std::unordered_map<int, std::unique_ptr<std::vector<int>>>;
// A hash map to count occurrences of each radical encoding.
using RSCounts = std::unordered_map<int, int>;

static bool DecodeRadicalLine(STRING *radical_data_line, RSMap *radical_map) {
if (radical_data_line->length() == 0 || (*radical_data_line)[0] == '#')
static bool DecodeRadicalLine(std::string &radical_data_line, RSMap *radical_map) {
if (radical_data_line.length() == 0 || (radical_data_line)[0] == '#')
return true;
std::vector<STRING> entries;
radical_data_line->split(' ', &entries);
std::vector<std::string> entries = split(radical_data_line, ' ');
if (entries.size() < 2)
return false;
char *end = nullptr;
Expand All @@ -73,11 +72,10 @@ static bool DecodeRadicalLine(STRING *radical_data_line, RSMap *radical_map) {
// already been read into a std::string. Returns false on error.
// The radical_stroke_table is non-const because it gets split and the caller
// is unlikely to want to use it again.
static bool DecodeRadicalTable(STRING *radical_data, RSMap *radical_map) {
std::vector<STRING> lines;
radical_data->split('\n', &lines);
static bool DecodeRadicalTable(std::string &radical_data, RSMap *radical_map) {
std::vector<std::string> lines = split(radical_data, '\n');
for (int i = 0; i < lines.size(); ++i) {
if (!DecodeRadicalLine(&lines[i], radical_map)) {
if (!DecodeRadicalLine(lines[i], radical_map)) {
tprintf("Invalid format in radical table at line %d: %s\n", i, lines[i].c_str());
return false;
}
Expand Down Expand Up @@ -105,9 +103,9 @@ UnicharCompress &UnicharCompress::operator=(const UnicharCompress &src) {
// input string radical_stroke_table.
// Returns false if the encoding cannot be constructed.
bool UnicharCompress::ComputeEncoding(const UNICHARSET &unicharset, int null_id,
STRING *radical_stroke_table) {
std::string *radical_stroke_table) {
RSMap radical_map;
if (radical_stroke_table != nullptr && !DecodeRadicalTable(radical_stroke_table, &radical_map))
if (radical_stroke_table != nullptr && !DecodeRadicalTable(*radical_stroke_table, &radical_map))
return false;
encoder_.clear();
UNICHARSET direct_set;
Expand Down
2 changes: 1 addition & 1 deletion src/ccutil/unicharcompress.h
Original file line number Diff line number Diff line change
Expand Up @@ -152,7 +152,7 @@ class TESS_API UnicharCompress {
// the file training/langdata/radical-stroke.txt have been read into the
// input string radical_stroke_table.
// Returns false if the encoding cannot be constructed.
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, STRING *radical_stroke_table);
bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table);
// Sets up an encoder that doesn't change the unichars at all, so it just
// passes them through unchanged.
void SetupPassThrough(const UNICHARSET &unicharset);
Expand Down
3 changes: 1 addition & 2 deletions src/dict/dawg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@

#include "dict.h"
#include "helpers.h"
#include "strngs.h"
#include "tprintf.h"

#include <memory>
Expand Down Expand Up @@ -105,7 +104,7 @@ void Dawg::iterate_words(const UNICHARSET &unicharset,
}

static void CallWithUTF8(std::function<void(const char *)> cb, const WERD_CHOICE *wc) {
STRING s;
std::string s;
wc->string_and_lengths(&s, nullptr);
cb(s.c_str());
}
Expand Down
12 changes: 6 additions & 6 deletions src/dict/dawg.h
Original file line number Diff line number Diff line change
Expand Up @@ -119,7 +119,7 @@ class TESS_API Dawg {
inline DawgType type() const {
return type_;
}
inline const STRING &lang() const {
inline const std::string &lang() const {
return lang_;
}
inline PermuterType permuter() const {
Expand Down Expand Up @@ -194,7 +194,7 @@ class TESS_API Dawg {
}

protected:
Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
Dawg(DawgType type, const std::string &lang, PermuterType perm, int debug_level)
: lang_(lang), type_(type), perm_(perm), unicharset_size_(0), debug_level_(debug_level) {}

/// Returns the next node visited by following this edge.
Expand Down Expand Up @@ -280,7 +280,7 @@ class TESS_API Dawg {
std::function<void(const WERD_CHOICE *)> cb) const;

// Member Variables.
STRING lang_;
std::string lang_;
DawgType type_;
/// Permuter code that should be used if the word is found in this Dawg.
PermuterType perm_;
Expand Down Expand Up @@ -384,17 +384,17 @@ class DawgPositionVector : public GenericVector<DawgPosition> {
//
class TESS_API SquishedDawg : public Dawg {
public:
SquishedDawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
SquishedDawg(DawgType type, const std::string &lang, PermuterType perm, int debug_level)
: Dawg(type, lang, perm, debug_level) {}
SquishedDawg(const char *filename, DawgType type, const STRING &lang, PermuterType perm,
SquishedDawg(const char *filename, DawgType type, const std::string &lang, PermuterType perm,
int debug_level)
: Dawg(type, lang, perm, debug_level) {
TFile file;
ASSERT_HOST(file.Open(filename, nullptr));
ASSERT_HOST(read_squished_dawg(&file));
num_forward_edges_in_node0 = num_forward_edges(0);
}
SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type, const STRING &lang,
SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type, const std::string &lang,
PermuterType perm, int unicharset_size, int debug_level)
: Dawg(type, lang, perm, debug_level), edges_(edges), num_edges_(num_edges) {
init(unicharset_size);
Expand Down
7 changes: 3 additions & 4 deletions src/dict/dawg_cache.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -20,13 +20,12 @@

#include "dawg.h"
#include "object_cache.h"
#include "strngs.h"
#include "tessdatamanager.h"

namespace tesseract {

struct DawgLoader {
DawgLoader(const STRING &lang, TessdataType tessdata_dawg_type, int dawg_debug_level,
DawgLoader(const std::string &lang, TessdataType tessdata_dawg_type, int dawg_debug_level,
TessdataManager *data_file)
: lang_(lang)
, data_file_(data_file)
Expand All @@ -35,13 +34,13 @@ struct DawgLoader {

Dawg *Load();

STRING lang_;
std::string lang_;
TessdataManager *data_file_;
TessdataType tessdata_dawg_type_;
int dawg_debug_level_;
};

Dawg *DawgCache::GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type,
Dawg *DawgCache::GetSquishedDawg(const std::string &lang, TessdataType tessdata_dawg_type,
int debug_level, TessdataManager *data_file) {
std::string data_id = data_file->GetDataFileName();
data_id += kTessdataFileSuffixes[tessdata_dawg_type];
Expand Down
3 changes: 1 addition & 2 deletions src/dict/dawg_cache.h
Original file line number Diff line number Diff line change
Expand Up @@ -22,14 +22,13 @@

#include "dawg.h"
#include "object_cache.h"
#include "strngs.h"
#include "tessdatamanager.h"

namespace tesseract {

class DawgCache {
public:
Dawg *GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, int debug_level,
Dawg *GetSquishedDawg(const std::string &lang, TessdataType tessdata_dawg_type, int debug_level,
TessdataManager *data_file);

// If we manage the given dawg, decrement its count,
Expand Down
4 changes: 2 additions & 2 deletions src/dict/dict.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -195,7 +195,7 @@ void Dict::SetupForLoad(DawgCache *dawg_cache) {
}

// Loads the dawgs needed by Tesseract. Call FinishLoad() after.
void Dict::Load(const STRING &lang, TessdataManager *data_file) {
void Dict::Load(const std::string &lang, TessdataManager *data_file) {
// Load dawgs_.
if (load_punc_dawg) {
punc_dawg_ =
Expand Down Expand Up @@ -281,7 +281,7 @@ void Dict::Load(const STRING &lang, TessdataManager *data_file) {
}

// Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
void Dict::LoadLSTM(const STRING &lang, TessdataManager *data_file) {
void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
// Load dawgs_.
if (load_punc_dawg) {
punc_dawg_ =
Expand Down
4 changes: 2 additions & 2 deletions src/dict/dict.h
Original file line number Diff line number Diff line change
Expand Up @@ -283,9 +283,9 @@ class TESS_API Dict {
// Sets up ready for a Load or LoadLSTM.
void SetupForLoad(DawgCache *dawg_cache);
// Loads the dawgs needed by Tesseract. Call FinishLoad() after.
void Load(const STRING &lang, TessdataManager *data_file);
void Load(const std::string &lang, TessdataManager *data_file);
// Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
void LoadLSTM(const STRING &lang, TessdataManager *data_file);
void LoadLSTM(const std::string &lang, TessdataManager *data_file);
// Completes the loading process after Load() and/or LoadLSTM().
// Returns false if no dictionaries were loaded.
bool FinishLoad();
Expand Down
4 changes: 2 additions & 2 deletions src/dict/permdawg.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -113,12 +113,12 @@ void Dict::go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &
tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.c_str());
exit(1);
}
STRING word_str;
std::string word_str;
word->string_and_lengths(&word_str, nullptr);
word_str += " ";
fprintf(output_ambig_words_file_, "%s", word_str.c_str());
}
STRING word_str;
std::string word_str;
word->string_and_lengths(&word_str, nullptr);
word_str += " ";
fprintf(output_ambig_words_file_, "%s", word_str.c_str());
Expand Down
8 changes: 4 additions & 4 deletions src/dict/trie.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -260,15 +260,15 @@ NODE_REF Trie::new_dawg_node() {

bool Trie::read_and_add_word_list(const char *filename, const UNICHARSET &unicharset,
Trie::RTLReversePolicy reverse_policy) {
std::vector<STRING> word_list;
std::vector<std::string> word_list;
if (!read_word_list(filename, &word_list))
return false;
std::sort(word_list.begin(), word_list.end(),
[](auto &s1, auto &s2) { return s1.size() > s2.size(); });
return add_word_list(word_list, unicharset, reverse_policy);
}

bool Trie::read_word_list(const char *filename, std::vector<STRING> *words) {
bool Trie::read_word_list(const char *filename, std::vector<std::string> *words) {
FILE *word_file;
char line_str[CHARS_PER_LINE];
int word_count = 0;
Expand All @@ -279,7 +279,7 @@ bool Trie::read_word_list(const char *filename, std::vector<STRING> *words) {

while (fgets(line_str, sizeof(line_str), word_file) != nullptr) {
chomp_string(line_str); // remove newline
STRING word_str(line_str);
std::string word_str(line_str);
++word_count;
if (debug_level_ && word_count % 10000 == 0)
tprintf("Read %d words so far\n", word_count);
Expand All @@ -291,7 +291,7 @@ bool Trie::read_word_list(const char *filename, std::vector<STRING> *words) {
return true;
}

bool Trie::add_word_list(const std::vector<STRING> &words, const UNICHARSET &unicharset,
bool Trie::add_word_list(const std::vector<std::string> &words, const UNICHARSET &unicharset,
Trie::RTLReversePolicy reverse_policy) {
for (int i = 0; i < words.size(); ++i) {
WERD_CHOICE word(words[i].c_str(), unicharset);
Expand Down
6 changes: 3 additions & 3 deletions src/dict/trie.h
Original file line number Diff line number Diff line change
Expand Up @@ -79,7 +79,7 @@ class TESS_API Trie : public Dawg {
// Trie can consume (if a new word insert would cause the Trie to
// contain more edges than max_num_edges, all the edges are cleared
// so that new inserts can proceed).
Trie(DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
Trie(DawgType type, const std::string &lang, PermuterType perm, int unicharset_size, int debug_level)
: Dawg(type, lang, perm, debug_level) {
init(unicharset_size);
num_edges_ = 0;
Expand Down Expand Up @@ -173,11 +173,11 @@ class TESS_API Trie : public Dawg {

// Reads a list of words from the given file.
// Returns false on error.
bool read_word_list(const char *filename, std::vector<STRING> *words);
bool read_word_list(const char *filename, std::vector<std::string> *words);
// Adds a list of words previously read using read_word_list to the trie
// using the given unicharset and reverse_policy to convert to unichar-ids.
// Returns false on error.
bool add_word_list(const std::vector<STRING> &words, const UNICHARSET &unicharset,
bool add_word_list(const std::vector<std::string> &words, const UNICHARSET &unicharset,
Trie::RTLReversePolicy reverse_policy);

// Inserts the list of patterns from the given file into the Trie.
Expand Down
7 changes: 3 additions & 4 deletions src/training/combine_lang_model.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -50,11 +50,10 @@ int main(int argc, char **argv) {
tesseract::CheckSharedLibraryVersion();
tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);

std::vector<STRING> words, puncs, numbers;
// If these reads fail, we get a warning message and an empty list of words.
tesseract::ReadFile(FLAGS_words.c_str(), nullptr).split('\n', &words);
tesseract::ReadFile(FLAGS_puncs.c_str(), nullptr).split('\n', &puncs);
tesseract::ReadFile(FLAGS_numbers.c_str(), nullptr).split('\n', &numbers);
std::vector<std::string> words = split(tesseract::ReadFile(FLAGS_words.c_str()), '\n');
std::vector<std::string> puncs = split(tesseract::ReadFile(FLAGS_puncs.c_str()), '\n');
std::vector<std::string> numbers = split(tesseract::ReadFile(FLAGS_numbers.c_str()), '\n');
// Load the input unicharset
UNICHARSET unicharset;
if (!unicharset.load_from_file(FLAGS_input_unicharset.c_str(), false)) {
Expand Down
30 changes: 15 additions & 15 deletions src/training/unicharset/lang_model_helpers.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -56,21 +56,21 @@ bool WriteFile(const std::string &output_dir, const std::string &lang, const std
return (*writer)(data, filename.c_str());
}

// Helper reads a file with optional reader and returns a STRING.
// On failure emits a warning message and returns and empty STRING.
STRING ReadFile(const std::string &filename, FileReader reader) {
// Helper reads a file with optional reader and returns a string.
// On failure emits a warning message and returns an empty string.
std::string ReadFile(const std::string &filename, FileReader reader) {
if (filename.empty())
return STRING();
return std::string();
std::vector<char> data;
bool read_result;
if (reader == nullptr)
read_result = LoadDataFromFile(filename.c_str(), &data);
else
read_result = (*reader)(filename.c_str(), &data);
if (read_result)
return STRING(&data[0], data.size());
return std::string(&data[0], data.size());
tprintf("Failed to read data from: %s\n", filename.c_str());
return STRING();
return std::string();
}

// Helper writes the unicharset to file and to the traineddata.
Expand All @@ -89,7 +89,7 @@ bool WriteUnicharset(const UNICHARSET &unicharset, const std::string &output_dir
// Helper creates the recoder and writes it to the traineddata, and a human-
// readable form to file.
bool WriteRecoder(const UNICHARSET &unicharset, bool pass_through, const std::string &output_dir,
const std::string &lang, FileWriter writer, STRING *radical_table_data,
const std::string &lang, FileWriter writer, std::string *radical_table_data,
TessdataManager *traineddata) {
UnicharCompress recoder;
// Where the unicharset is carefully setup already to contain a good
Expand All @@ -116,7 +116,7 @@ bool WriteRecoder(const UNICHARSET &unicharset, bool pass_through, const std::st
if (!recoder.Serialize(&fp))
return false;
traineddata->OverwriteEntry(TESSDATA_LSTM_RECODER, &recoder_data[0], recoder_data.size());
STRING encoding = recoder.GetEncodingAsString(unicharset);
std::string encoding = recoder.GetEncodingAsString(unicharset);
recoder_data.resize(encoding.length(), 0);
memcpy(&recoder_data[0], &encoding[0], encoding.length());
std::string suffix;
Expand All @@ -127,7 +127,7 @@ bool WriteRecoder(const UNICHARSET &unicharset, bool pass_through, const std::st

// Helper builds a dawg from the given words, using the unicharset as coding,
// and reverse_policy for LTR/RTL, and overwrites file_type in the traineddata.
static bool WriteDawg(const std::vector<STRING> &words, const UNICHARSET &unicharset,
static bool WriteDawg(const std::vector<std::string> &words, const UNICHARSET &unicharset,
Trie::RTLReversePolicy reverse_policy, TessdataType file_type,
TessdataManager *traineddata) {
// The first 3 arguments are not used in this case.
Expand All @@ -149,8 +149,8 @@ static bool WriteDawg(const std::vector<STRING> &words, const UNICHARSET &unicha
// Builds and writes the dawgs, given a set of words, punctuation
// patterns, number patterns, to the traineddata. Encoding uses the given
// unicharset, and the punc dawgs is reversed if lang_is_rtl.
static bool WriteDawgs(const std::vector<STRING> &words, const std::vector<STRING> &puncs,
const std::vector<STRING> &numbers, bool lang_is_rtl,
static bool WriteDawgs(const std::vector<std::string> &words, const std::vector<std::string> &puncs,
const std::vector<std::string> &numbers, bool lang_is_rtl,
const UNICHARSET &unicharset, TessdataManager *traineddata) {
if (puncs.empty()) {
tprintf("Must have non-empty puncs list to use language models!!\n");
Expand Down Expand Up @@ -185,8 +185,8 @@ static bool WriteDawgs(const std::vector<STRING> &words, const std::vector<STRIN
int CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir,
const std::string &version_str, const std::string &output_dir,
const std::string &lang, bool pass_through_recoder,
const std::vector<STRING> &words, const std::vector<STRING> &puncs,
const std::vector<STRING> &numbers, bool lang_is_rtl, FileReader reader,
const std::vector<std::string> &words, const std::vector<std::string> &puncs,
const std::vector<std::string> &numbers, bool lang_is_rtl, FileReader reader,
FileWriter writer) {
// Build the traineddata file.
TessdataManager traineddata;
Expand All @@ -202,12 +202,12 @@ int CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir
}
// If there is a config file, read it and add to traineddata.
std::string config_filename = script_dir + "/" + lang + "/" + lang + ".config";
STRING config_file = ReadFile(config_filename, reader);
std::string config_file = ReadFile(config_filename, reader);
if (config_file.length() > 0) {
traineddata.OverwriteEntry(TESSDATA_LANG_CONFIG, &config_file[0], config_file.length());
}
std::string radical_filename = script_dir + "/radical-stroke.txt";
STRING radical_data = ReadFile(radical_filename, reader);
std::string radical_data = ReadFile(radical_filename, reader);
if (radical_data.length() == 0) {
tprintf("Error reading radical code table %s\n", radical_filename.c_str());
return EXIT_FAILURE;
Expand Down
Loading

0 comments on commit 21cf7cf

Please sign in to comment.