Replace remaining STRING by std::string in src/dict

Signed-off-by: Stefan Weil <sw@weilnetz.de>
rvrsh3ll · Mar 15, 2021 · 21cf7cf · 21cf7cf
1 parent 21d9aad
commit 21cf7cf
Show file tree

Hide file tree

Showing 17 changed files with 65 additions and 74 deletions.
diff --git a/src/ccutil/unicharcompress.cpp b/src/ccutil/unicharcompress.cpp
@@ -47,11 +47,10 @@ using RSMap = std::unordered_map<int, std::unique_ptr<std::vector<int>>>;
 // A hash map to count occurrences of each radical encoding.
 using RSCounts = std::unordered_map<int, int>;
 
-static bool DecodeRadicalLine(STRING *radical_data_line, RSMap *radical_map) {
-  if (radical_data_line->length() == 0 || (*radical_data_line)[0] == '#')
+static bool DecodeRadicalLine(std::string &radical_data_line, RSMap *radical_map) {
+  if (radical_data_line.length() == 0 || (radical_data_line)[0] == '#')
     return true;
-  std::vector<STRING> entries;
-  radical_data_line->split(' ', &entries);
+  std::vector<std::string> entries = split(radical_data_line, ' ');
   if (entries.size() < 2)
     return false;
   char *end = nullptr;
@@ -73,11 +72,10 @@ static bool DecodeRadicalLine(STRING *radical_data_line, RSMap *radical_map) {
 // already been read into a STRING. Returns false on error.
 // The radical_stroke_table is non-const because it gets split and the caller
 // is unlikely to want to use it again.
-static bool DecodeRadicalTable(STRING *radical_data, RSMap *radical_map) {
-  std::vector<STRING> lines;
-  radical_data->split('\n', &lines);
+static bool DecodeRadicalTable(std::string &radical_data, RSMap *radical_map) {
+  std::vector<std::string> lines = split(radical_data, '\n');
   for (int i = 0; i < lines.size(); ++i) {
-    if (!DecodeRadicalLine(&lines[i], radical_map)) {
+    if (!DecodeRadicalLine(lines[i], radical_map)) {
       tprintf("Invalid format in radical table at line %d: %s\n", i, lines[i].c_str());
       return false;
     }
@@ -105,9 +103,9 @@ UnicharCompress &UnicharCompress::operator=(const UnicharCompress &src) {
 // input string radical_stroke_table.
 // Returns false if the encoding cannot be constructed.
 bool UnicharCompress::ComputeEncoding(const UNICHARSET &unicharset, int null_id,
-                                      STRING *radical_stroke_table) {
+                                      std::string *radical_stroke_table) {
   RSMap radical_map;
-  if (radical_stroke_table != nullptr && !DecodeRadicalTable(radical_stroke_table, &radical_map))
+  if (radical_stroke_table != nullptr && !DecodeRadicalTable(*radical_stroke_table, &radical_map))
     return false;
   encoder_.clear();
   UNICHARSET direct_set;

diff --git a/src/ccutil/unicharcompress.h b/src/ccutil/unicharcompress.h
@@ -152,7 +152,7 @@ class TESS_API UnicharCompress {
   // the file training/langdata/radical-stroke.txt have been read into the
   // input string radical_stroke_table.
   // Returns false if the encoding cannot be constructed.
-  bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, STRING *radical_stroke_table);
+  bool ComputeEncoding(const UNICHARSET &unicharset, int null_id, std::string *radical_stroke_table);
   // Sets up an encoder that doesn't change the unichars at all, so it just
   // passes them through unchanged.
   void SetupPassThrough(const UNICHARSET &unicharset);

diff --git a/src/dict/dawg.cpp b/src/dict/dawg.cpp
@@ -24,7 +24,6 @@
 
 #include "dict.h"
 #include "helpers.h"
-#include "strngs.h"
 #include "tprintf.h"
 
 #include <memory>
@@ -105,7 +104,7 @@ void Dawg::iterate_words(const UNICHARSET &unicharset,
 }
 
 static void CallWithUTF8(std::function<void(const char *)> cb, const WERD_CHOICE *wc) {
-  STRING s;
+  std::string s;
   wc->string_and_lengths(&s, nullptr);
   cb(s.c_str());
 }

diff --git a/src/dict/dawg.h b/src/dict/dawg.h
@@ -119,7 +119,7 @@ class TESS_API Dawg {
   inline DawgType type() const {
     return type_;
   }
-  inline const STRING &lang() const {
+  inline const std::string &lang() const {
     return lang_;
   }
   inline PermuterType permuter() const {
@@ -194,7 +194,7 @@ class TESS_API Dawg {
   }
 
 protected:
-  Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
+  Dawg(DawgType type, const std::string &lang, PermuterType perm, int debug_level)
       : lang_(lang), type_(type), perm_(perm), unicharset_size_(0), debug_level_(debug_level) {}
 
   /// Returns the next node visited by following this edge.
@@ -280,7 +280,7 @@ class TESS_API Dawg {
                          std::function<void(const WERD_CHOICE *)> cb) const;
 
   // Member Variables.
-  STRING lang_;
+  std::string lang_;
   DawgType type_;
   /// Permuter code that should be used if the word is found in this Dawg.
   PermuterType perm_;
@@ -384,17 +384,17 @@ class DawgPositionVector : public GenericVector<DawgPosition> {
 //
 class TESS_API SquishedDawg : public Dawg {
 public:
-  SquishedDawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
+  SquishedDawg(DawgType type, const std::string &lang, PermuterType perm, int debug_level)
       : Dawg(type, lang, perm, debug_level) {}
-  SquishedDawg(const char *filename, DawgType type, const STRING &lang, PermuterType perm,
+  SquishedDawg(const char *filename, DawgType type, const std::string &lang, PermuterType perm,
                int debug_level)
       : Dawg(type, lang, perm, debug_level) {
     TFile file;
     ASSERT_HOST(file.Open(filename, nullptr));
     ASSERT_HOST(read_squished_dawg(&file));
     num_forward_edges_in_node0 = num_forward_edges(0);
   }
-  SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type, const STRING &lang,
+  SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type, const std::string &lang,
                PermuterType perm, int unicharset_size, int debug_level)
       : Dawg(type, lang, perm, debug_level), edges_(edges), num_edges_(num_edges) {
     init(unicharset_size);

diff --git a/src/dict/dawg_cache.cpp b/src/dict/dawg_cache.cpp
@@ -20,13 +20,12 @@
 
 #include "dawg.h"
 #include "object_cache.h"
-#include "strngs.h"
 #include "tessdatamanager.h"
 
 namespace tesseract {
 
 struct DawgLoader {
-  DawgLoader(const STRING &lang, TessdataType tessdata_dawg_type, int dawg_debug_level,
+  DawgLoader(const std::string &lang, TessdataType tessdata_dawg_type, int dawg_debug_level,
              TessdataManager *data_file)
       : lang_(lang)
       , data_file_(data_file)
@@ -35,13 +34,13 @@ struct DawgLoader {
 
   Dawg *Load();
 
-  STRING lang_;
+  std::string lang_;
   TessdataManager *data_file_;
   TessdataType tessdata_dawg_type_;
   int dawg_debug_level_;
 };
 
-Dawg *DawgCache::GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type,
+Dawg *DawgCache::GetSquishedDawg(const std::string &lang, TessdataType tessdata_dawg_type,
                                  int debug_level, TessdataManager *data_file) {
   std::string data_id = data_file->GetDataFileName();
   data_id += kTessdataFileSuffixes[tessdata_dawg_type];

diff --git a/src/dict/dawg_cache.h b/src/dict/dawg_cache.h
@@ -22,14 +22,13 @@
 
 #include "dawg.h"
 #include "object_cache.h"
-#include "strngs.h"
 #include "tessdatamanager.h"
 
 namespace tesseract {
 
 class DawgCache {
 public:
-  Dawg *GetSquishedDawg(const STRING &lang, TessdataType tessdata_dawg_type, int debug_level,
+  Dawg *GetSquishedDawg(const std::string &lang, TessdataType tessdata_dawg_type, int debug_level,
                         TessdataManager *data_file);
 
   // If we manage the given dawg, decrement its count,

diff --git a/src/dict/dict.cpp b/src/dict/dict.cpp
@@ -195,7 +195,7 @@ void Dict::SetupForLoad(DawgCache *dawg_cache) {
 }
 
 // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
-void Dict::Load(const STRING &lang, TessdataManager *data_file) {
+void Dict::Load(const std::string &lang, TessdataManager *data_file) {
   // Load dawgs_.
   if (load_punc_dawg) {
     punc_dawg_ =
@@ -281,7 +281,7 @@ void Dict::Load(const STRING &lang, TessdataManager *data_file) {
 }
 
 // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
-void Dict::LoadLSTM(const STRING &lang, TessdataManager *data_file) {
+void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
   // Load dawgs_.
   if (load_punc_dawg) {
     punc_dawg_ =

diff --git a/src/dict/dict.h b/src/dict/dict.h
@@ -283,9 +283,9 @@ class TESS_API Dict {
   // Sets up ready for a Load or LoadLSTM.
   void SetupForLoad(DawgCache *dawg_cache);
   // Loads the dawgs needed by Tesseract. Call FinishLoad() after.
-  void Load(const STRING &lang, TessdataManager *data_file);
+  void Load(const std::string &lang, TessdataManager *data_file);
   // Loads the dawgs needed by the LSTM model. Call FinishLoad() after.
-  void LoadLSTM(const STRING &lang, TessdataManager *data_file);
+  void LoadLSTM(const std::string &lang, TessdataManager *data_file);
   // Completes the loading process after Load() and/or LoadLSTM().
   // Returns false if no dictionaries were loaded.
   bool FinishLoad();

diff --git a/src/dict/permdawg.cpp b/src/dict/permdawg.cpp
@@ -113,12 +113,12 @@ void Dict::go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &
             tprintf("Failed to open output_ambig_words_file %s\n", output_ambig_words_file.c_str());
             exit(1);
           }
-          STRING word_str;
+          std::string word_str;
           word->string_and_lengths(&word_str, nullptr);
           word_str += " ";
           fprintf(output_ambig_words_file_, "%s", word_str.c_str());
         }
-        STRING word_str;
+        std::string word_str;
         word->string_and_lengths(&word_str, nullptr);
         word_str += " ";
         fprintf(output_ambig_words_file_, "%s", word_str.c_str());

diff --git a/src/dict/trie.cpp b/src/dict/trie.cpp
@@ -260,15 +260,15 @@ NODE_REF Trie::new_dawg_node() {
 
 bool Trie::read_and_add_word_list(const char *filename, const UNICHARSET &unicharset,
                                   Trie::RTLReversePolicy reverse_policy) {
-  std::vector<STRING> word_list;
+  std::vector<std::string> word_list;
   if (!read_word_list(filename, &word_list))
     return false;
   std::sort(word_list.begin(), word_list.end(),
             [](auto &s1, auto &s2) { return s1.size() > s2.size(); });
   return add_word_list(word_list, unicharset, reverse_policy);
 }
 
-bool Trie::read_word_list(const char *filename, std::vector<STRING> *words) {
+bool Trie::read_word_list(const char *filename, std::vector<std::string> *words) {
   FILE *word_file;
   char line_str[CHARS_PER_LINE];
   int word_count = 0;
@@ -279,7 +279,7 @@ bool Trie::read_word_list(const char *filename, std::vector<STRING> *words) {
 
   while (fgets(line_str, sizeof(line_str), word_file) != nullptr) {
     chomp_string(line_str); // remove newline
-    STRING word_str(line_str);
+    std::string word_str(line_str);
     ++word_count;
     if (debug_level_ && word_count % 10000 == 0)
       tprintf("Read %d words so far\n", word_count);
@@ -291,7 +291,7 @@ bool Trie::read_word_list(const char *filename, std::vector<STRING> *words) {
   return true;
 }
 
-bool Trie::add_word_list(const std::vector<STRING> &words, const UNICHARSET &unicharset,
+bool Trie::add_word_list(const std::vector<std::string> &words, const UNICHARSET &unicharset,
                          Trie::RTLReversePolicy reverse_policy) {
   for (int i = 0; i < words.size(); ++i) {
     WERD_CHOICE word(words[i].c_str(), unicharset);

diff --git a/src/dict/trie.h b/src/dict/trie.h
@@ -79,7 +79,7 @@ class TESS_API Trie : public Dawg {
   // Trie can consume (if a new word insert would cause the Trie to
   // contain more edges than max_num_edges, all the edges are cleared
   // so that new inserts can proceed).
-  Trie(DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
+  Trie(DawgType type, const std::string &lang, PermuterType perm, int unicharset_size, int debug_level)
       : Dawg(type, lang, perm, debug_level) {
     init(unicharset_size);
     num_edges_ = 0;
@@ -173,11 +173,11 @@ class TESS_API Trie : public Dawg {
 
   // Reads a list of words from the given file.
   // Returns false on error.
-  bool read_word_list(const char *filename, std::vector<STRING> *words);
+  bool read_word_list(const char *filename, std::vector<std::string> *words);
   // Adds a list of words previously read using read_word_list to the trie
   // using the given unicharset and reverse_policy to convert to unichar-ids.
   // Returns false on error.
-  bool add_word_list(const std::vector<STRING> &words, const UNICHARSET &unicharset,
+  bool add_word_list(const std::vector<std::string> &words, const UNICHARSET &unicharset,
                      Trie::RTLReversePolicy reverse_policy);
 
   // Inserts the list of patterns from the given file into the Trie.

diff --git a/src/training/combine_lang_model.cpp b/src/training/combine_lang_model.cpp
@@ -50,11 +50,10 @@ int main(int argc, char **argv) {
   tesseract::CheckSharedLibraryVersion();
   tesseract::ParseCommandLineFlags(argv[0], &argc, &argv, true);
 
-  std::vector<STRING> words, puncs, numbers;
   // If these reads fail, we get a warning message and an empty list of words.
-  tesseract::ReadFile(FLAGS_words.c_str(), nullptr).split('\n', &words);
-  tesseract::ReadFile(FLAGS_puncs.c_str(), nullptr).split('\n', &puncs);
-  tesseract::ReadFile(FLAGS_numbers.c_str(), nullptr).split('\n', &numbers);
+  std::vector<std::string> words = split(tesseract::ReadFile(FLAGS_words.c_str()), '\n');
+  std::vector<std::string> puncs = split(tesseract::ReadFile(FLAGS_puncs.c_str()), '\n');
+  std::vector<std::string> numbers = split(tesseract::ReadFile(FLAGS_numbers.c_str()), '\n');
   // Load the input unicharset
   UNICHARSET unicharset;
   if (!unicharset.load_from_file(FLAGS_input_unicharset.c_str(), false)) {

diff --git a/src/training/unicharset/lang_model_helpers.cpp b/src/training/unicharset/lang_model_helpers.cpp
@@ -56,21 +56,21 @@ bool WriteFile(const std::string &output_dir, const std::string &lang, const std
     return (*writer)(data, filename.c_str());
 }
 
-// Helper reads a file with optional reader and returns a STRING.
-// On failure emits a warning message and returns and empty STRING.
-STRING ReadFile(const std::string &filename, FileReader reader) {
+// Helper reads a file with optional reader and returns a string.
+// On failure emits a warning message and returns an empty string.
+std::string ReadFile(const std::string &filename, FileReader reader) {
   if (filename.empty())
-    return STRING();
+    return std::string();
   std::vector<char> data;
   bool read_result;
   if (reader == nullptr)
     read_result = LoadDataFromFile(filename.c_str(), &data);
   else
     read_result = (*reader)(filename.c_str(), &data);
   if (read_result)
-    return STRING(&data[0], data.size());
+    return std::string(&data[0], data.size());
   tprintf("Failed to read data from: %s\n", filename.c_str());
-  return STRING();
+  return std::string();
 }
 
 // Helper writes the unicharset to file and to the traineddata.
@@ -89,7 +89,7 @@ bool WriteUnicharset(const UNICHARSET &unicharset, const std::string &output_dir
 // Helper creates the recoder and writes it to the traineddata, and a human-
 // readable form to file.
 bool WriteRecoder(const UNICHARSET &unicharset, bool pass_through, const std::string &output_dir,
-                  const std::string &lang, FileWriter writer, STRING *radical_table_data,
+                  const std::string &lang, FileWriter writer, std::string *radical_table_data,
                   TessdataManager *traineddata) {
   UnicharCompress recoder;
   // Where the unicharset is carefully setup already to contain a good
@@ -116,7 +116,7 @@ bool WriteRecoder(const UNICHARSET &unicharset, bool pass_through, const std::st
   if (!recoder.Serialize(&fp))
     return false;
   traineddata->OverwriteEntry(TESSDATA_LSTM_RECODER, &recoder_data[0], recoder_data.size());
-  STRING encoding = recoder.GetEncodingAsString(unicharset);
+  std::string encoding = recoder.GetEncodingAsString(unicharset);
   recoder_data.resize(encoding.length(), 0);
   memcpy(&recoder_data[0], &encoding[0], encoding.length());
   std::string suffix;
@@ -127,7 +127,7 @@ bool WriteRecoder(const UNICHARSET &unicharset, bool pass_through, const std::st
 
 // Helper builds a dawg from the given words, using the unicharset as coding,
 // and reverse_policy for LTR/RTL, and overwrites file_type in the traineddata.
-static bool WriteDawg(const std::vector<STRING> &words, const UNICHARSET &unicharset,
+static bool WriteDawg(const std::vector<std::string> &words, const UNICHARSET &unicharset,
                       Trie::RTLReversePolicy reverse_policy, TessdataType file_type,
                       TessdataManager *traineddata) {
   // The first 3 arguments are not used in this case.
@@ -149,8 +149,8 @@ static bool WriteDawg(const std::vector<STRING> &words, const UNICHARSET &unicha
 // Builds and writes the dawgs, given a set of words, punctuation
 // patterns, number patterns, to the traineddata. Encoding uses the given
 // unicharset, and the punc dawgs is reversed if lang_is_rtl.
-static bool WriteDawgs(const std::vector<STRING> &words, const std::vector<STRING> &puncs,
-                       const std::vector<STRING> &numbers, bool lang_is_rtl,
+static bool WriteDawgs(const std::vector<std::string> &words, const std::vector<std::string> &puncs,
+                       const std::vector<std::string> &numbers, bool lang_is_rtl,
                        const UNICHARSET &unicharset, TessdataManager *traineddata) {
   if (puncs.empty()) {
     tprintf("Must have non-empty puncs list to use language models!!\n");
@@ -185,8 +185,8 @@ static bool WriteDawgs(const std::vector<STRING> &words, const std::vector<STRIN
 int CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir,
                      const std::string &version_str, const std::string &output_dir,
                      const std::string &lang, bool pass_through_recoder,
-                     const std::vector<STRING> &words, const std::vector<STRING> &puncs,
-                     const std::vector<STRING> &numbers, bool lang_is_rtl, FileReader reader,
+                     const std::vector<std::string> &words, const std::vector<std::string> &puncs,
+                     const std::vector<std::string> &numbers, bool lang_is_rtl, FileReader reader,
                      FileWriter writer) {
   // Build the traineddata file.
   TessdataManager traineddata;
@@ -202,12 +202,12 @@ int CombineLangModel(const UNICHARSET &unicharset, const std::string &script_dir
   }
   // If there is a config file, read it and add to traineddata.
   std::string config_filename = script_dir + "/" + lang + "/" + lang + ".config";
-  STRING config_file = ReadFile(config_filename, reader);
+  std::string config_file = ReadFile(config_filename, reader);
   if (config_file.length() > 0) {
     traineddata.OverwriteEntry(TESSDATA_LANG_CONFIG, &config_file[0], config_file.length());
   }
   std::string radical_filename = script_dir + "/radical-stroke.txt";
-  STRING radical_data = ReadFile(radical_filename, reader);
+  std::string radical_data = ReadFile(radical_filename, reader);
   if (radical_data.length() == 0) {
     tprintf("Error reading radical code table %s\n", radical_filename.c_str());
     return EXIT_FAILURE;