More std::vector.

egorpugin committed Jan 7, 2021
1 parent 154ea6b commit 9710bc0
Showing 28 changed files with 236 additions and 208 deletions.
2 changes: 1 addition & 1 deletion src/ccmain/recogtraining.cpp
@@ -221,7 +221,7 @@ void Tesseract::ambigs_classify_and_output(const char* label,
ASSERT_HOST(best_choice != nullptr);

// Compute the number of unichars in the label.
GenericVector<UNICHAR_ID> encoding;
std::vector<UNICHAR_ID> encoding;
if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
tprintf("Not outputting illegal unichar %s\n", label);
return;
4 changes: 2 additions & 2 deletions src/ccstruct/blamer.cpp
@@ -78,8 +78,8 @@ void BlamerBundle::SetWordTruth(const UNICHARSET& unicharset,
truth_word_.InsertBox(0, word_box);
truth_has_char_boxes_ = false;
// Encode the string as UNICHAR_IDs.
GenericVector<UNICHAR_ID> encoding;
GenericVector<char> lengths;
std::vector<UNICHAR_ID> encoding;
std::vector<char> lengths;
unicharset.encode_string(truth_str, false, &encoding, &lengths, nullptr);
int total_length = 0;
for (int i = 0; i < encoding.size(); total_length += lengths[i++]) {
4 changes: 2 additions & 2 deletions src/ccstruct/ratngs.cpp
@@ -217,8 +217,8 @@ const char *ScriptPosToString(enum ScriptPos script_pos) {
WERD_CHOICE::WERD_CHOICE(const char *src_string,
const UNICHARSET &unicharset)
: unicharset_(&unicharset){
GenericVector<UNICHAR_ID> encoding;
GenericVector<char> lengths;
std::vector<UNICHAR_ID> encoding;
std::vector<char> lengths;
std::string cleaned = unicharset.CleanupString(src_string);
if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths,
nullptr)) {
4 changes: 2 additions & 2 deletions src/ccutil/ambigs.cpp
@@ -130,7 +130,7 @@ void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET& encoder_set,
}
// Update ambigs_for_adaption_.
if (use_ambigs_for_adaption) {
GenericVector<UNICHAR_ID> encoding;
std::vector<UNICHAR_ID> encoding;
// Silently ignore invalid strings, as before, so it is safe to use a
// universal ambigs file.
if (unicharset->encode_string(replacement_string, true, &encoding,
@@ -235,7 +235,7 @@ bool UnicharAmbigs::ParseAmbiguityLine(
return false;
}
// Encode wrong-string.
GenericVector<UNICHAR_ID> unichars;
std::vector<UNICHAR_ID> unichars;
if (!unicharset.encode_string(fields[0].c_str(), true, &unichars, nullptr,
nullptr)) {
return false;
28 changes: 28 additions & 0 deletions src/ccutil/serialis.h
@@ -98,6 +98,34 @@ class TESS_API TFile {
return FWrite(data, sizeof(T), count) == count;
}

template <typename T>
bool Serialize(const std::vector<T>& data) {
auto size_used_ = data.size();
if (FWrite(&size_used_, sizeof(size_used_), 1) != 1) {
return false;
}
if (FWrite(data.data(), sizeof(T), size_used_) != size_used_) {
return false;
}
return true;
}

template <typename T>
bool DeSerialize(std::vector<T>& data) {
uint32_t reserved;
if (FReadEndian(&reserved, sizeof(reserved), 1) != 1) {
return false;
}
// Arbitrarily limit the number of elements to protect against bad data.
const uint32_t limit = 50000000;
//assert(reserved <= limit);
if (reserved > limit) {
return false;
}
data.reserve(reserved);
return FReadEndian(data.data(), sizeof(T), reserved) == reserved;
}

// Skip data.
bool Skip(size_t count);

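As a standalone illustration of the length-prefixed pattern that the new TFile::Serialize/DeSerialize templates follow, the sketch below writes and reads a std::vector with plain C stdio instead of TFile. The function names, the element limit, and the use of resize() are illustrative assumptions rather than part of this commit, and the sketch only applies to trivially copyable element types.

#include <cstdint>
#include <cstdio>
#include <vector>

// Write the element count first, then the raw elements (trivially copyable T only).
template <typename T>
bool WriteVector(FILE* fp, const std::vector<T>& data) {
  const uint32_t count = static_cast<uint32_t>(data.size());
  if (fwrite(&count, sizeof(count), 1, fp) != 1) return false;
  return fwrite(data.data(), sizeof(T), count, fp) == count;
}

// Read the count, sanity-check it, then read that many elements back.
template <typename T>
bool ReadVector(FILE* fp, std::vector<T>& data) {
  uint32_t count;
  if (fread(&count, sizeof(count), 1, fp) != 1) return false;
  const uint32_t kLimit = 50000000;  // reject absurd counts from corrupt input
  if (count > kLimit) return false;
  data.resize(count);  // size the buffer so data() is writable for 'count' elements
  return fread(data.data(), sizeof(T), count, fp) == count;
}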
44 changes: 22 additions & 22 deletions src/ccutil/unicharset.cpp
@@ -212,8 +212,8 @@ UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
// WARNING: this function now encodes the whole string for precision.
// Use encode_string in preference to repeatedly calling step.
int UNICHARSET::step(const char* str) const {
GenericVector<UNICHAR_ID> encoding;
GenericVector<char> lengths;
std::vector<UNICHAR_ID> encoding;
std::vector<char> lengths;
encode_string(str, true, &encoding, &lengths, nullptr);
if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
return lengths[0];
@@ -224,7 +224,7 @@ int UNICHARSET::step(const char* str) const {
// into the second (return) argument.
bool UNICHARSET::encodable_string(const char *str,
int *first_bad_position) const {
GenericVector<UNICHAR_ID> encoding;
std::vector<UNICHAR_ID> encoding;
return encode_string(str, true, &encoding, nullptr, first_bad_position);
}

@@ -238,13 +238,13 @@ bool UNICHARSET::encodable_string(const char *str,
// that do not belong in the unicharset, or encoding may fail.
// Use CleanupString to perform the cleaning.
bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure,
GenericVector<UNICHAR_ID>* encoding,
GenericVector<char>* lengths,
std::vector<UNICHAR_ID>* encoding,
std::vector<char>* lengths,
int* encoded_length) const {
GenericVector<UNICHAR_ID> working_encoding;
GenericVector<char> working_lengths;
GenericVector<char> best_lengths;
encoding->truncate(0); // Just in case str is empty.
std::vector<UNICHAR_ID> working_encoding;
std::vector<char> working_lengths;
std::vector<char> best_lengths;
encoding->resize(0); // Just in case str is empty.
int str_length = strlen(str);
int str_pos = 0;
bool perfect = true;
@@ -352,13 +352,13 @@ STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
// Sets the normed_ids vector from the normed string. normed_ids is not
// stored in the file, and needs to be set when the UNICHARSET is loaded.
void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) {
unichars[unichar_id].properties.normed_ids.truncate(0);
unichars[unichar_id].properties.normed_ids.resize(0);
if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
} else if (!encode_string(unichars[unichar_id].properties.normed.c_str(),
true, &unichars[unichar_id].properties.normed_ids,
nullptr, nullptr)) {
unichars[unichar_id].properties.normed_ids.truncate(0);
unichars[unichar_id].properties.normed_ids.resize(0);
unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
}
}
@@ -481,11 +481,11 @@ bool UNICHARSET::SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const {
// the overall process of encoding a partially failed string more efficient.
// See unicharset.h for definition of the args.
void UNICHARSET::encode_string(const char* str, int str_index, int str_length,
GenericVector<UNICHAR_ID>* encoding,
GenericVector<char>* lengths,
std::vector<UNICHAR_ID>* encoding,
std::vector<char>* lengths,
int* best_total_length,
GenericVector<UNICHAR_ID>* best_encoding,
GenericVector<char>* best_lengths) const {
std::vector<UNICHAR_ID>* best_encoding,
std::vector<char>* best_lengths) const {
if (str_index > *best_total_length) {
// This is the best result so far.
*best_total_length = str_index;
@@ -509,8 +509,8 @@ void UNICHARSET::encode_string(const char* str, int str_index, int str_length,
if (*best_total_length == str_length)
return; // Tail recursion success!
// Failed with that length, truncate back and try again.
encoding->truncate(encoding_index);
lengths->truncate(encoding_index);
encoding->resize(encoding_index);
lengths->resize(encoding_index);
}
int step = UNICHAR::utf8_step(str + str_index + length);
if (step == 0) step = 1;
@@ -528,7 +528,7 @@ bool UNICHARSET::GetStrProperties(const char* utf8_str,
props->Init();
props->SetRangesEmpty();
int total_unicodes = 0;
GenericVector<UNICHAR_ID> encoding;
std::vector<UNICHAR_ID> encoding;
if (!encode_string(utf8_str, true, &encoding, nullptr, nullptr))
return false; // Some part was invalid.
for (int i = 0; i < encoding.size(); ++i) {
@@ -611,7 +611,7 @@ void UNICHARSET::unichar_insert(const char* const unichar_repr,
old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
const char* str = cleaned.c_str();
GenericVector<int> encoding;
std::vector<int> encoding;
if (!old_style_included_ &&
encode_string(str, true, &encoding, nullptr, nullptr))
return;
@@ -950,7 +950,7 @@ void UNICHARSET::set_black_and_whitelist(const char* blacklist,
unichars[ch].properties.enabled = def_enabled;
if (!def_enabled) {
// Enable the whitelist.
GenericVector<UNICHAR_ID> encoding;
std::vector<UNICHAR_ID> encoding;
encode_string(whitelist, false, &encoding, nullptr, nullptr);
for (int i = 0; i < encoding.size(); ++i) {
if (encoding[i] != INVALID_UNICHAR_ID)
@@ -959,7 +959,7 @@
}
if (blacklist != nullptr && blacklist[0] != '\0') {
// Disable the blacklist.
GenericVector<UNICHAR_ID> encoding;
std::vector<UNICHAR_ID> encoding;
encode_string(blacklist, false, &encoding, nullptr, nullptr);
for (int i = 0; i < encoding.size(); ++i) {
if (encoding[i] != INVALID_UNICHAR_ID)
@@ -968,7 +968,7 @@ void UNICHARSET::set_black_and_whitelist(const char* blacklist,
}
if (unblacklist != nullptr && unblacklist[0] != '\0') {
// Re-enable the unblacklist.
GenericVector<UNICHAR_ID> encoding;
std::vector<UNICHAR_ID> encoding;
encode_string(unblacklist, false, &encoding, nullptr, nullptr);
for (int i = 0; i < encoding.size(); ++i) {
if (encoding[i] != INVALID_UNICHAR_ID)
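The one behavioral point worth noting in this file is the roll-back step in the recursive encoder, where GenericVector::truncate(n) becomes std::vector::resize(n); for n no larger than the current size, both simply drop the trailing elements. A minimal sketch of that pattern follows; the function and parameter names are illustrative, not from the commit.

#include <vector>

// Remember the current length, try an extension, and resize() back on failure,
// mirroring the encoding->resize(encoding_index) calls above.
bool ExtendPath(std::vector<int>& path, int candidate,
                bool (*search)(const std::vector<int>&)) {
  const size_t mark = path.size();  // position to roll back to
  path.push_back(candidate);
  if (search(path)) return true;    // extension worked, keep it
  path.resize(mark);                // equivalent of GenericVector::truncate(mark)
  return false;
}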
18 changes: 9 additions & 9 deletions src/ccutil/unicharset.h
@@ -227,8 +227,8 @@ class TESS_API UNICHARSET {
// that do not belong in the unicharset, or encoding may fail.
// Use CleanupString to perform the cleaning.
bool encode_string(const char* str, bool give_up_on_failure,
GenericVector<UNICHAR_ID>* encoding,
GenericVector<char>* lengths,
std::vector<UNICHAR_ID>* encoding,
std::vector<char>* lengths,
int* encoded_length) const;

// Return the unichar representation corresponding to the given UNICHAR_ID
@@ -467,7 +467,7 @@ class TESS_API UNICHARSET {
// Record normalized version of unichar with the given unichar_id.
void set_normed(UNICHAR_ID unichar_id, const char* normed) {
unichars[unichar_id].properties.normed = normed;
unichars[unichar_id].properties.normed_ids.truncate(0);
unichars[unichar_id].properties.normed_ids.resize(0);
}
// Sets the normed_ids vector from the normed string. normed_ids is not
// stored in the file, and needs to be set when the UNICHARSET is loaded.
@@ -818,7 +818,7 @@ class TESS_API UNICHARSET {
// Returns a vector of UNICHAR_IDs that represent the ids of the normalized
// version of the given id. There may be more than one UNICHAR_ID in the
// vector if unichar_id represents a ligature.
const GenericVector<UNICHAR_ID>& normed_ids(UNICHAR_ID unichar_id) const {
const std::vector<UNICHAR_ID>& normed_ids(UNICHAR_ID unichar_id) const {
return unichars[unichar_id].properties.normed_ids;
}

@@ -946,7 +946,7 @@ class TESS_API UNICHARSET {
// A string of unichar_ids that represent the corresponding normed string.
// For awkward characters like em-dash, this gives hyphen.
// For ligatures, this gives the string of normal unichars.
GenericVector<UNICHAR_ID> normed_ids;
std::vector<UNICHAR_ID> normed_ids;
STRING normed; // normalized version of this unichar
// Contains meta information about the fragment if a unichar represents
// a fragment of a character, otherwise should be set to nullptr.
@@ -972,11 +972,11 @@
// best_encoding contains the encoding that used the longest part of str.
// best_lengths (may be null) contains the lengths of best_encoding.
void encode_string(const char* str, int str_index, int str_length,
GenericVector<UNICHAR_ID>* encoding,
GenericVector<char>* lengths,
std::vector<UNICHAR_ID>* encoding,
std::vector<char>* lengths,
int* best_total_length,
GenericVector<UNICHAR_ID>* best_encoding,
GenericVector<char>* best_lengths) const;
std::vector<UNICHAR_ID>* best_encoding,
std::vector<char>* best_lengths) const;

// Gets the properties for a grapheme string, combining properties for
// multiple characters in a meaningful way where possible.
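A hypothetical caller of the public encode_string overload with its new std::vector parameters might look like the sketch below; CountUnichars and its error handling are illustrative, and only the encode_string signature comes from this header.

#include <vector>
#include "unicharset.h"

// Count how many unichars the given UTF-8 string encodes to.
bool CountUnichars(const UNICHARSET& unicharset, const char* utf8, int* n) {
  std::vector<UNICHAR_ID> encoding;
  std::vector<char> lengths;  // byte length of each unichar within 'utf8'
  if (!unicharset.encode_string(utf8, /*give_up_on_failure=*/true,
                                &encoding, &lengths, nullptr)) {
    return false;  // some part of the string is not encodable
  }
  *n = static_cast<int>(encoding.size());
  return true;
}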
10 changes: 5 additions & 5 deletions src/dict/dict.cpp
@@ -824,24 +824,24 @@ bool Dict::valid_bigram(const WERD_CHOICE& word1,
if (w2start >= w2end) return word2.length() < 3;

const UNICHARSET& uchset = getUnicharset();
GenericVector<UNICHAR_ID> bigram_string;
std::vector<UNICHAR_ID> bigram_string;
bigram_string.reserve(w1end + w2end + 1);
for (int i = w1start; i < w1end; i++) {
const GenericVector<UNICHAR_ID>& normed_ids =
const auto &normed_ids =
getUnicharset().normed_ids(word1.unichar_id(i));
if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
bigram_string.push_back(question_unichar_id_);
else
bigram_string += normed_ids;
bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
}
bigram_string.push_back(UNICHAR_SPACE);
for (int i = w2start; i < w2end; i++) {
const GenericVector<UNICHAR_ID>& normed_ids =
const auto &normed_ids =
getUnicharset().normed_ids(word2.unichar_id(i));
if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
bigram_string.push_back(question_unichar_id_);
else
bigram_string += normed_ids;
bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
}
WERD_CHOICE normalized_word(&uchset, bigram_string.size());
for (int i = 0; i < bigram_string.size(); ++i) {
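std::vector has no operator+= for concatenation, which is why the hunk above replaces bigram_string += normed_ids with an insert() over the source's full range. A tiny helper capturing the same operation is sketched below; the name Append is illustrative and assumes dst and src are distinct vectors.

#include <vector>

// Append all elements of 'src' to the end of 'dst', like GenericVector's +=.
template <typename T>
void Append(std::vector<T>& dst, const std::vector<T>& src) {
  dst.insert(dst.end(), src.begin(), src.end());
}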
6 changes: 3 additions & 3 deletions src/dict/dict.h
@@ -116,7 +116,7 @@ class TESS_API Dict {
inline bool compound_marker(UNICHAR_ID unichar_id) {
const UNICHARSET& unicharset = getUnicharset();
ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
const GenericVector<UNICHAR_ID>& normed_ids =
const auto &normed_ids =
unicharset.normed_ids(unichar_id);
return normed_ids.size() == 1 &&
(normed_ids[0] == hyphen_unichar_id_ ||
@@ -127,7 +127,7 @@ class TESS_API Dict {
inline bool is_apostrophe(UNICHAR_ID unichar_id) {
const UNICHARSET& unicharset = getUnicharset();
ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
const GenericVector<UNICHAR_ID>& normed_ids =
const auto &normed_ids =
unicharset.normed_ids(unichar_id);
return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
}
@@ -157,7 +157,7 @@
if (!last_word_on_line_ || first_pos)
return false;
ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
const GenericVector<UNICHAR_ID>& normed_ids =
const auto &normed_ids =
unicharset->normed_ids(unichar_id);
return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
}
2 changes: 1 addition & 1 deletion src/dict/permdawg.cpp
@@ -62,7 +62,7 @@ void Dict::go_deeper_dawg_fxn(
}
int num_unigrams = 0;
word->remove_last_unichar_id();
GenericVector<UNICHAR_ID> encoding;
std::vector<UNICHAR_ID> encoding;
const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
// Since the string came out of the unicharset, failure is impossible.
ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr,