More std::vector.

egorpugin committed Jan 7, 2021
1 parent 154ea6b commit 9710bc0
Showing 28 changed files with 236 additions and 208 deletions.
2 changes: 1 addition & 1 deletion src/ccmain/recogtraining.cpp
@@ -221,7 +221,7 @@ void Tesseract::ambigs_classify_and_output(const char* label,
ASSERT_HOST(best_choice != nullptr);

// Compute the number of unichars in the label.
GenericVector<UNICHAR_ID> encoding;
std::vector<UNICHAR_ID> encoding;
if (!unicharset.encode_string(label, true, &encoding, nullptr, nullptr)) {
tprintf("Not outputting illegal unichar %s\n", label);
return;
4 changes: 2 additions & 2 deletions src/ccstruct/blamer.cpp
@@ -78,8 +78,8 @@ void BlamerBundle::SetWordTruth(const UNICHARSET& unicharset,
truth_word_.InsertBox(0, word_box);
truth_has_char_boxes_ = false;
// Encode the string as UNICHAR_IDs.
GenericVector<UNICHAR_ID> encoding;
GenericVector<char> lengths;
std::vector<UNICHAR_ID> encoding;
std::vector<char> lengths;
unicharset.encode_string(truth_str, false, &encoding, &lengths, nullptr);
int total_length = 0;
for (int i = 0; i < encoding.size(); total_length += lengths[i++]) {
4 changes: 2 additions & 2 deletions src/ccstruct/ratngs.cpp
@@ -217,8 +217,8 @@ const char *ScriptPosToString(enum ScriptPos script_pos) {
WERD_CHOICE::WERD_CHOICE(const char *src_string,
const UNICHARSET &unicharset)
: unicharset_(&unicharset){
GenericVector<UNICHAR_ID> encoding;
GenericVector<char> lengths;
std::vector<UNICHAR_ID> encoding;
std::vector<char> lengths;
std::string cleaned = unicharset.CleanupString(src_string);
if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths,
nullptr)) {
4 changes: 2 additions & 2 deletions src/ccutil/ambigs.cpp
@@ -130,7 +130,7 @@ void UnicharAmbigs::LoadUnicharAmbigs(const UNICHARSET& encoder_set,
}
// Update ambigs_for_adaption_.
if (use_ambigs_for_adaption) {
GenericVector<UNICHAR_ID> encoding;
std::vector<UNICHAR_ID> encoding;
// Silently ignore invalid strings, as before, so it is safe to use a
// universal ambigs file.
if (unicharset->encode_string(replacement_string, true, &encoding,
@@ -235,7 +235,7 @@ bool UnicharAmbigs::ParseAmbiguityLine(
return false;
}
// Encode wrong-string.
GenericVector<UNICHAR_ID> unichars;
std::vector<UNICHAR_ID> unichars;
if (!unicharset.encode_string(fields[0].c_str(), true, &unichars, nullptr,
nullptr)) {
return false;
28 changes: 28 additions & 0 deletions src/ccutil/serialis.h
@@ -98,6 +98,34 @@ class TESS_API TFile {
return FWrite(data, sizeof(T), count) == count;
}

template <typename T>
bool Serialize(const std::vector<T>& data) {
auto size_used_ = data.size();
if (FWrite(&size_used_, sizeof(size_used_), 1) != 1) {
return false;
}
if (FWrite(data.data(), sizeof(T), size_used_) != size_used_) {
return false;
}
return true;
}

template <typename T>
bool DeSerialize(std::vector<T>& data) {
uint32_t reserved;
if (FReadEndian(&reserved, sizeof(reserved), 1) != 1) {
return false;
}
// Arbitrarily limit the number of elements to protect against bad data.
const uint32_t limit = 50000000;
//assert(reserved <= limit);
if (reserved > limit) {
return false;
}
data.reserve(reserved);
return FReadEndian(data.data(), sizeof(T), reserved) == reserved;
}

// Skip data.
bool Skip(size_t count);

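As a standalone illustration of the length-prefixed pattern that the new TFile::Serialize/DeSerialize templates follow, the sketch below writes and reads a std::vector with plain C stdio instead of TFile. The function names, the element limit, and the use of resize() are illustrative assumptions rather than part of this commit, and the sketch only applies to trivially copyable element types.

#include <cstdint>
#include <cstdio>
#include <vector>

// Write the element count first, then the raw elements (trivially copyable T only).
template <typename T>
bool WriteVector(FILE* fp, const std::vector<T>& data) {
  const uint32_t count = static_cast<uint32_t>(data.size());
  if (fwrite(&count, sizeof(count), 1, fp) != 1) return false;
  return fwrite(data.data(), sizeof(T), count, fp) == count;
}

// Read the count, sanity-check it, then read that many elements back.
template <typename T>
bool ReadVector(FILE* fp, std::vector<T>& data) {
  uint32_t count;
  if (fread(&count, sizeof(count), 1, fp) != 1) return false;
  const uint32_t kLimit = 50000000;  // reject absurd counts from corrupt input
  if (count > kLimit) return false;
  data.resize(count);  // size the buffer so data() is writable for 'count' elements
  return fread(data.data(), sizeof(T), count, fp) == count;
}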
44 changes: 22 additions & 22 deletions src/ccutil/unicharset.cpp
@@ -212,8 +212,8 @@ UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
// WARNING: this function now encodes the whole string for precision.
// Use encode_string in preference to repeatedly calling step.
int UNICHARSET::step(const char* str) const {
GenericVector<UNICHAR_ID> encoding;
GenericVector<char> lengths;
std::vector<UNICHAR_ID> encoding;
std::vector<char> lengths;
encode_string(str, true, &encoding, &lengths, nullptr);
if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
return lengths[0];
@@ -224,7 +224,7 @@ int UNICHARSET::step(const char* str) const {
// into the second (return) argument.
bool UNICHARSET::encodable_string(const char *str,
int *first_bad_position) const {
GenericVector<UNICHAR_ID> encoding;
std::vector<UNICHAR_ID> encoding;
return encode_string(str, true, &encoding, nullptr, first_bad_position);
}

@@ -238,13 +238,13 @@ bool UNICHARSET::encodable_string(const char *str,
// that do not belong in the unicharset, or encoding may fail.
// Use CleanupString to perform the cleaning.
bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure,
GenericVector<UNICHAR_ID>* encoding,
GenericVector<char>* lengths,
std::vector<UNICHAR_ID>* encoding,
std::vector<char>* lengths,
int* encoded_length) const {
GenericVector<UNICHAR_ID> working_encoding;
GenericVector<char> working_lengths;
GenericVector<char> best_lengths;
encoding->truncate(0); // Just in case str is empty.
std::vector<UNICHAR_ID> working_encoding;
std::vector<char> working_lengths;
std::vector<char> best_lengths;
encoding->resize(0); // Just in case str is empty.
int str_length = strlen(str);
int str_pos = 0;
bool perfect = true;
@@ -352,13 +352,13 @@ STRING UNICHARSET::debug_str(UNICHAR_ID id) const {
// Sets the normed_ids vector from the normed string. normed_ids is not
// stored in the file, and needs to be set when the UNICHARSET is loaded.
void UNICHARSET::set_normed_ids(UNICHAR_ID unichar_id) {
unichars[unichar_id].properties.normed_ids.truncate(0);
unichars[unichar_id].properties.normed_ids.resize(0);
if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
} else if (!encode_string(unichars[unichar_id].properties.normed.c_str(),
true, &unichars[unichar_id].properties.normed_ids,
nullptr, nullptr)) {
unichars[unichar_id].properties.normed_ids.truncate(0);
unichars[unichar_id].properties.normed_ids.resize(0);
unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
}
}
@@ -481,11 +481,11 @@ bool UNICHARSET::SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const {
// the overall process of encoding a partially failed string more efficient.
// See unicharset.h for definition of the args.
void UNICHARSET::encode_string(const char* str, int str_index, int str_length,
GenericVector<UNICHAR_ID>* encoding,
GenericVector<char>* lengths,
std::vector<UNICHAR_ID>* encoding,
std::vector<char>* lengths,
int* best_total_length,
GenericVector<UNICHAR_ID>* best_encoding,
GenericVector<char>* best_lengths) const {
std::vector<UNICHAR_ID>* best_encoding,
std::vector<char>* best_lengths) const {
if (str_index > *best_total_length) {
// This is the best result so far.
*best_total_length = str_index;
@@ -509,8 +509,8 @@ void UNICHARSET::encode_string(const char* str, int str_index, int str_length,
if (*best_total_length == str_length)
return; // Tail recursion success!
// Failed with that length, truncate back and try again.
encoding->truncate(encoding_index);
lengths->truncate(encoding_index);
encoding->resize(encoding_index);
lengths->resize(encoding_index);
}
int step = UNICHAR::utf8_step(str + str_index + length);
if (step == 0) step = 1;
@@ -528,7 +528,7 @@ bool UNICHARSET::GetStrProperties(const char* utf8_str,
props->Init();
props->SetRangesEmpty();
int total_unicodes = 0;
GenericVector<UNICHAR_ID> encoding;
std::vector<UNICHAR_ID> encoding;
if (!encode_string(utf8_str, true, &encoding, nullptr, nullptr))
return false; // Some part was invalid.
for (int i = 0; i < encoding.size(); ++i) {
@@ -611,7 +611,7 @@ void UNICHARSET::unichar_insert(const char* const unichar_repr,
old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
const char* str = cleaned.c_str();
GenericVector<int> encoding;
std::vector<int> encoding;
if (!old_style_included_ &&
encode_string(str, true, &encoding, nullptr, nullptr))
return;
@@ -950,7 +950,7 @@ void UNICHARSET::set_black_and_whitelist(const char* blacklist,
unichars[ch].properties.enabled = def_enabled;
if (!def_enabled) {
// Enable the whitelist.
GenericVector<UNICHAR_ID> encoding;
std::vector<UNICHAR_ID> encoding;
encode_string(whitelist, false, &encoding, nullptr, nullptr);
for (int i = 0; i < encoding.size(); ++i) {
if (encoding[i] != INVALID_UNICHAR_ID)
@@ -959,7 +959,7 @@
}
if (blacklist != nullptr && blacklist[0] != '\0') {
// Disable the blacklist.
GenericVector<UNICHAR_ID> encoding;
std::vector<UNICHAR_ID> encoding;
encode_string(blacklist, false, &encoding, nullptr, nullptr);
for (int i = 0; i < encoding.size(); ++i) {
if (encoding[i] != INVALID_UNICHAR_ID)
@@ -968,7 +968,7 @@ void UNICHARSET::set_black_and_whitelist(const char* blacklist,
}
if (unblacklist != nullptr && unblacklist[0] != '\0') {
// Re-enable the unblacklist.
GenericVector<UNICHAR_ID> encoding;
std::vector<UNICHAR_ID> encoding;
encode_string(unblacklist, false, &encoding, nullptr, nullptr);
for (int i = 0; i < encoding.size(); ++i) {
if (encoding[i] != INVALID_UNICHAR_ID)
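The one behavioral point worth noting in this file is the roll-back step in the recursive encoder, where GenericVector::truncate(n) becomes std::vector::resize(n); for n no larger than the current size, both simply drop the trailing elements. A minimal sketch of that pattern follows; the function and parameter names are illustrative, not from the commit.

#include <vector>

// Remember the current length, try an extension, and resize() back on failure,
// mirroring the encoding->resize(encoding_index) calls above.
bool ExtendPath(std::vector<int>& path, int candidate,
                bool (*search)(const std::vector<int>&)) {
  const size_t mark = path.size();  // position to roll back to
  path.push_back(candidate);
  if (search(path)) return true;    // extension worked, keep it
  path.resize(mark);                // equivalent of GenericVector::truncate(mark)
  return false;
}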
18 changes: 9 additions & 9 deletions src/ccutil/unicharset.h
@@ -227,8 +227,8 @@ class TESS_API UNICHARSET {
// that do not belong in the unicharset, or encoding may fail.
// Use CleanupString to perform the cleaning.
bool encode_string(const char* str, bool give_up_on_failure,
GenericVector<UNICHAR_ID>* encoding,
GenericVector<char>* lengths,
std::vector<UNICHAR_ID>* encoding,
std::vector<char>* lengths,
int* encoded_length) const;

// Return the unichar representation corresponding to the given UNICHAR_ID
@@ -467,7 +467,7 @@ class TESS_API UNICHARSET {
// Record normalized version of unichar with the given unichar_id.
void set_normed(UNICHAR_ID unichar_id, const char* normed) {
unichars[unichar_id].properties.normed = normed;
unichars[unichar_id].properties.normed_ids.truncate(0);
unichars[unichar_id].properties.normed_ids.resize(0);
}
// Sets the normed_ids vector from the normed string. normed_ids is not
// stored in the file, and needs to be set when the UNICHARSET is loaded.
@@ -818,7 +818,7 @@ class TESS_API UNICHARSET {
// Returns a vector of UNICHAR_IDs that represent the ids of the normalized
// version of the given id. There may be more than one UNICHAR_ID in the
// vector if unichar_id represents a ligature.
const GenericVector<UNICHAR_ID>& normed_ids(UNICHAR_ID unichar_id) const {
const std::vector<UNICHAR_ID>& normed_ids(UNICHAR_ID unichar_id) const {
return unichars[unichar_id].properties.normed_ids;
}

@@ -946,7 +946,7 @@ class TESS_API UNICHARSET {
// A string of unichar_ids that represent the corresponding normed string.
// For awkward characters like em-dash, this gives hyphen.
// For ligatures, this gives the string of normal unichars.
GenericVector<UNICHAR_ID> normed_ids;
std::vector<UNICHAR_ID> normed_ids;
STRING normed; // normalized version of this unichar
// Contains meta information about the fragment if a unichar represents
// a fragment of a character, otherwise should be set to nullptr.
@@ -972,11 +972,11 @@
// best_encoding contains the encoding that used the longest part of str.
// best_lengths (may be null) contains the lengths of best_encoding.
void encode_string(const char* str, int str_index, int str_length,
GenericVector<UNICHAR_ID>* encoding,
GenericVector<char>* lengths,
std::vector<UNICHAR_ID>* encoding,
std::vector<char>* lengths,
int* best_total_length,
GenericVector<UNICHAR_ID>* best_encoding,
GenericVector<char>* best_lengths) const;
std::vector<UNICHAR_ID>* best_encoding,
std::vector<char>* best_lengths) const;

// Gets the properties for a grapheme string, combining properties for
// multiple characters in a meaningful way where possible.
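A hypothetical caller of the public encode_string overload with its new std::vector parameters might look like the sketch below; CountUnichars and its error handling are illustrative, and only the encode_string signature comes from this header.

#include <vector>
#include "unicharset.h"

// Count how many unichars the given UTF-8 string encodes to.
bool CountUnichars(const UNICHARSET& unicharset, const char* utf8, int* n) {
  std::vector<UNICHAR_ID> encoding;
  std::vector<char> lengths;  // byte length of each unichar within 'utf8'
  if (!unicharset.encode_string(utf8, /*give_up_on_failure=*/true,
                                &encoding, &lengths, nullptr)) {
    return false;  // some part of the string is not encodable
  }
  *n = static_cast<int>(encoding.size());
  return true;
}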
10 changes: 5 additions & 5 deletions src/dict/dict.cpp
@@ -824,24 +824,24 @@ bool Dict::valid_bigram(const WERD_CHOICE& word1,
if (w2start >= w2end) return word2.length() < 3;

const UNICHARSET& uchset = getUnicharset();
GenericVector<UNICHAR_ID> bigram_string;
std::vector<UNICHAR_ID> bigram_string;
bigram_string.reserve(w1end + w2end + 1);
for (int i = w1start; i < w1end; i++) {
const GenericVector<UNICHAR_ID>& normed_ids =
const auto &normed_ids =
getUnicharset().normed_ids(word1.unichar_id(i));
if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
bigram_string.push_back(question_unichar_id_);
else
bigram_string += normed_ids;
bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
}
bigram_string.push_back(UNICHAR_SPACE);
for (int i = w2start; i < w2end; i++) {
const GenericVector<UNICHAR_ID>& normed_ids =
const auto &normed_ids =
getUnicharset().normed_ids(word2.unichar_id(i));
if (normed_ids.size() == 1 && uchset.get_isdigit(normed_ids[0]))
bigram_string.push_back(question_unichar_id_);
else
bigram_string += normed_ids;
bigram_string.insert(bigram_string.end(), normed_ids.begin(), normed_ids.end());
}
WERD_CHOICE normalized_word(&uchset, bigram_string.size());
for (int i = 0; i < bigram_string.size(); ++i) {
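std::vector has no operator+= for concatenation, which is why the hunk above replaces bigram_string += normed_ids with an insert() over the source's full range. A tiny helper capturing the same operation is sketched below; the name Append is illustrative and assumes dst and src are distinct vectors.

#include <vector>

// Append all elements of 'src' to the end of 'dst', like GenericVector's +=.
template <typename T>
void Append(std::vector<T>& dst, const std::vector<T>& src) {
  dst.insert(dst.end(), src.begin(), src.end());
}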
6 changes: 3 additions & 3 deletions src/dict/dict.h
@@ -116,7 +116,7 @@ class TESS_API Dict {
inline bool compound_marker(UNICHAR_ID unichar_id) {
const UNICHARSET& unicharset = getUnicharset();
ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
const GenericVector<UNICHAR_ID>& normed_ids =
const auto &normed_ids =
unicharset.normed_ids(unichar_id);
return normed_ids.size() == 1 &&
(normed_ids[0] == hyphen_unichar_id_ ||
@@ -127,7 +127,7 @@ class TESS_API Dict {
inline bool is_apostrophe(UNICHAR_ID unichar_id) {
const UNICHARSET& unicharset = getUnicharset();
ASSERT_HOST(unicharset.contains_unichar_id(unichar_id));
const GenericVector<UNICHAR_ID>& normed_ids =
const auto &normed_ids =
unicharset.normed_ids(unichar_id);
return normed_ids.size() == 1 && normed_ids[0] == apostrophe_unichar_id_;
}
@@ -157,7 +157,7 @@
if (!last_word_on_line_ || first_pos)
return false;
ASSERT_HOST(unicharset->contains_unichar_id(unichar_id));
const GenericVector<UNICHAR_ID>& normed_ids =
const auto &normed_ids =
unicharset->normed_ids(unichar_id);
return normed_ids.size() == 1 && normed_ids[0] == hyphen_unichar_id_;
}
2 changes: 1 addition & 1 deletion src/dict/permdawg.cpp
@@ -62,7 +62,7 @@ void Dict::go_deeper_dawg_fxn(
}
int num_unigrams = 0;
word->remove_last_unichar_id();
GenericVector<UNICHAR_ID> encoding;
std::vector<UNICHAR_ID> encoding;
const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
// Since the string came out of the unicharset, failure is impossible.
ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr,