Skip to content

Commit

Permalink
Replace remaining GenericVector by std::vector for src/dict
Browse files (browse the repository at this point in the history)
Signed-off-by: Stefan Weil <sw@weilnetz.de>
  • Loading branch information
stweil committed Mar 16, 2021
1 parent 17eee86 commit bf42f83
Show file tree
Hide file tree
Showing 10 changed files with 67 additions and 63 deletions.
1 change: 1 addition & 0 deletions src/ccstruct/params_training_featdef.h
Original file line number Diff line number Diff line change
Expand Up @@ -19,6 +19,7 @@
#ifndef TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_
#define TESSERACT_WORDREC_PARAMS_TRAINING_FEATDEF_H_

#include <cstring> // for memset
#include <string>
#include <vector>

Expand Down
15 changes: 8 additions & 7 deletions src/dict/dawg.h
Original file line number Diff line number Diff line change
Expand Up @@ -57,9 +57,9 @@ struct NodeChild {
NodeChild() : unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
};

using NodeChildVector = GenericVector<NodeChild>;
using SuccessorList = GenericVector<int>;
using SuccessorListsVector = GenericVector<SuccessorList *>;
using NodeChildVector = std::vector<NodeChild>;
using SuccessorList = std::vector<int>;
using SuccessorListsVector = std::vector<SuccessorList *>;

enum DawgType {
DAWG_TYPE_PUNCTUATION,
Expand Down Expand Up @@ -176,7 +176,7 @@ class TESS_API Dawg {
/// Fills vec with unichar ids that represent the character classes
/// of the given unichar_id.
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
GenericVector<UNICHAR_ID> *vec) const {
std::vector<UNICHAR_ID> *vec) const {
(void)unichar_id;
(void)unicharset;
(void)vec;
Expand Down Expand Up @@ -355,15 +355,16 @@ struct DawgPosition {
bool back_to_punc = false;
};

class DawgPositionVector : public GenericVector<DawgPosition> {
class DawgPositionVector : public std::vector<DawgPosition> {
public:
/// Adds an entry for the given dawg_index with the given node to the vec.
/// Returns false if the same entry already exists in the vector,
/// true otherwise.
inline bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg) {
for (int i = 0; i < size(); ++i) {
if (data_[i] == new_pos)
for (auto position : *this) {
if (position == new_pos) {
return false;
}
}
push_back(new_pos);
if (debug) {
Expand Down
40 changes: 21 additions & 19 deletions src/dict/dict.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -201,19 +201,19 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
punc_dawg_ =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_PUNC_DAWG, dawg_debug_level, data_file);
if (punc_dawg_)
dawgs_ += punc_dawg_;
dawgs_.push_back(punc_dawg_);
}
if (load_system_dawg) {
Dawg *system_dawg =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_SYSTEM_DAWG, dawg_debug_level, data_file);
if (system_dawg)
dawgs_ += system_dawg;
dawgs_.push_back(system_dawg);
}
if (load_number_dawg) {
Dawg *number_dawg =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_NUMBER_DAWG, dawg_debug_level, data_file);
if (number_dawg)
dawgs_ += number_dawg;
dawgs_.push_back(number_dawg);
}
if (load_bigram_dawg) {
bigram_dawg_ =
Expand All @@ -225,13 +225,13 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
freq_dawg_ =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_FREQ_DAWG, dawg_debug_level, data_file);
if (freq_dawg_)
dawgs_ += freq_dawg_;
dawgs_.push_back(freq_dawg_);
}
if (load_unambig_dawg) {
unambig_dawg_ =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_UNAMBIG_DAWG, dawg_debug_level, data_file);
if (unambig_dawg_)
dawgs_ += unambig_dawg_;
dawgs_.push_back(unambig_dawg_);
}

std::string name;
Expand All @@ -249,7 +249,7 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
tprintf("Error: failed to load %s\n", name.c_str());
delete trie_ptr;
} else {
dawgs_ += trie_ptr;
dawgs_.push_back(trie_ptr);
}
}

Expand All @@ -267,13 +267,13 @@ void Dict::Load(const std::string &lang, TessdataManager *data_file) {
tprintf("Error: failed to load %s\n", name.c_str());
delete trie_ptr;
} else {
dawgs_ += trie_ptr;
dawgs_.push_back(trie_ptr);
}
}

document_words_ =
new Trie(DAWG_TYPE_WORD, lang, DOC_DAWG_PERM, getUnicharset().size(), dawg_debug_level);
dawgs_ += document_words_;
dawgs_.push_back(document_words_);

// This dawg is temporary and should not be searched by letter_is_ok.
pending_words_ =
Expand All @@ -287,19 +287,19 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
punc_dawg_ =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_PUNC_DAWG, dawg_debug_level, data_file);
if (punc_dawg_)
dawgs_ += punc_dawg_;
dawgs_.push_back(punc_dawg_);
}
if (load_system_dawg) {
Dawg *system_dawg =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_SYSTEM_DAWG, dawg_debug_level, data_file);
if (system_dawg)
dawgs_ += system_dawg;
dawgs_.push_back(system_dawg);
}
if (load_number_dawg) {
Dawg *number_dawg =
dawg_cache_->GetSquishedDawg(lang, TESSDATA_LSTM_NUMBER_DAWG, dawg_debug_level, data_file);
if (number_dawg)
dawgs_ += number_dawg;
dawgs_.push_back(number_dawg);
}

// stolen from Dict::Load (but needs params_ from Tesseract
Expand All @@ -319,7 +319,7 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
tprintf("Error: failed to load %s\n", name.c_str());
delete trie_ptr;
} else {
dawgs_ += trie_ptr;
dawgs_.push_back(trie_ptr);
}
}

Expand All @@ -337,7 +337,7 @@ void Dict::LoadLSTM(const std::string &lang, TessdataManager *data_file) {
tprintf("Error: failed to load %s\n", name.c_str());
delete trie_ptr;
} else {
dawgs_ += trie_ptr;
dawgs_.push_back(trie_ptr);
}
}
}
Expand All @@ -358,9 +358,9 @@ bool Dict::FinishLoad() {
const Dawg *other = dawgs_[j];
if (dawg != nullptr && other != nullptr && (dawg->lang() == other->lang()) &&
kDawgSuccessors[dawg->type()][other->type()])
*lst += j;
lst->push_back(j);
}
successors_ += lst;
successors_.push_back(lst);
}
return true;
}
Expand All @@ -378,7 +378,9 @@ void Dict::End() {
delete dawg_cache_;
dawg_cache_ = nullptr;
}
successors_.delete_data_pointers();
for (auto successor : successors_) {
delete successor;
}
dawgs_.clear();
successors_.clear();
document_words_ = nullptr;
Expand Down Expand Up @@ -550,7 +552,7 @@ void Dict::ProcessPatternEdges(const Dawg *dawg, const DawgPosition &pos, UNICHA
NODE_REF node = GetStartingNode(dawg, pos.dawg_ref);
// Try to find the edge corresponding to the exact unichar_id and to all the
// edges corresponding to the character class of unichar_id.
GenericVector<UNICHAR_ID> unichar_id_patterns;
std::vector<UNICHAR_ID> unichar_id_patterns;
unichar_id_patterns.push_back(unichar_id);
dawg->unichar_id_to_patterns(unichar_id, getUnicharset(), &unichar_id_patterns);
for (int i = 0; i < unichar_id_patterns.size(); ++i) {
Expand Down Expand Up @@ -605,12 +607,12 @@ void Dict::default_dawgs(DawgPositionVector *dawg_pos_vec, bool suppress_pattern
int dawg_ty = dawgs_[i]->type();
bool subsumed_by_punc = kDawgSuccessors[DAWG_TYPE_PUNCTUATION][dawg_ty];
if (dawg_ty == DAWG_TYPE_PUNCTUATION) {
*dawg_pos_vec += DawgPosition(-1, NO_EDGE, i, NO_EDGE, false);
dawg_pos_vec->push_back(DawgPosition(-1, NO_EDGE, i, NO_EDGE, false));
if (dawg_debug_level >= 3) {
tprintf("Adding beginning punc dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
}
} else if (!punc_dawg_available || !subsumed_by_punc) {
*dawg_pos_vec += DawgPosition(i, NO_EDGE, -1, NO_EDGE, false);
dawg_pos_vec->push_back(DawgPosition(i, NO_EDGE, -1, NO_EDGE, false));
if (dawg_debug_level >= 3) {
tprintf("Adding beginning dawg [%d, " REFFORMAT "]\n", i, NO_EDGE);
}
Expand Down
4 changes: 2 additions & 2 deletions src/dict/dict.h
Original file line number Diff line number Diff line change
Expand Up @@ -54,7 +54,7 @@ struct CHAR_FRAGMENT_INFO {
float certainty;
};

using DawgVector = GenericVector<Dawg *>;
using DawgVector = std::vector<Dawg *>;

//
// Constants
Expand Down Expand Up @@ -495,7 +495,7 @@ class TESS_API Dict {
// matching. The first member of each list is taken as canonical. For
// example, the first list contains hyphens and dashes with the first symbol
// being the ASCII hyphen minus.
std::vector<GenericVector<UNICHAR_ID>> equivalent_symbols_;
std::vector<std::vector<UNICHAR_ID>> equivalent_symbols_;
// Dawg Cache reference - this is who we ask to allocate/deallocate dawgs.
DawgCache *dawg_cache_;
bool dawg_cache_is_ours_; // we should delete our own dawg_cache_
Expand Down
4 changes: 1 addition & 3 deletions src/dict/stopper.h
Original file line number Diff line number Diff line change
Expand Up @@ -2,7 +2,6 @@
** Filename: stopper.h
** Purpose: Stopping criteria for word classifier.
** Author: Dan Johnson
** History: Wed May 1 09:42:57 1991, DSJ, Created.
**
** (c) Copyright Hewlett-Packard Company, 1988.
** Licensed under the Apache License, Version 2.0 (the "License");
Expand All @@ -22,7 +21,6 @@
#include "ratngs.h"

#include <tesseract/unichar.h>
#include "genericvector.h"

namespace tesseract {

Expand All @@ -46,7 +44,7 @@ struct DANGERR_INFO {
UNICHAR_ID leftmost; // in the replacement, what's the leftmost character?
};

using DANGERR = GenericVector<DANGERR_INFO>;
using DANGERR = std::vector<DANGERR_INFO>;

} // namespace tesseract

Expand Down
26 changes: 14 additions & 12 deletions src/dict/trie.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -24,7 +24,6 @@

#include "dawg.h"
#include "dict.h"
#include "genericvector.h"
#include "helpers.h"
#include "kdpair.h"

Expand All @@ -49,7 +48,9 @@ const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {

// Reset the Trie to empty.
void Trie::clear() {
nodes_.delete_data_pointers();
for (auto node : nodes_) {
delete node;
}
nodes_.clear();
root_back_freelist_.clear();
num_edges_ = 0;
Expand Down Expand Up @@ -122,10 +123,11 @@ bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag, in
EDGE_RECORD edge_rec;
link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
if (node1 == 0 && direction == BACKWARD_EDGE && !root_back_freelist_.empty()) {
EDGE_INDEX edge_index = root_back_freelist_.pop_back();
EDGE_INDEX edge_index = root_back_freelist_.back();
root_back_freelist_.pop_back();
(*vec)[edge_index] = edge_rec;
} else if (search_index < vec->size()) {
vec->insert(edge_rec, search_index);
vec->insert(vec->begin() + search_index, edge_rec);
} else {
vec->push_back(edge_rec);
}
Expand Down Expand Up @@ -153,7 +155,7 @@ void Trie::add_word_ending(EDGE_RECORD *edge_ptr, NODE_REF the_next_node, bool m
*edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
}

bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const GenericVector<bool> *repetitions) {
bool Trie::add_word_to_dawg(const WERD_CHOICE &word, const std::vector<bool> *repetitions) {
if (word.length() <= 0)
return false; // can't add empty words
if (repetitions != nullptr)
Expand Down Expand Up @@ -330,7 +332,7 @@ void Trie::initialize_patterns(UNICHARSET *unicharset) {
}

void Trie::unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset,
GenericVector<UNICHAR_ID> *vec) const {
std::vector<UNICHAR_ID> *vec) const {
bool is_alpha = unicharset.get_isalpha(unichar_id);
if (is_alpha) {
vec->push_back(alpha_pattern_);
Expand Down Expand Up @@ -388,7 +390,7 @@ bool Trie::read_pattern_list(const char *filename, const UNICHARSET &unicharset)
// Parse the pattern and construct a unichar id vector.
// Record the number of repetitions of each unichar in the parallel vector.
WERD_CHOICE word(&unicharset);
GenericVector<bool> repetitions_vec;
std::vector<bool> repetitions_vec;
const char *str_ptr = string;
int step = unicharset.step(str_ptr);
bool failed = false;
Expand Down Expand Up @@ -462,12 +464,12 @@ void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bo
tprintf("\n");
}
if (direction == FORWARD_EDGE) {
nodes_[node1]->forward_edges.remove(edge_index);
nodes_[node1]->forward_edges.erase(nodes_[node1]->forward_edges.begin() + edge_index);
} else if (node1 == 0) {
KillEdge(&nodes_[node1]->backward_edges[edge_index]);
root_back_freelist_.push_back(edge_index);
} else {
nodes_[node1]->backward_edges.remove(edge_index);
nodes_[node1]->backward_edges.erase(nodes_[node1]->backward_edges.begin() + edge_index);
}
--num_edges_;
}
Expand All @@ -476,7 +478,7 @@ void Trie::remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bo
// 1 Avoid insertion sorting or bubble sorting the tail root node
// (back links on node 0, a list of all the leaves.). The node is
// huge, and sorting it with n^2 time is terrible.
// 2 Avoid using GenericVector::remove on the tail root node.
// 2 Avoid using vector::erase on the tail root node.
// (a) During add of words to the trie, zero-out the unichars and
// keep a freelist of spaces to re-use.
// (b) During reduction, just zero-out the unichars of deleted back
Expand Down Expand Up @@ -624,13 +626,13 @@ void Trie::sort_edges(EDGE_VECTOR *edges) {
int num_edges = edges->size();
if (num_edges <= 1)
return;
GenericVector<KDPairInc<UNICHAR_ID, EDGE_RECORD>> sort_vec;
std::vector<KDPairInc<UNICHAR_ID, EDGE_RECORD>> sort_vec;
sort_vec.reserve(num_edges);
for (int i = 0; i < num_edges; ++i) {
sort_vec.push_back(
KDPairInc<UNICHAR_ID, EDGE_RECORD>(unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]));
}
sort_vec.sort();
std::sort(sort_vec.begin(), sort_vec.end());
for (int i = 0; i < num_edges; ++i)
(*edges)[i] = sort_vec[i].data();
}
Expand Down
Loading

0 comments on commit bf42f83

Please sign in to comment.