71 static const int kSaneNumConcreteChars = 0;
75 static const char kAlphaPatternUnicode[];
76 static const char kDigitPatternUnicode[];
77 static const char kAlphanumPatternUnicode[];
78 static const char kPuncPatternUnicode[];
79 static const char kLowerPatternUnicode[];
80 static const char kUpperPatternUnicode[];
82 static const char *get_reverse_policy_name(
90 int unicharset_size,
int debug_level)
91 :
Dawg(type, lang, perm, debug_level) {
92 init(unicharset_size);
94 deref_node_index_mask_ = ~letter_mask_;
96 initialized_patterns_ =
false;
98 virtual ~Trie() { nodes_.delete_data_pointers(); }
105 bool word_end)
const {
108 if (!edge_char_of(node_ref, NO_EDGE,
FORWARD_EDGE, word_end, unichar_id,
109 &edge_ptr, &edge_index))
return NO_EDGE;
110 return make_edge_ref(node_ref, edge_index);
118 bool word_end)
const {
120 nodes_[
static_cast<int>(node)]->forward_edges;
121 for (
int i = 0; i < forward_edges.
size(); ++i) {
122 if (!word_end || end_of_word_from_edge_rec(forward_edges[i])) {
124 make_edge_ref(node, i)));
134 if (edge_ref == NO_EDGE || num_edges_ == 0)
return NO_EDGE;
135 return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
143 if (edge_ref == NO_EDGE || num_edges_ == 0)
return false;
144 return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
149 if (edge_ref == NO_EDGE || num_edges_ == 0)
return INVALID_UNICHAR_ID;
150 return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
155 *edge_rec &= ~letter_mask_;
159 return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;
164 void print_node(
NODE_REF node,
int max_num_edges)
const;
176 bool read_and_add_word_list(
const char *
filename,
183 bool read_word_list(
const char *filename,
234 bool read_pattern_list(
const char *filename,
const UNICHARSET &unicharset);
238 void initialize_patterns(
UNICHARSET *unicharset);
242 void unichar_id_to_patterns(
UNICHAR_ID unichar_id,
251 bool word_end)
const {
252 if (edge_ref == NO_EDGE)
return NO_EDGE;
254 return (marker_flag_from_edge_rec(*edge_rec) &&
255 unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
256 word_end == end_of_word_from_edge_rec(*edge_rec)) ?
272 return add_word_to_dawg(word, NULL);
294 int edge_index =
static_cast<int>(
296 int node_index =
static_cast<int>(
297 (edge_ref & deref_node_index_mask_) >> flag_start_bit_);
304 return ((node_index << flag_start_bit_) |
314 *edge = ((nxt << next_node_start_bit_) |
315 (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
321 marker_flag_from_edge_rec(edge_rec) ?
"R," :
"",
322 (direction_from_edge_rec(edge_rec) ==
FORWARD_EDGE) ?
"F" :
"B",
323 end_of_word_from_edge_rec(edge_rec) ?
",E" :
"",
324 unichar_id_from_edge_rec(edge_rec));
329 NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
330 return (node_ref != NO_EDGE &&
337 tprintf(
"\n__________________________\n%s\n", msg);
338 for (
int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges);
339 tprintf(
"__________________________\n");
353 int direction,
bool word_end,
359 bool repeats,
bool word_end,
UNICHAR_ID unichar_id) {
360 return (add_edge_linkage(node1, node2, repeats,
FORWARD_EDGE,
361 word_end, unichar_id) &&
363 word_end, unichar_id));
385 remove_edge_linkage(node1, node2,
FORWARD_EDGE, word_end, unichar_id);
386 remove_edge_linkage(node2, node1,
BACKWARD_EDGE, word_end, unichar_id);
400 bool reduce_lettered_edges(
EDGE_INDEX edge_index,
418 UNICHAR_ID character_class_to_pattern(
char ch);
uinT64 deref_node_index_mask_
bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
EDGE_VECTOR backward_edges
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
bool end_of_word(EDGE_REF edge_ref) const
uinT64 deref_direction_mask_
int direction(EDGEPT *point)
UNICHAR_ID lower_pattern_
NODE_REF next_node(EDGE_REF edge_ref) const
UNICHAR_ID upper_pattern_
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
bool add_word_to_dawg(const WERD_CHOICE &word)
bool initialized_patterns_
void print_edge_rec(const EDGE_RECORD &edge_rec) const
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const
EDGE_VECTOR forward_edges
UNICHAR_ID alphanum_pattern_
Trie(DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
void print_all(const char *msg, int max_num_edges)
void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
UNICHAR_ID digit_pattern_
UNICHAR_ID alpha_pattern_
void KillEdge(EDGE_RECORD *edge_rec) const
EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const
bool DeadEdge(const EDGE_RECORD &edge_rec) const
GenericVector< EDGE_INDEX > root_back_freelist_
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
GenericVector< EDGE_RECORD > EDGE_VECTOR
GenericVector< TRIE_NODE_RECORD * > TRIE_NODES
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const