tesseract  4.00.00dev
trie.h
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: trie.h (Formerly trie.h)
5  * Description: Functions to build a trie data structure.
6  * Author: Mark Seaman, SW Productivity
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Fri Jul 26 11:26:34 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 #ifndef TRIE_H
26 #define TRIE_H
27 
28 #include "dawg.h"
29 #include "cutil.h"
30 #include "genericvector.h"
31 
32 class UNICHARSET;
33 
34 // Note: if we consider either NODE_REF or EDGE_INDEX to ever exceed
35 // max int32, we will need to change GenericVector to use int64 for size
36 // and address indices. This does not seem to be needed immediately,
37 // since currently the largest number of edges limit used by tesseract
38 // (kMaxNumEdges in wordlist2dawg.cpp) is far less than max int32.
39 // There are also int casts below to satisfy the WIN32 compiler that would
40 // need to be changed.
41 // It might be cleanest to change the types of most of the Trie/Dawg related
42 // typedefs to int and restrict the casts to extracting these values from
43 // the 64 bit EDGE_RECORD.
44 typedef inT64 EDGE_INDEX; // index of an edge in a given node
45 typedef bool *NODE_MARKER;
47 
51 };
53 
54 namespace tesseract {
55 
62 class Trie : public Dawg {
63  public:
68  };
69 
70  // Minimum number of concrete characters at the beginning of user patterns.
71  static const int kSaneNumConcreteChars = 0;
72  // Various unicode whitespace characters are used to denote unichar patterns,
73  // (character classifier would never produce these whitespace characters as a
74  // valid classification).
75  static const char kAlphaPatternUnicode[];
76  static const char kDigitPatternUnicode[];
77  static const char kAlphanumPatternUnicode[];
78  static const char kPuncPatternUnicode[];
79  static const char kLowerPatternUnicode[];
80  static const char kUpperPatternUnicode[];
81 
82  static const char *get_reverse_policy_name(
83  RTLReversePolicy reverse_policy);
84 
85  // max_num_edges argument allows limiting the amount of memory this
86  // Trie can consume (if a new word insert would cause the Trie to
87  // contain more edges than max_num_edges, all the edges are cleared
88  // so that new inserts can proceed).
89  Trie(DawgType type, const STRING &lang, PermuterType perm,
90  int unicharset_size, int debug_level)
91  : Dawg(type, lang, perm, debug_level) {
92  init(unicharset_size);
93  num_edges_ = 0;
94  deref_node_index_mask_ = ~letter_mask_;
95  new_dawg_node(); // need to allocate node 0
96  initialized_patterns_ = false;
97  }
98  virtual ~Trie() { nodes_.delete_data_pointers(); }
99 
100  // Reset the Trie to empty.
101  void clear();
102 
104  EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id,
105  bool word_end) const {
106  EDGE_RECORD *edge_ptr;
107  EDGE_INDEX edge_index;
108  if (!edge_char_of(node_ref, NO_EDGE, FORWARD_EDGE, word_end, unichar_id,
109  &edge_ptr, &edge_index)) return NO_EDGE;
110  return make_edge_ref(node_ref, edge_index);
111  }
112 
118  bool word_end) const {
119  const EDGE_VECTOR &forward_edges =
120  nodes_[static_cast<int>(node)]->forward_edges;
121  for (int i = 0; i < forward_edges.size(); ++i) {
122  if (!word_end || end_of_word_from_edge_rec(forward_edges[i])) {
123  vec->push_back(NodeChild(unichar_id_from_edge_rec(forward_edges[i]),
124  make_edge_ref(node, i)));
125  }
126  }
127  }
128 
133  NODE_REF next_node(EDGE_REF edge_ref) const {
134  if (edge_ref == NO_EDGE || num_edges_ == 0) return NO_EDGE;
135  return next_node_from_edge_rec(*deref_edge_ref(edge_ref));
136  }
137 
142  bool end_of_word(EDGE_REF edge_ref) const {
143  if (edge_ref == NO_EDGE || num_edges_ == 0) return false;
144  return end_of_word_from_edge_rec(*deref_edge_ref(edge_ref));
145  }
146 
148  UNICHAR_ID edge_letter(EDGE_REF edge_ref) const {
149  if (edge_ref == NO_EDGE || num_edges_ == 0) return INVALID_UNICHAR_ID;
150  return unichar_id_from_edge_rec(*deref_edge_ref(edge_ref));
151  }
152  // Sets the UNICHAR_ID in the given edge_rec to unicharset_size_, marking
153  // the edge dead.
154  void KillEdge(EDGE_RECORD* edge_rec) const {
155  *edge_rec &= ~letter_mask_;
156  *edge_rec |= (unicharset_size_ << LETTER_START_BIT);
157  }
158  bool DeadEdge(const EDGE_RECORD& edge_rec) const {
159  return unichar_id_from_edge_rec(edge_rec) == unicharset_size_;
160  }
161 
162  // Prints the contents of the node indicated by the given NODE_REF.
163  // At most max_num_edges will be printed.
164  void print_node(NODE_REF node, int max_num_edges) const;
165 
166  // Writes edges from nodes_ to an EDGE_ARRAY and creates a SquishedDawg.
167  // Eliminates redundant edges and returns the pointer to the SquishedDawg.
168  // Note: the caller is responsible for deallocating memory associated
169  // with the returned SquishedDawg pointer.
170  SquishedDawg *trie_to_dawg();
171 
172  // Reads a list of words from the given file and adds into the Trie.
173  // Calls WERD_CHOICE::reverse_unichar_ids_if_rtl() according to the reverse
174  // policy and information in the unicharset.
175  // Returns false on error.
176  bool read_and_add_word_list(const char *filename,
177  const UNICHARSET &unicharset,
179 
180  // Reads a list of words from the given file, applying the reverse_policy,
181  // according to information in the unicharset.
182  // Returns false on error.
183  bool read_word_list(const char *filename,
184  const UNICHARSET &unicharset,
185  Trie::RTLReversePolicy reverse_policy,
186  GenericVector<STRING>* words);
187  // Adds a list of words previously read using read_word_list to the trie
188  // using the given unicharset to convert to unichar-ids.
189  // Returns false on error.
190  bool add_word_list(const GenericVector<STRING>& words,
191  const UNICHARSET &unicharset);
192 
193  // Inserts the list of patterns from the given file into the Trie.
194  // The pattern list file should contain one pattern per line in UTF-8 format.
195  //
196  // Each pattern can contain any non-whitespace characters, however only the
197  // patterns that contain characters from the unicharset of the corresponding
198  // language will be useful.
199  // The only meta character is '\'. To be used in a pattern as an ordinary
200  // string it should be escaped with '\' (e.g. string "C:\Documents" should
201  // be written in the patterns file as "C:\\Documents").
202  // This function supports a very limited regular expression syntax. One can
203  // express a character, a certain character class and a number of times the
204  // entity should be repeated in the pattern.
205  //
206  // To denote a character class use one of:
207  // \c - unichar for which UNICHARSET::get_isalpha() is true (character)
208  // \d - unichar for which UNICHARSET::get_isdigit() is true
209  // \n - unichar for which UNICHARSET::get_isdigit() and
210  // UNICHARSET::isalpha() are true
211  // \p - unichar for which UNICHARSET::get_ispunct() is true
212  // \a - unichar for which UNICHARSET::get_islower() is true
213  // \A - unichar for which UNICHARSET::get_isupper() is true
214  //
215  // \* could be specified after each character or pattern to indicate that
216  // the character/pattern can be repeated any number of times before the next
217  // character/pattern occurs.
218  //
219  // Examples:
220  // 1-8\d\d-GOOG-411 will be expanded to strings:
221  // 1-800-GOOG-411, 1-801-GOOG-411, ... 1-899-GOOG-411.
222  //
223  // http://www.\n\*.com will be expanded to strings like:
224  // http://www.a.com http://www.a123.com ... http://www.ABCDefgHIJKLMNop.com
225  //
226  // Note: In choosing which patterns to include please be aware of the fact
227  // providing very generic patterns will make tesseract run slower.
228  // For example \n\* at the beginning of the pattern will make Tesseract
229  // consider all the combinations of proposed character choices for each
230  // of the segmentations, which will be unacceptably slow.
231  // Because of potential problems with speed that could be difficult to
232  // identify, each user pattern has to have at least kSaneNumConcreteChars
233  // concrete characters from the unicharset at the beginning.
234  bool read_pattern_list(const char *filename, const UNICHARSET &unicharset);
235 
236  // Initializes the values of *_pattern_ unichar ids.
237  // This function should be called before calling read_pattern_list().
238  void initialize_patterns(UNICHARSET *unicharset);
239 
240  // Fills in the given unichar id vector with the unichar ids that represent
241  // the patterns of the character classes of the given unichar_id.
242  void unichar_id_to_patterns(UNICHAR_ID unichar_id,
243  const UNICHARSET &unicharset,
244  GenericVector<UNICHAR_ID> *vec) const;
245 
246  // Returns the given EDGE_REF if the EDGE_RECORD that it points to has
247  // a self loop and the given unichar_id matches the unichar_id stored in the
248  // EDGE_RECORD, returns NO_EDGE otherwise.
250  UNICHAR_ID unichar_id,
251  bool word_end) const {
252  if (edge_ref == NO_EDGE) return NO_EDGE;
253  EDGE_RECORD *edge_rec = deref_edge_ref(edge_ref);
254  return (marker_flag_from_edge_rec(*edge_rec) &&
255  unichar_id == unichar_id_from_edge_rec(*edge_rec) &&
256  word_end == end_of_word_from_edge_rec(*edge_rec)) ?
257  edge_ref : NO_EDGE;
258  }
259 
260  // Adds a word to the Trie (creates the necessary nodes and edges).
261  //
262  // If repetitions vector is not NULL, each entry in the vector indicates
263  // whether the unichar id with the corresponding index in the word is allowed
264  // to repeat an unlimited number of times. For each entry that is true, MARKER
265  // flag of the corresponding edge created for this unichar id is set to true).
266  //
267  // Return true if add succeeded, false otherwise (e.g. when a word contained
268  // an invalid unichar id or the trie was getting too large and was cleared).
269  bool add_word_to_dawg(const WERD_CHOICE &word,
270  const GenericVector<bool> *repetitions);
271  bool add_word_to_dawg(const WERD_CHOICE &word) {
272  return add_word_to_dawg(word, NULL);
273  }
274 
275  protected:
276  // The structure of an EDGE_REF for Trie edges is as follows:
277  // [LETTER_START_BIT, flag_start_bit_):
278  // edge index in *_edges in a TRIE_NODE_RECORD
279  // [flag_start_bit, 30th bit]: node index in nodes (TRIE_NODES vector)
280  //
281  // With this arrangement there are enough bits to represent edge indices
282  // (each node can have at most unicharset_size_ forward edges and
283  // the position of flag_start_bit is set to be log2(unicharset_size_)).
284  // It is also possible to accommodate a maximum number of nodes that is at
285  // least as large as that of the SquishedDawg representation (in SquishedDawg
286  // each EDGE_RECORD has 32-(flag_start_bit+NUM_FLAG_BITS) bits to represent
287  // the next node index).
288  //
289 
290  // Returns the pointer to EDGE_RECORD after decoding the location
291  // of the edge from the information in the given EDGE_REF.
292  // This function assumes that EDGE_REF holds valid node/edge indices.
293  inline EDGE_RECORD *deref_edge_ref(EDGE_REF edge_ref) const {
294  int edge_index = static_cast<int>(
295  (edge_ref & letter_mask_) >> LETTER_START_BIT);
296  int node_index = static_cast<int>(
297  (edge_ref & deref_node_index_mask_) >> flag_start_bit_);
298  TRIE_NODE_RECORD *node_rec = nodes_[node_index];
299  return &(node_rec->forward_edges[edge_index]);
300  }
302  inline EDGE_REF make_edge_ref(NODE_REF node_index,
303  EDGE_INDEX edge_index) const {
304  return ((node_index << flag_start_bit_) |
305  (edge_index << LETTER_START_BIT));
306  }
308  inline void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats,
309  int direction, bool word_end, UNICHAR_ID unichar_id) {
310  EDGE_RECORD flags = 0;
311  if (repeats) flags |= MARKER_FLAG;
312  if (word_end) flags |= WERD_END_FLAG;
313  if (direction == BACKWARD_EDGE) flags |= DIRECTION_FLAG;
314  *edge = ((nxt << next_node_start_bit_) |
315  (static_cast<EDGE_RECORD>(flags) << flag_start_bit_) |
316  (static_cast<EDGE_RECORD>(unichar_id) << LETTER_START_BIT));
317  }
319  inline void print_edge_rec(const EDGE_RECORD &edge_rec) const {
320  tprintf("|" REFFORMAT "|%s%s%s|%d|", next_node_from_edge_rec(edge_rec),
321  marker_flag_from_edge_rec(edge_rec) ? "R," : "",
322  (direction_from_edge_rec(edge_rec) == FORWARD_EDGE) ? "F" : "B",
323  end_of_word_from_edge_rec(edge_rec) ? ",E" : "",
324  unichar_id_from_edge_rec(edge_rec));
325  }
326  // Returns true if the next node in recorded the given EDGE_RECORD
327  // has exactly one forward edge.
328  inline bool can_be_eliminated(const EDGE_RECORD &edge_rec) {
329  NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
330  return (node_ref != NO_EDGE &&
331  nodes_[static_cast<int>(node_ref)]->forward_edges.size() == 1);
332  }
333 
334  // Prints the contents of the Trie.
335  // At most max_num_edges will be printed for each node.
336  void print_all(const char* msg, int max_num_edges) {
337  tprintf("\n__________________________\n%s\n", msg);
338  for (int i = 0; i < nodes_.size(); ++i) print_node(i, max_num_edges);
339  tprintf("__________________________\n");
340  }
341 
342  // Finds the edge with the given direction, word_end and unichar_id
343  // in the node indicated by node_ref. Fills in the pointer to the
344  // EDGE_RECORD and the index of the edge with the the values
345  // corresponding to the edge found. Returns true if an edge was found.
346  bool edge_char_of(NODE_REF node_ref, NODE_REF next_node,
347  int direction, bool word_end, UNICHAR_ID unichar_id,
348  EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const;
349 
350  // Adds an single edge linkage between node1 and node2 in the direction
351  // indicated by direction argument.
352  bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats,
353  int direction, bool word_end,
354  UNICHAR_ID unichar_id);
355 
356  // Adds forward edge linkage from node1 to node2 and the corresponding
357  // backward edge linkage in the other direction.
358  bool add_new_edge(NODE_REF node1, NODE_REF node2,
359  bool repeats, bool word_end, UNICHAR_ID unichar_id) {
360  return (add_edge_linkage(node1, node2, repeats, FORWARD_EDGE,
361  word_end, unichar_id) &&
362  add_edge_linkage(node2, node1, repeats, BACKWARD_EDGE,
363  word_end, unichar_id));
364  }
365 
366  // Sets the word ending flags in an already existing edge pair.
367  // Returns true on success.
368  void add_word_ending(EDGE_RECORD *edge,
369  NODE_REF the_next_node,
370  bool repeats,
371  UNICHAR_ID unichar_id);
372 
373  // Allocates space for a new node in the Trie.
374  NODE_REF new_dawg_node();
375 
376  // Removes a single edge linkage to between node1 and node2 in the
377  // direction indicated by direction argument.
378  void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction,
379  bool word_end, UNICHAR_ID unichar_id);
380 
381  // Removes forward edge linkage from node1 to node2 and the corresponding
382  // backward edge linkage in the other direction.
383  void remove_edge(NODE_REF node1, NODE_REF node2,
384  bool word_end, UNICHAR_ID unichar_id) {
385  remove_edge_linkage(node1, node2, FORWARD_EDGE, word_end, unichar_id);
386  remove_edge_linkage(node2, node1, BACKWARD_EDGE, word_end, unichar_id);
387  }
388 
389  // Compares edge1 and edge2 in the given node to see if they point to two
390  // next nodes that could be collapsed. If they do, performs the reduction
391  // and returns true.
392  bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1,
393  const EDGE_RECORD &edge2);
394 
395  // Assuming that edge_index indicates the first edge in a group of edges
396  // in this node with a particular letter value, looks through these edges
397  // to see if any of them can be collapsed. If so does it. Returns to the
398  // caller when all edges with this letter have been reduced.
399  // Returns true if further reduction is possible with this same letter.
400  bool reduce_lettered_edges(EDGE_INDEX edge_index,
401  UNICHAR_ID unichar_id,
402  NODE_REF node,
404  NODE_MARKER reduced_nodes);
405 
412  void sort_edges(EDGE_VECTOR *edges);
413 
415  void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes);
416 
417  // Returns the pattern unichar id for the given character class code.
418  UNICHAR_ID character_class_to_pattern(char ch);
419 
420  // Member variables
421  TRIE_NODES nodes_; // vector of nodes in the Trie
422  uinT64 num_edges_; // sum of all edges (forward and backward)
423  uinT64 deref_direction_mask_; // mask for EDGE_REF to extract direction
424  uinT64 deref_node_index_mask_; // mask for EDGE_REF to extract node index
425  // Freelist of edges in the root backwards node that were previously zeroed.
427  // Variables for translating character class codes denoted in user patterns
428  // file to the unichar ids used to represent them in a Trie.
436 };
437 } // namespace tesseract
438 
439 #endif
uinT64 deref_node_index_mask_
Definition: trie.h:424
bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:358
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: trie.h:104
int64_t inT64
Definition: host.h:40
#define FORWARD_EDGE
Definition: dawg.h:84
int UNICHAR_ID
Definition: unichar.h:33
#define BACKWARD_EDGE
Definition: dawg.h:85
uint64_t uinT64
Definition: host.h:41
EDGE_VECTOR backward_edges
Definition: trie.h:50
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:308
LIST reverse(LIST list)
Definition: oldlist.cpp:351
bool end_of_word(EDGE_REF edge_ref) const
Definition: trie.h:142
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
uinT64 deref_direction_mask_
Definition: trie.h:423
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
UNICHAR_ID lower_pattern_
Definition: trie.h:434
NODE_REF next_node(EDGE_REF edge_ref) const
Definition: trie.h:133
UNICHAR_ID upper_pattern_
Definition: trie.h:435
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: trie.h:249
int size() const
Definition: genericvector.h:72
bool add_word_to_dawg(const WERD_CHOICE &word)
Definition: trie.h:271
bool initialized_patterns_
Definition: trie.h:429
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:319
#define REFFORMAT
Definition: dawg.h:92
EDGE_RECORD * deref_edge_ref(EDGE_REF edge_ref) const
Definition: trie.h:293
void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const
Definition: trie.h:117
inT64 NODE_REF
Definition: dawg.h:55
EDGE_VECTOR forward_edges
Definition: trie.h:49
RTLReversePolicy
Definition: trie.h:64
#define WERD_END_FLAG
Definition: dawg.h:89
Definition: strngs.h:45
UNICHAR_ID alphanum_pattern_
Definition: trie.h:432
Trie(DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
Definition: trie.h:89
void print_all(const char *msg, int max_num_edges)
Definition: trie.h:336
DawgType
Definition: dawg.h:71
uinT64 num_edges_
Definition: trie.h:422
TRIE_NODES nodes_
Definition: trie.h:421
void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:383
UNICHAR_ID digit_pattern_
Definition: trie.h:431
UNICHAR_ID alpha_pattern_
Definition: trie.h:430
void KillEdge(EDGE_RECORD *edge_rec) const
Definition: trie.h:154
UNICHAR_ID punc_pattern_
Definition: trie.h:433
const char * filename
Definition: ioapi.h:38
EDGE_REF make_edge_ref(NODE_REF node_index, EDGE_INDEX edge_index) const
Definition: trie.h:302
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:158
GenericVector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:426
bool * NODE_MARKER
Definition: trie.h:45
inT64 EDGE_INDEX
Definition: trie.h:32
virtual ~Trie()
Definition: trie.h:98
#define DIRECTION_FLAG
Definition: dawg.h:88
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
Definition: trie.h:328
uinT64 EDGE_RECORD
Definition: dawg.h:50
inT64 EDGE_REF
Definition: dawg.h:54
GenericVector< EDGE_RECORD > EDGE_VECTOR
Definition: trie.h:46
PermuterType
Definition: ratngs.h:240
GenericVector< TRIE_NODE_RECORD * > TRIE_NODES
Definition: trie.h:52
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const
Definition: trie.h:148
#define MARKER_FLAG
Definition: dawg.h:87
#define LETTER_START_BIT
Definition: dawg.h:90