tesseract  4.00.00dev
trie.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: trie.c (Formerly trie.c)
5  * Description: Functions to build a trie data structure.
6  * Author: Mark Seaman, OCR Technology
7  * Created: Fri Oct 16 14:37:00 1987
8  * Modified: Fri Jul 26 12:18:10 1991 (Mark Seaman) marks@hpgrlt
9  * Language: C
10  * Package: N/A
11  * Status: Reusable Software Component
12  *
13  * (c) Copyright 1987, Hewlett-Packard Company.
14  ** Licensed under the Apache License, Version 2.0 (the "License");
15  ** you may not use this file except in compliance with the License.
16  ** You may obtain a copy of the License at
17  ** http://www.apache.org/licenses/LICENSE-2.0
18  ** Unless required by applicable law or agreed to in writing, software
19  ** distributed under the License is distributed on an "AS IS" BASIS,
20  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
21  ** See the License for the specific language governing permissions and
22  ** limitations under the License.
23  *
24  *********************************************************************************/
25 /*----------------------------------------------------------------------
26  I n c l u d e s
27 ----------------------------------------------------------------------*/
28 #ifdef _MSC_VER
29 #pragma warning(disable:4244) // Conversion warnings
30 #pragma warning(disable:4800) // int/bool warnings
31 #endif
32 #include "trie.h"
33 
34 #include "callcpp.h"
35 #include "dawg.h"
36 #include "dict.h"
37 #include "genericvector.h"
38 #include "helpers.h"
39 #include "kdpair.h"
40 
41 namespace tesseract {
42 
43 const char kDoNotReverse[] = "RRP_DO_NO_REVERSE";
44 const char kReverseIfHasRTL[] = "RRP_REVERSE_IF_HAS_RTL";
45 const char kForceReverse[] = "RRP_FORCE_REVERSE";
46 
47 const char * const RTLReversePolicyNames[] = {
50  kForceReverse
51 };
52 
53 const char Trie::kAlphaPatternUnicode[] = "\u2000";
54 const char Trie::kDigitPatternUnicode[] = "\u2001";
55 const char Trie::kAlphanumPatternUnicode[] = "\u2002";
56 const char Trie::kPuncPatternUnicode[] = "\u2003";
57 const char Trie::kLowerPatternUnicode[] = "\u2004";
58 const char Trie::kUpperPatternUnicode[] = "\u2005";
59 
60 const char *Trie::get_reverse_policy_name(RTLReversePolicy reverse_policy) {
61  return RTLReversePolicyNames[reverse_policy];
62 }
63 
64 // Reset the Trie to empty.
65 void Trie::clear() {
67  nodes_.clear();
69  num_edges_ = 0;
70  new_dawg_node(); // Need to allocate node 0.
71 }
72 
74  int direction, bool word_end, UNICHAR_ID unichar_id,
75  EDGE_RECORD **edge_ptr, EDGE_INDEX *edge_index) const {
76  if (debug_level_ == 3) {
77  tprintf("edge_char_of() given node_ref " REFFORMAT " next_node " REFFORMAT
78  " direction %d word_end %d unichar_id %d, exploring node:\n",
79  node_ref, next_node, direction, word_end, unichar_id);
80  if (node_ref != NO_EDGE) {
81  print_node(node_ref, nodes_[node_ref]->forward_edges.size());
82  }
83  }
84  if (node_ref == NO_EDGE) return false;
85  assert(node_ref < nodes_.size());
86  EDGE_VECTOR &vec = (direction == FORWARD_EDGE) ?
87  nodes_[node_ref]->forward_edges : nodes_[node_ref]->backward_edges;
88  int vec_size = vec.size();
89  if (node_ref == 0 && direction == FORWARD_EDGE) { // binary search
90  EDGE_INDEX start = 0;
91  EDGE_INDEX end = vec_size - 1;
92  EDGE_INDEX k;
93  int compare;
94  while (start <= end) {
95  k = (start + end) >> 1; // (start + end) / 2
96  compare = given_greater_than_edge_rec(next_node, word_end,
97  unichar_id, vec[k]);
98  if (compare == 0) { // given == vec[k]
99  *edge_ptr = &(vec[k]);
100  *edge_index = k;
101  return true;
102  } else if (compare == 1) { // given > vec[k]
103  start = k + 1;
104  } else { // given < vec[k]
105  end = k - 1;
106  }
107  }
108  } else { // linear search
109  for (int i = 0; i < vec_size; ++i) {
110  EDGE_RECORD &edge_rec = vec[i];
111  if (edge_rec_match(next_node, word_end, unichar_id,
112  next_node_from_edge_rec(edge_rec),
113  end_of_word_from_edge_rec(edge_rec),
114  unichar_id_from_edge_rec(edge_rec))) {
115  *edge_ptr = &(edge_rec);
116  *edge_index = i;
117  return true;
118  }
119  }
120  }
121  return false; // not found
122 }
123 
124 bool Trie::add_edge_linkage(NODE_REF node1, NODE_REF node2, bool marker_flag,
125  int direction, bool word_end,
126  UNICHAR_ID unichar_id) {
127  EDGE_VECTOR *vec = (direction == FORWARD_EDGE) ?
128  &(nodes_[node1]->forward_edges) : &(nodes_[node1]->backward_edges);
129  int search_index;
130  if (node1 == 0 && direction == FORWARD_EDGE) {
131  search_index = 0; // find the index to make the add sorted
132  while (search_index < vec->size() &&
133  given_greater_than_edge_rec(node2, word_end, unichar_id,
134  (*vec)[search_index]) == 1) {
135  search_index++;
136  }
137  } else {
138  search_index = vec->size(); // add is unsorted, so index does not matter
139  }
140  EDGE_RECORD edge_rec;
141  link_edge(&edge_rec, node2, marker_flag, direction, word_end, unichar_id);
142  if (node1 == 0 && direction == BACKWARD_EDGE &&
144  EDGE_INDEX edge_index = root_back_freelist_.pop_back();
145  (*vec)[edge_index] = edge_rec;
146  } else if (search_index < vec->size()) {
147  vec->insert(edge_rec, search_index);
148  } else {
149  vec->push_back(edge_rec);
150  }
151  if (debug_level_ > 1) {
152  tprintf("new edge in nodes_[" REFFORMAT "]: ", node1);
153  print_edge_rec(edge_rec);
154  tprintf("\n");
155  }
156  num_edges_++;
157  return true;
158 }
159 
161  NODE_REF the_next_node,
162  bool marker_flag,
163  UNICHAR_ID unichar_id) {
164  EDGE_RECORD *back_edge_ptr;
165  EDGE_INDEX back_edge_index;
166  ASSERT_HOST(edge_char_of(the_next_node, NO_EDGE, BACKWARD_EDGE, false,
167  unichar_id, &back_edge_ptr, &back_edge_index));
168  if (marker_flag) {
169  *back_edge_ptr |= (MARKER_FLAG << flag_start_bit_);
170  *edge_ptr |= (MARKER_FLAG << flag_start_bit_);
171  }
172  // Mark both directions as end of word.
173  *back_edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
174  *edge_ptr |= (WERD_END_FLAG << flag_start_bit_);
175 }
176 
178  const GenericVector<bool> *repetitions) {
179  if (word.length() <= 0) return false; // can't add empty words
180  if (repetitions != NULL) ASSERT_HOST(repetitions->size() == word.length());
181  // Make sure the word does not contain invalid unchar ids.
182  for (int i = 0; i < word.length(); ++i) {
183  if (word.unichar_id(i) < 0 ||
184  word.unichar_id(i) >= unicharset_size_) return false;
185  }
186 
187  EDGE_RECORD *edge_ptr;
188  NODE_REF last_node = 0;
189  NODE_REF the_next_node;
190  bool marker_flag = false;
191  EDGE_INDEX edge_index;
192  int i;
193  inT32 still_finding_chars = true;
194  inT32 word_end = false;
195  bool add_failed = false;
196  bool found;
197 
198  if (debug_level_ > 1) word.print("\nAdding word: ");
199 
200  UNICHAR_ID unichar_id;
201  for (i = 0; i < word.length() - 1; ++i) {
202  unichar_id = word.unichar_id(i);
203  marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false;
204  if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
205  if (still_finding_chars) {
206  found = edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, word_end,
207  unichar_id, &edge_ptr, &edge_index);
208  if (found && debug_level_ > 1) {
209  tprintf("exploring edge " REFFORMAT " in node " REFFORMAT "\n",
210  edge_index, last_node);
211  }
212  if (!found) {
213  still_finding_chars = false;
214  } else if (next_node_from_edge_rec(*edge_ptr) == 0) {
215  // We hit the end of an existing word, but the new word is longer.
216  // In this case we have to disconnect the existing word from the
217  // backwards root node, mark the current position as end-of-word
218  // and add new nodes for the increased length. Disconnecting the
219  // existing word from the backwards root node requires a linear
220  // search, so it is much faster to add the longest words first,
221  // to avoid having to come here.
222  word_end = true;
223  still_finding_chars = false;
224  remove_edge(last_node, 0, word_end, unichar_id);
225  } else {
226  // We have to add a new branch here for the new word.
227  if (marker_flag) set_marker_flag_in_edge_rec(edge_ptr);
228  last_node = next_node_from_edge_rec(*edge_ptr);
229  }
230  }
231  if (!still_finding_chars) {
232  the_next_node = new_dawg_node();
233  if (debug_level_ > 1)
234  tprintf("adding node " REFFORMAT "\n", the_next_node);
235  if (the_next_node == 0) {
236  add_failed = true;
237  break;
238  }
239  if (!add_new_edge(last_node, the_next_node,
240  marker_flag, word_end, unichar_id)) {
241  add_failed = true;
242  break;
243  }
244  word_end = false;
245  last_node = the_next_node;
246  }
247  }
248  the_next_node = 0;
249  unichar_id = word.unichar_id(i);
250  marker_flag = (repetitions != NULL) ? (*repetitions)[i] : false;
251  if (debug_level_ > 1) tprintf("Adding letter %d\n", unichar_id);
252  if (still_finding_chars &&
253  edge_char_of(last_node, NO_EDGE, FORWARD_EDGE, false,
254  unichar_id, &edge_ptr, &edge_index)) {
255  // An extension of this word already exists in the trie, so we
256  // only have to add the ending flags in both directions.
257  add_word_ending(edge_ptr, next_node_from_edge_rec(*edge_ptr),
258  marker_flag, unichar_id);
259  } else {
260  // Add a link to node 0. All leaves connect to node 0 so the back links can
261  // be used in reduction to a dawg. This root backward node has one edge
262  // entry for every word, (except prefixes of longer words) so it is huge.
263  if (!add_failed &&
264  !add_new_edge(last_node, the_next_node, marker_flag, true, unichar_id))
265  add_failed = true;
266  }
267  if (add_failed) {
268  tprintf("Re-initializing document dictionary...\n");
269  clear();
270  return false;
271  } else {
272  return true;
273  }
274 }
275 
277  TRIE_NODE_RECORD *node = new TRIE_NODE_RECORD();
278  nodes_.push_back(node);
279  return nodes_.length() - 1;
280 }
281 
282 // Sort function to sort words by decreasing order of length.
283 static int sort_strings_by_dec_length(const void* v1, const void* v2) {
284  const STRING* s1 = static_cast<const STRING*>(v1);
285  const STRING* s2 = static_cast<const STRING*>(v2);
286  return s2->length() - s1->length();
287 }
288 
290  const UNICHARSET &unicharset,
291  Trie::RTLReversePolicy reverse_policy) {
292  GenericVector<STRING> word_list;
293  if (!read_word_list(filename, unicharset, reverse_policy, &word_list))
294  return false;
295  word_list.sort(sort_strings_by_dec_length);
296  return add_word_list(word_list, unicharset);
297 }
298 
300  const UNICHARSET &unicharset,
301  Trie::RTLReversePolicy reverse_policy,
302  GenericVector<STRING>* words) {
303  FILE *word_file;
304  char string[CHARS_PER_LINE];
305  int word_count = 0;
306 
307  word_file = fopen(filename, "rb");
308  if (word_file == NULL) return false;
309 
310  while (fgets(string, CHARS_PER_LINE, word_file) != NULL) {
311  chomp_string(string); // remove newline
312  WERD_CHOICE word(string, unicharset);
313  if ((reverse_policy == RRP_REVERSE_IF_HAS_RTL &&
314  word.has_rtl_unichar_id()) ||
315  reverse_policy == RRP_FORCE_REVERSE) {
317  }
318  ++word_count;
319  if (debug_level_ && word_count % 10000 == 0)
320  tprintf("Read %d words so far\n", word_count);
321  if (word.length() != 0 && !word.contains_unichar_id(INVALID_UNICHAR_ID)) {
322  words->push_back(word.unichar_string());
323  } else if (debug_level_) {
324  tprintf("Skipping invalid word %s\n", string);
325  if (debug_level_ >= 3) word.print();
326  }
327  }
328  if (debug_level_)
329  tprintf("Read %d words total.\n", word_count);
330  fclose(word_file);
331  return true;
332 }
333 
335  const UNICHARSET &unicharset) {
336  for (int i = 0; i < words.size(); ++i) {
337  WERD_CHOICE word(words[i].string(), unicharset);
338  if (!word_in_dawg(word)) {
339  add_word_to_dawg(word);
340  if (!word_in_dawg(word)) {
341  tprintf("Error: word '%s' not in DAWG after adding it\n",
342  words[i].string());
343  return false;
344  }
345  }
346  }
347  return true;
348 }
349 
357  unicharset->unichar_insert(kPuncPatternUnicode);
363  initialized_patterns_ = true;
364  unicharset_size_ = unicharset->size();
365 }
366 
368  const UNICHARSET &unicharset,
369  GenericVector<UNICHAR_ID> *vec) const {
370  bool is_alpha = unicharset.get_isalpha(unichar_id);
371  if (is_alpha) {
374  if (unicharset.get_islower(unichar_id)) {
376  } else if (unicharset.get_isupper(unichar_id)) {
378  }
379  }
380  if (unicharset.get_isdigit(unichar_id)) {
382  if (!is_alpha) vec->push_back(alphanum_pattern_);
383  }
384  if (unicharset.get_ispunctuation(unichar_id)) {
385  vec->push_back(punc_pattern_);
386  }
387 }
388 
390  if (ch == 'c') {
391  return alpha_pattern_;
392  } else if (ch == 'd') {
393  return digit_pattern_;
394  } else if (ch == 'n') {
395  return alphanum_pattern_;
396  } else if (ch == 'p') {
397  return punc_pattern_;
398  } else if (ch == 'a') {
399  return lower_pattern_;
400  } else if (ch == 'A') {
401  return upper_pattern_;
402  } else {
403  return INVALID_UNICHAR_ID;
404  }
405 }
406 
408  const UNICHARSET &unicharset) {
409  if (!initialized_patterns_) {
410  tprintf("please call initialize_patterns() before read_pattern_list()\n");
411  return false;
412  }
413 
414  FILE *pattern_file = fopen(filename, "rb");
415  if (pattern_file == NULL) {
416  tprintf("Error opening pattern file %s\n", filename);
417  return false;
418  }
419 
420  int pattern_count = 0;
421  char string[CHARS_PER_LINE];
422  while (fgets(string, CHARS_PER_LINE, pattern_file) != NULL) {
423  chomp_string(string); // remove newline
424  // Parse the pattern and construct a unichar id vector.
425  // Record the number of repetitions of each unichar in the parallel vector.
426  WERD_CHOICE word(&unicharset);
427  GenericVector<bool> repetitions_vec;
428  const char *str_ptr = string;
429  int step = unicharset.step(str_ptr);
430  bool failed = false;
431  while (step > 0) {
432  UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
433  if (step == 1 && *str_ptr == '\\') {
434  ++str_ptr;
435  if (*str_ptr == '\\') { // regular '\' unichar that was escaped
436  curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
437  } else {
438  if (word.length() < kSaneNumConcreteChars) {
439  tprintf("Please provide at least %d concrete characters at the"
440  " beginning of the pattern\n", kSaneNumConcreteChars);
441  failed = true;
442  break;
443  }
444  // Parse character class from expression.
445  curr_unichar_id = character_class_to_pattern(*str_ptr);
446  }
447  } else {
448  curr_unichar_id = unicharset.unichar_to_id(str_ptr, step);
449  }
450  if (curr_unichar_id == INVALID_UNICHAR_ID) {
451  failed = true;
452  break; // failed to parse this pattern
453  }
454  word.append_unichar_id(curr_unichar_id, 1, 0.0, 0.0);
455  repetitions_vec.push_back(false);
456  str_ptr += step;
457  step = unicharset.step(str_ptr);
458  // Check if there is a repetition pattern specified after this unichar.
459  if (step == 1 && *str_ptr == '\\' && *(str_ptr+1) == '*') {
460  repetitions_vec[repetitions_vec.size()-1] = true;
461  str_ptr += 2;
462  step = unicharset.step(str_ptr);
463  }
464  }
465  if (failed) {
466  tprintf("Invalid user pattern %s\n", string);
467  continue;
468  }
469  // Insert the pattern into the trie.
470  if (debug_level_ > 2) {
471  tprintf("Inserting expanded user pattern %s\n",
472  word.debug_string().string());
473  }
474  if (!this->word_in_dawg(word)) {
475  this->add_word_to_dawg(word, &repetitions_vec);
476  if (!this->word_in_dawg(word)) {
477  tprintf("Error: failed to insert pattern '%s'\n", string);
478  }
479  }
480  ++pattern_count;
481  }
482  if (debug_level_) {
483  tprintf("Read %d valid patterns from %s\n", pattern_count, filename);
484  }
485  fclose(pattern_file);
486  return true;
487 }
488 
490  bool word_end, UNICHAR_ID unichar_id) {
491  EDGE_RECORD *edge_ptr = NULL;
492  EDGE_INDEX edge_index = 0;
493  ASSERT_HOST(edge_char_of(node1, node2, direction, word_end,
494  unichar_id, &edge_ptr, &edge_index));
495  if (debug_level_ > 1) {
496  tprintf("removed edge in nodes_[" REFFORMAT "]: ", node1);
497  print_edge_rec(*edge_ptr);
498  tprintf("\n");
499  }
500  if (direction == FORWARD_EDGE) {
501  nodes_[node1]->forward_edges.remove(edge_index);
502  } else if (node1 == 0) {
503  KillEdge(&nodes_[node1]->backward_edges[edge_index]);
504  root_back_freelist_.push_back(edge_index);
505  } else {
506  nodes_[node1]->backward_edges.remove(edge_index);
507  }
508  --num_edges_;
509 }
510 
511 // Some optimizations employed in add_word_to_dawg and trie_to_dawg:
512 // 1 Avoid insertion sorting or bubble sorting the tail root node
513 // (back links on node 0, a list of all the leaves.). The node is
514 // huge, and sorting it with n^2 time is terrible.
515 // 2 Avoid using GenericVector::remove on the tail root node.
516 // (a) During add of words to the trie, zero-out the unichars and
517 // keep a freelist of spaces to re-use.
518 // (b) During reduction, just zero-out the unichars of deleted back
519 // links, skipping zero entries while searching.
520 // 3 Avoid linear search of the tail root node. This has to be done when
521 // a suffix is added to an existing word. Adding words by decreasing
522 // length avoids this problem entirely. Words can still be added in
523 // any order, but it is faster to add the longest first.
525  root_back_freelist_.clear(); // Will be invalided by trie_to_dawg.
526  if (debug_level_ > 2) {
527  print_all("Before reduction:", MAX_NODE_EDGES_DISPLAY);
528  }
529  NODE_MARKER reduced_nodes = new bool[nodes_.size()];
530  for (int i = 0; i < nodes_.size(); i++) reduced_nodes[i] = 0;
531  this->reduce_node_input(0, reduced_nodes);
532  delete[] reduced_nodes;
533 
534  if (debug_level_ > 2) {
535  print_all("After reduction:", MAX_NODE_EDGES_DISPLAY);
536  }
537  // Build a translation map from node indices in nodes_ vector to
538  // their target indices in EDGE_ARRAY.
539  NODE_REF *node_ref_map = new NODE_REF[nodes_.size() + 1];
540  int i, j;
541  node_ref_map[0] = 0;
542  for (i = 0; i < nodes_.size(); ++i) {
543  node_ref_map[i+1] = node_ref_map[i] + nodes_[i]->forward_edges.size();
544  }
545  int num_forward_edges = node_ref_map[i];
546 
547  // Convert nodes_ vector into EDGE_ARRAY translating the next node references
548  // in edges using node_ref_map. Empty nodes and backward edges are dropped.
549  EDGE_ARRAY edge_array = new EDGE_RECORD[num_forward_edges];
550  EDGE_ARRAY edge_array_ptr = edge_array;
551  for (i = 0; i < nodes_.size(); ++i) {
552  TRIE_NODE_RECORD *node_ptr = nodes_[i];
553  int end = node_ptr->forward_edges.size();
554  for (j = 0; j < end; ++j) {
555  EDGE_RECORD &edge_rec = node_ptr->forward_edges[j];
556  NODE_REF node_ref = next_node_from_edge_rec(edge_rec);
557  ASSERT_HOST(node_ref < nodes_.size());
558  UNICHAR_ID unichar_id = unichar_id_from_edge_rec(edge_rec);
559  link_edge(edge_array_ptr, node_ref_map[node_ref], false, FORWARD_EDGE,
560  end_of_word_from_edge_rec(edge_rec), unichar_id);
561  if (j == end - 1) set_marker_flag_in_edge_rec(edge_array_ptr);
562  ++edge_array_ptr;
563  }
564  }
565  delete[] node_ref_map;
566 
567  return new SquishedDawg(edge_array, num_forward_edges, type_, lang_,
569 }
570 
572  const EDGE_RECORD &edge1,
573  const EDGE_RECORD &edge2) {
574  if (debug_level_ > 1) {
575  tprintf("\nCollapsing node %" PRIi64 ":\n", node);
577  tprintf("Candidate edges: ");
578  print_edge_rec(edge1);
579  tprintf(", ");
580  print_edge_rec(edge2);
581  tprintf("\n\n");
582  }
583  NODE_REF next_node1 = next_node_from_edge_rec(edge1);
584  NODE_REF next_node2 = next_node_from_edge_rec(edge2);
585  TRIE_NODE_RECORD *next_node2_ptr = nodes_[next_node2];
586  // Translate all edges going to/from next_node2 to go to/from next_node1.
587  EDGE_RECORD *edge_ptr = NULL;
588  EDGE_INDEX edge_index;
589  int i;
590  // The backward link in node to next_node2 will be zeroed out by the caller.
591  // Copy all the backward links in next_node2 to node next_node1
592  for (i = 0; i < next_node2_ptr->backward_edges.size(); ++i) {
593  const EDGE_RECORD &bkw_edge = next_node2_ptr->backward_edges[i];
594  NODE_REF curr_next_node = next_node_from_edge_rec(bkw_edge);
595  UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(bkw_edge);
596  int curr_word_end = end_of_word_from_edge_rec(bkw_edge);
597  bool marker_flag = marker_flag_from_edge_rec(bkw_edge);
598  add_edge_linkage(next_node1, curr_next_node, marker_flag, BACKWARD_EDGE,
599  curr_word_end, curr_unichar_id);
600  // Relocate the corresponding forward edge in curr_next_node
601  ASSERT_HOST(edge_char_of(curr_next_node, next_node2, FORWARD_EDGE,
602  curr_word_end, curr_unichar_id,
603  &edge_ptr, &edge_index));
604  set_next_node_in_edge_rec(edge_ptr, next_node1);
605  }
606  int next_node2_num_edges = (next_node2_ptr->forward_edges.size() +
607  next_node2_ptr->backward_edges.size());
608  if (debug_level_ > 1) {
609  tprintf("removed %d edges from node " REFFORMAT "\n",
610  next_node2_num_edges, next_node2);
611  }
612  next_node2_ptr->forward_edges.clear();
613  next_node2_ptr->backward_edges.clear();
614  num_edges_ -= next_node2_num_edges;
615  return true;
616 }
617 
619  UNICHAR_ID unichar_id,
620  NODE_REF node,
621  EDGE_VECTOR* backward_edges,
622  NODE_MARKER reduced_nodes) {
623  if (debug_level_ > 1)
624  tprintf("reduce_lettered_edges(edge=" REFFORMAT ")\n", edge_index);
625  // Compare each of the edge pairs with the given unichar_id.
626  bool did_something = false;
627  for (int i = edge_index; i < backward_edges->size() - 1; ++i) {
628  // Find the first edge that can be eliminated.
629  UNICHAR_ID curr_unichar_id = INVALID_UNICHAR_ID;
630  while (i < backward_edges->size()) {
631  if (!DeadEdge((*backward_edges)[i])) {
632  curr_unichar_id = unichar_id_from_edge_rec((*backward_edges)[i]);
633  if (curr_unichar_id != unichar_id) return did_something;
634  if (can_be_eliminated((*backward_edges)[i])) break;
635  }
636  ++i;
637  }
638  if (i == backward_edges->size()) break;
639  const EDGE_RECORD &edge_rec = (*backward_edges)[i];
640  // Compare it to the rest of the edges with the given unichar_id.
641  for (int j = i + 1; j < backward_edges->size(); ++j) {
642  const EDGE_RECORD &next_edge_rec = (*backward_edges)[j];
643  if (DeadEdge(next_edge_rec)) continue;
644  UNICHAR_ID next_id = unichar_id_from_edge_rec(next_edge_rec);
645  if (next_id != unichar_id) break;
646  if (end_of_word_from_edge_rec(next_edge_rec) ==
647  end_of_word_from_edge_rec(edge_rec) &&
648  can_be_eliminated(next_edge_rec) &&
649  eliminate_redundant_edges(node, edge_rec, next_edge_rec)) {
650  reduced_nodes[next_node_from_edge_rec(edge_rec)] = 0;
651  did_something = true;
652  KillEdge(&(*backward_edges)[j]);
653  }
654  }
655  }
656  return did_something;
657 }
658 
660  int num_edges = edges->size();
661  if (num_edges <= 1) return;
663  sort_vec.reserve(num_edges);
664  for (int i = 0; i < num_edges; ++i) {
666  unichar_id_from_edge_rec((*edges)[i]), (*edges)[i]));
667  }
668  sort_vec.sort();
669  for (int i = 0; i < num_edges; ++i)
670  (*edges)[i] = sort_vec[i].data;
671 }
672 
674  NODE_MARKER reduced_nodes) {
675  EDGE_VECTOR &backward_edges = nodes_[node]->backward_edges;
676  sort_edges(&backward_edges);
677  if (debug_level_ > 1) {
678  tprintf("reduce_node_input(node=" REFFORMAT ")\n", node);
680  }
681 
682  EDGE_INDEX edge_index = 0;
683  while (edge_index < backward_edges.size()) {
684  if (DeadEdge(backward_edges[edge_index])) continue;
685  UNICHAR_ID unichar_id =
686  unichar_id_from_edge_rec(backward_edges[edge_index]);
687  while (reduce_lettered_edges(edge_index, unichar_id, node,
688  &backward_edges, reduced_nodes));
689  while (++edge_index < backward_edges.size()) {
690  UNICHAR_ID id = unichar_id_from_edge_rec(backward_edges[edge_index]);
691  if (!DeadEdge(backward_edges[edge_index]) && id != unichar_id) break;
692  }
693  }
694  reduced_nodes[node] = true; // mark as reduced
695 
696  if (debug_level_ > 1) {
697  tprintf("Node " REFFORMAT " after reduction:\n", node);
699  }
700 
701  for (int i = 0; i < backward_edges.size(); ++i) {
702  if (DeadEdge(backward_edges[i])) continue;
703  NODE_REF next_node = next_node_from_edge_rec(backward_edges[i]);
704  if (next_node != 0 && !reduced_nodes[next_node]) {
705  reduce_node_input(next_node, reduced_nodes);
706  }
707  }
708 }
709 
710 void Trie::print_node(NODE_REF node, int max_num_edges) const {
711  if (node == NO_EDGE) return; // nothing to print
712  TRIE_NODE_RECORD *node_ptr = nodes_[node];
713  int num_fwd = node_ptr->forward_edges.size();
714  int num_bkw = node_ptr->backward_edges.size();
715  EDGE_VECTOR *vec;
716  for (int dir = 0; dir < 2; ++dir) {
717  if (dir == 0) {
718  vec = &(node_ptr->forward_edges);
719  tprintf(REFFORMAT " (%d %d): ", node, num_fwd, num_bkw);
720  } else {
721  vec = &(node_ptr->backward_edges);
722  tprintf("\t");
723  }
724  int i;
725  for (i = 0; (dir == 0 ? i < num_fwd : i < num_bkw) &&
726  i < max_num_edges; ++i) {
727  if (DeadEdge((*vec)[i])) continue;
728  print_edge_rec((*vec)[i]);
729  tprintf(" ");
730  }
731  if (dir == 0 ? i < num_fwd : i < num_bkw) tprintf("...");
732  tprintf("\n");
733  }
734 }
735 
736 } // namespace tesseract
void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
Definition: trie.cpp:367
static const char kLowerPatternUnicode[]
Definition: trie.h:79
int debug_level_
Definition: dawg.h:315
bool add_new_edge(NODE_REF node1, NODE_REF node2, bool repeats, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:358
EDGE_REF edge_char_of(NODE_REF node_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: trie.h:104
static const char kDigitPatternUnicode[]
Definition: trie.h:76
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
#define FORWARD_EDGE
Definition: dawg.h:84
void print() const
Definition: ratngs.h:578
bool add_word_to_dawg(const WERD_CHOICE &word, const GenericVector< bool > *repetitions)
Definition: trie.cpp:177
int32_t inT32
Definition: host.h:38
bool reduce_lettered_edges(EDGE_INDEX edge_index, UNICHAR_ID unichar_id, NODE_REF node, EDGE_VECTOR *backward_edges, NODE_MARKER reduced_nodes)
Definition: trie.cpp:618
int UNICHAR_ID
Definition: unichar.h:33
void reserve(int size)
void reverse_and_mirror_unichar_ids()
Definition: ratngs.cpp:343
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:479
#define BACKWARD_EDGE
Definition: dawg.h:85
int length() const
Definition: ratngs.h:301
EDGE_VECTOR backward_edges
Definition: trie.h:50
void remove_edge_linkage(NODE_REF node1, NODE_REF node2, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:489
voidpf void uLong size
Definition: ioapi.h:39
NODE_REF new_dawg_node()
Definition: trie.cpp:276
static const char kAlphaPatternUnicode[]
Definition: trie.h:75
void add_word_ending(EDGE_RECORD *edge, NODE_REF the_next_node, bool repeats, UNICHAR_ID unichar_id)
Definition: trie.cpp:160
void link_edge(EDGE_RECORD *edge, NODE_REF nxt, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:308
void remove(int index)
const char *const RTLReversePolicyNames[]
Definition: trie.cpp:47
bool eliminate_redundant_edges(NODE_REF node, const EDGE_RECORD &edge1, const EDGE_RECORD &edge2)
Definition: trie.cpp:571
int push_back(T object)
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:225
const STRING debug_string() const
Definition: ratngs.h:503
void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value)
Sets the next node link for this edge in the Dawg.
Definition: dawg.h:234
#define tprintf(...)
Definition: tprintf.h:31
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
const char * string() const
Definition: strngs.cpp:198
UNICHAR_ID lower_pattern_
Definition: trie.h:434
NODE_REF next_node(EDGE_REF edge_ref) const
Definition: trie.h:133
bool add_edge_linkage(NODE_REF node1, NODE_REF node2, bool repeats, int direction, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.cpp:124
UNICHAR_ID upper_pattern_
Definition: trie.h:435
bool empty() const
Definition: genericvector.h:90
void sort_edges(EDGE_VECTOR *edges)
Definition: trie.cpp:659
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:212
inT32 length() const
Definition: strngs.cpp:193
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:446
int size() const
Definition: genericvector.h:72
bool initialized_patterns_
Definition: trie.h:429
void print_edge_rec(const EDGE_RECORD &edge_rec) const
Definition: trie.h:319
#define REFFORMAT
Definition: dawg.h:92
int flag_start_bit_
Definition: dawg.h:309
bool word_in_dawg(const WERD_CHOICE &word) const
Returns true if the given word is in the Dawg.
Definition: dawg.cpp:69
const char kDoNotReverse[]
Definition: trie.cpp:43
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool read_and_add_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse)
Definition: trie.cpp:289
static const char * get_reverse_policy_name(RTLReversePolicy reverse_policy)
Definition: trie.cpp:60
inT64 NODE_REF
Definition: dawg.h:55
SquishedDawg * trie_to_dawg()
Definition: trie.cpp:524
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
static const char kPuncPatternUnicode[]
Definition: trie.h:78
EDGE_VECTOR forward_edges
Definition: trie.h:49
void insert(T t, int index)
void chomp_string(char *str)
Definition: helpers.h:82
RTLReversePolicy
Definition: trie.h:64
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
#define MAX_NODE_EDGES_DISPLAY
Definition: dawg.h:86
const char kReverseIfHasRTL[]
Definition: trie.cpp:44
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: ratngs.cpp:304
#define WERD_END_FLAG
Definition: dawg.h:89
Definition: strngs.h:45
void reduce_node_input(NODE_REF node, NODE_MARKER reduced_nodes)
Definition: trie.cpp:673
UNICHAR_ID alphanum_pattern_
Definition: trie.h:432
DawgType type_
Definition: dawg.h:300
static const char kUpperPatternUnicode[]
Definition: trie.h:80
void print_all(const char *msg, int max_num_edges)
Definition: trie.h:336
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Definition: dawg.h:216
void initialize_patterns(UNICHARSET *unicharset)
Definition: trie.cpp:350
bool add_word_list(const GenericVector< STRING > &words, const UNICHARSET &unicharset)
Definition: trie.cpp:334
int unicharset_size_
Definition: dawg.h:308
uinT64 num_edges_
Definition: trie.h:422
int length() const
Definition: genericvector.h:85
bool read_word_list(const char *filename, const UNICHARSET &unicharset, Trie::RTLReversePolicy reverse_policy, GenericVector< STRING > *words)
Definition: trie.cpp:299
bool has_rtl_unichar_id() const
Definition: ratngs.cpp:409
void clear()
Definition: trie.cpp:65
void remove_edge(NODE_REF node1, NODE_REF node2, bool word_end, UNICHAR_ID unichar_id)
Definition: trie.h:383
TRIE_NODES nodes_
Definition: trie.h:421
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:250
UNICHAR_ID digit_pattern_
Definition: trie.h:431
int step(const char *str) const
Definition: unicharset.cpp:211
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
Definition: dawg.h:271
const STRING & unichar_string() const
Definition: ratngs.h:539
UNICHAR_ID alpha_pattern_
Definition: trie.h:430
void KillEdge(EDGE_RECORD *edge_rec) const
Definition: trie.h:154
UNICHAR_ID punc_pattern_
Definition: trie.h:433
const char * filename
Definition: ioapi.h:38
bool DeadEdge(const EDGE_RECORD &edge_rec) const
Definition: trie.h:158
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
Definition: dawg.h:240
STRING lang_
Definition: dawg.h:301
GenericVector< EDGE_INDEX > root_back_freelist_
Definition: trie.h:426
int size() const
Definition: unicharset.h:299
bool * NODE_MARKER
Definition: trie.h:45
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
Definition: dawg.h:303
UNICHAR_ID character_class_to_pattern(char ch)
Definition: trie.cpp:389
inT64 EDGE_INDEX
Definition: trie.h:32
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:465
bool can_be_eliminated(const EDGE_RECORD &edge_rec)
Definition: trie.h:328
uinT64 EDGE_RECORD
Definition: dawg.h:50
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:458
const char kForceReverse[]
Definition: trie.cpp:45
#define CHARS_PER_LINE
Definition: cutil.h:57
bool read_pattern_list(const char *filename, const UNICHARSET &unicharset)
Definition: trie.cpp:407
void delete_data_pointers()
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
EDGE_RECORD * EDGE_ARRAY
Definition: dawg.h:53
static const char kAlphanumPatternUnicode[]
Definition: trie.h:77
#define MARKER_FLAG
Definition: dawg.h:87
static const int kSaneNumConcreteChars
Definition: trie.h:71
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612
void print_node(NODE_REF node, int max_num_edges) const
Definition: trie.cpp:710
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:229