tesseract  4.00.00dev
dawg.h
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: dawg.h (Formerly dawg.h)
5  * Description: Definition of a class that represents Directed Accyclic Word
6  * Graph (DAWG), functions to build and manipulate the DAWG.
7  * Author: Mark Seaman, SW Productivity
8  * Created: Fri Oct 16 14:37:00 1987
9  * Modified: Wed Jun 19 16:50:24 1991 (Mark Seaman) marks@hpgrlt
10  * Language: C
11  * Package: N/A
12  * Status: Reusable Software Component
13  *
14  * (c) Copyright 1987, Hewlett-Packard Company.
15  ** Licensed under the Apache License, Version 2.0 (the "License");
16  ** you may not use this file except in compliance with the License.
17  ** You may obtain a copy of the License at
18  ** http://www.apache.org/licenses/LICENSE-2.0
19  ** Unless required by applicable law or agreed to in writing, software
20  ** distributed under the License is distributed on an "AS IS" BASIS,
21  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
22  ** See the License for the specific language governing permissions and
23  ** limitations under the License.
24  *
25  *********************************************************************************/
26 
27 #ifndef DICT_DAWG_H_
28 #define DICT_DAWG_H_
29 
30 /*----------------------------------------------------------------------
31  I n c l u d e s
32 ----------------------------------------------------------------------*/
33 
34 #include "elst.h"
35 #include "ratngs.h"
36 #include "params.h"
37 #include "tesscallback.h"
38 
39 #ifndef __GNUC__
40 #ifdef _WIN32
41 #define NO_EDGE (inT64) 0xffffffffffffffffi64
42 #endif /*_WIN32*/
43 #else
44 #define NO_EDGE (inT64) 0xffffffffffffffffll
45 #endif /*__GNUC__*/
46 
47 /*----------------------------------------------------------------------
48  T y p e s
49 ----------------------------------------------------------------------*/
50 class UNICHARSET;
51 
52 typedef uinT64 EDGE_RECORD;
54 typedef inT64 EDGE_REF;
55 typedef inT64 NODE_REF;
56 typedef EDGE_REF *NODE_MAP;
57 
58 namespace tesseract {
59 
60 struct NodeChild {
63  NodeChild(UNICHAR_ID id, EDGE_REF ref): unichar_id(id), edge_ref(ref) {}
64  NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
65 };
66 
70 
71 enum DawgType {
76 
77  DAWG_TYPE_COUNT // number of enum entries
78 };
79 
80 /*----------------------------------------------------------------------
81  C o n s t a n t s
82 ----------------------------------------------------------------------*/
83 
84 #define FORWARD_EDGE (inT32) 0
85 #define BACKWARD_EDGE (inT32) 1
86 #define MAX_NODE_EDGES_DISPLAY (inT64) 100
87 #define MARKER_FLAG (inT64) 1
88 #define DIRECTION_FLAG (inT64) 2
89 #define WERD_END_FLAG (inT64) 4
90 #define LETTER_START_BIT 0
91 #define NUM_FLAG_BITS 3
92 #define REFFORMAT "%" PRId64
93 
94 static const bool kDawgSuccessors[DAWG_TYPE_COUNT][DAWG_TYPE_COUNT] = {
95  { 0, 1, 1, 0 }, // for DAWG_TYPE_PUNCTUATION
96  { 1, 0, 0, 0 }, // for DAWG_TYPE_WORD
97  { 1, 0, 0, 0 }, // for DAWG_TYPE_NUMBER
98  { 0, 0, 0, 0 }, // for DAWG_TYPE_PATTERN
99 };
100 
101 static const char kWildcard[] = "*";
102 
103 
104 /*----------------------------------------------------------------------
105  C l a s s e s a n d S t r u c t s
106 ----------------------------------------------------------------------*/
107 //
117 //
118 class Dawg {
119  public:
121  static const inT16 kDawgMagicNumber = 42;
125  static const UNICHAR_ID kPatternUnicharID = 0;
126 
127  inline DawgType type() const { return type_; }
128  inline const STRING &lang() const { return lang_; }
129  inline PermuterType permuter() const { return perm_; }
130 
131  virtual ~Dawg() {}
132 
134  bool word_in_dawg(const WERD_CHOICE &word) const;
135 
136  // Returns true if the given word prefix is not contraindicated by the dawg.
137  // If requires_complete is true, then the exact complete word must be present.
138  bool prefix_in_dawg(const WERD_CHOICE &prefix, bool requires_complete) const;
139 
142  int check_for_words(const char *filename,
143  const UNICHARSET &unicharset,
144  bool enable_wildcard) const;
145 
146  // For each word in the Dawg, call the given (permanent) callback with the
147  // text (UTF-8) version of the word.
148  void iterate_words(const UNICHARSET &unicharset,
150 
151  // For each word in the Dawg, call the given (permanent) callback with the
152  // text (UTF-8) version of the word.
153  void iterate_words(const UNICHARSET &unicharset,
154  TessCallback1<const char *> *cb) const;
155 
156  // Pure virtual function that should be implemented by the derived classes.
157 
159  virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
160  bool word_end) const = 0;
161 
164  virtual void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
165  bool word_end) const = 0;
166 
169  virtual NODE_REF next_node(EDGE_REF edge_ref) const = 0;
170 
173  virtual bool end_of_word(EDGE_REF edge_ref) const = 0;
174 
176  virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const = 0;
177 
180  virtual void print_node(NODE_REF node, int max_num_edges) const = 0;
181 
184  virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id,
185  const UNICHARSET &unicharset,
186  GenericVector<UNICHAR_ID> *vec) const {
187  (void)unichar_id;
188  (void)unicharset;
189  (void)vec;
190  }
191 
196  EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const {
197  (void)edge_ref;
198  (void)unichar_id;
199  (void)word_end;
200  return false;
201  }
202 
203  protected:
204  Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
205  : type_(type),
206  lang_(lang),
207  perm_(perm),
208  unicharset_size_(0),
209  debug_level_(debug_level) {}
210 
212  inline NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const {
213  return ((edge_rec & next_node_mask_) >> next_node_start_bit_);
214  }
216  inline bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const {
217  return (edge_rec & (MARKER_FLAG << flag_start_bit_)) != 0;
218  }
220  inline int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const {
221  return ((edge_rec & (DIRECTION_FLAG << flag_start_bit_))) ?
223  }
225  inline bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const {
226  return (edge_rec & (WERD_END_FLAG << flag_start_bit_)) != 0;
227  }
230  const EDGE_RECORD &edge_rec) const {
231  return ((edge_rec & letter_mask_) >> LETTER_START_BIT);
232  }
235  EDGE_RECORD *edge_rec, EDGE_REF value) {
236  *edge_rec &= (~next_node_mask_);
237  *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_);
238  }
240  inline void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec) {
241  *edge_rec |= (MARKER_FLAG << flag_start_bit_);
242  }
250  inline int given_greater_than_edge_rec(NODE_REF next_node,
251  bool word_end,
252  UNICHAR_ID unichar_id,
253  const EDGE_RECORD &edge_rec) const {
254  UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);
255  NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);
256  bool curr_word_end = end_of_word_from_edge_rec(edge_rec);
257  if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,
258  curr_word_end, curr_unichar_id)) return 0;
259  if (unichar_id > curr_unichar_id) return 1;
260  if (unichar_id == curr_unichar_id) {
261  if (next_node > curr_next_node) return 1;
262  if (next_node == curr_next_node) {
263  if (word_end > curr_word_end) return 1;
264  }
265  }
266  return -1;
267  }
271  inline bool edge_rec_match(NODE_REF next_node,
272  bool word_end,
273  UNICHAR_ID unichar_id,
274  NODE_REF other_next_node,
275  bool other_word_end,
276  UNICHAR_ID other_unichar_id) const {
277  return ((unichar_id == other_unichar_id) &&
278  (next_node == NO_EDGE || next_node == other_next_node) &&
279  (!word_end || (word_end == other_word_end)));
280  }
281 
284  void init(int unicharset_size);
285 
291  bool match_words(WERD_CHOICE *word, inT32 index,
292  NODE_REF node, UNICHAR_ID wildcard) const;
293 
294  // Recursively iterate over all words in a dawg (see public iterate_words).
295  void iterate_words_rec(const WERD_CHOICE &word_so_far,
296  NODE_REF to_explore,
298 
299  // Member Variables.
304  // Variables to construct various edge masks. Formerly:
305  // #define NEXT_EDGE_MASK (inT64) 0xfffffff800000000i64
306  // #define FLAGS_MASK (inT64) 0x0000000700000000i64
307  // #define LETTER_MASK (inT64) 0x00000000ffffffffi64
314  // Level of debug statements to print to stdout.
316 };
317 
318 //
319 // DawgPosition keeps track of where we are in the primary dawg we're searching
320 // as well as where we may be in the "punctuation dawg" which may provide
321 // surrounding context.
322 //
323 // Example:
324 // punctuation dawg -- space is the "pattern character"
325 // " " // no punctuation
326 // "' '" // leading and trailing apostrophes
327 // " '" // trailing apostrophe
328 // word dawg:
329 // "cat"
330 // "cab"
331 // "cat's"
332 //
333 // DawgPosition(dawg_index, dawg_ref, punc_index, punc_ref, rtp)
334 //
335 // DawgPosition(-1, NO_EDGE, p, pe, false)
336 // We're in the punctuation dawg, no other dawg has been started.
337 // (1) If there's a pattern edge as a punc dawg child of us,
338 // for each punc-following dawg starting with ch, produce:
339 // Result: DawgPosition(k, w, p', false)
340 // (2) If there's a valid continuation in the punc dawg, produce:
341 // Result: DawgPosition(-k, NO_EDGE, p', false)
342 //
343 // DawgPosition(k, w, -1, NO_EDGE, false)
344 // We're in dawg k. Going back to punctuation dawg is not an option.
345 // Follow ch in dawg k.
346 //
347 // DawgPosition(k, w, p, pe, false)
348 // We're in dawg k. Continue in dawg k and/or go back to the punc dawg.
349 // If ending, check that the punctuation dawg is also ok to end here.
350 //
351 // DawgPosition(k, w, p, pe true)
352 // We're back in the punctuation dawg. Continuing there is the only option.
353 struct DawgPosition {
355  : dawg_index(-1), dawg_ref(NO_EDGE), punc_ref(NO_EDGE),
356  back_to_punc(false) {}
357  DawgPosition(int dawg_idx, EDGE_REF dawgref,
358  int punc_idx, EDGE_REF puncref,
359  bool backtopunc)
360  : dawg_index(dawg_idx), dawg_ref(dawgref),
361  punc_index(punc_idx), punc_ref(puncref),
362  back_to_punc(backtopunc) {
363  }
364  bool operator==(const DawgPosition &other) {
365  return dawg_index == other.dawg_index &&
366  dawg_ref == other.dawg_ref &&
367  punc_index == other.punc_index &&
368  punc_ref == other.punc_ref &&
369  back_to_punc == other.back_to_punc;
370  }
371 
376  // Have we returned to the punc dawg at the end of the word?
378 };
379 
380 class DawgPositionVector : public GenericVector<DawgPosition> {
381  public:
384  void clear() { size_used_ = 0; }
388  inline bool add_unique(const DawgPosition &new_pos,
389  bool debug,
390  const char *debug_msg) {
391  for (int i = 0; i < size_used_; ++i) {
392  if (data_[i] == new_pos) return false;
393  }
394  push_back(new_pos);
395  if (debug) {
396  tprintf("%s[%d, " REFFORMAT "] [punc: " REFFORMAT "%s]\n",
397  debug_msg, new_pos.dawg_index, new_pos.dawg_ref,
398  new_pos.punc_ref, new_pos.back_to_punc ? " returned" : "");
399  }
400  return true;
401  }
402 };
403 
404 //
411 //
412 class SquishedDawg : public Dawg {
413  public:
415  int debug_level)
416  : Dawg(type, lang, perm, debug_level) {}
417  SquishedDawg(const char *filename, DawgType type, const STRING &lang,
418  PermuterType perm, int debug_level)
419  : Dawg(type, lang, perm, debug_level) {
420  TFile file;
421  ASSERT_HOST(file.Open(filename, nullptr));
422  ASSERT_HOST(read_squished_dawg(&file));
423  num_forward_edges_in_node0 = num_forward_edges(0);
424  }
425  SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type,
426  const STRING &lang, PermuterType perm, int unicharset_size,
427  int debug_level)
428  : Dawg(type, lang, perm, debug_level),
429  edges_(edges),
430  num_edges_(num_edges) {
431  init(unicharset_size);
432  num_forward_edges_in_node0 = num_forward_edges(0);
433  if (debug_level > 3) print_all("SquishedDawg:");
434  }
435  ~SquishedDawg();
436 
437  // Loads using the given TFile. Returns false on failure.
438  bool Load(TFile *fp) {
439  if (!read_squished_dawg(fp)) return false;
440  num_forward_edges_in_node0 = num_forward_edges(0);
441  return true;
442  }
443 
444  int NumEdges() { return num_edges_; }
445 
447  EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id,
448  bool word_end) const;
449 
452  void unichar_ids_of(NODE_REF node, NodeChildVector *vec,
453  bool word_end) const {
454  EDGE_REF edge = node;
455  if (!edge_occupied(edge) || edge == NO_EDGE) return;
456  assert(forward_edge(edge)); // we don't expect any backward edges to
457  do { // be present when this function is called
458  if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {
459  vec->push_back(NodeChild(unichar_id_from_edge_rec(edges_[edge]), edge));
460  }
461  } while (!last_edge(edge++));
462  }
463 
467  return next_node_from_edge_rec((edges_[edge]));
468  }
469 
473  return end_of_word_from_edge_rec((edges_[edge_ref]));
474  }
475 
478  return unichar_id_from_edge_rec((edges_[edge_ref]));
479  }
480 
483  void print_node(NODE_REF node, int max_num_edges) const;
484 
486  void write_squished_dawg(FILE *file);
487 
490  void write_squished_dawg(const char *filename) {
491  FILE *file = fopen(filename, "wb");
492  if (file == NULL) {
493  tprintf("Error opening %s\n", filename);
494  exit(1);
495  }
496  this->write_squished_dawg(file);
497  fclose(file);
498  }
499 
500  private:
502  inline void set_next_node(EDGE_REF edge_ref, EDGE_REF value) {
503  set_next_node_in_edge_rec(&(edges_[edge_ref]), value);
504  }
506  inline void set_empty_edge(EDGE_REF edge_ref) {
507  (edges_[edge_ref] = next_node_mask_);
508  }
510  inline void clear_all_edges() {
511  for (int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge);
512  }
514  inline void clear_marker_flag(EDGE_REF edge_ref) {
515  (edges_[edge_ref] &= ~(MARKER_FLAG << flag_start_bit_));
516  }
518  inline bool forward_edge(EDGE_REF edge_ref) const {
519  return (edge_occupied(edge_ref) &&
520  (FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
521  }
523  inline bool backward_edge(EDGE_REF edge_ref) const {
524  return (edge_occupied(edge_ref) &&
525  (BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
526  }
528  inline bool edge_occupied(EDGE_REF edge_ref) const {
529  return (edges_[edge_ref] != next_node_mask_);
530  }
532  inline bool last_edge(EDGE_REF edge_ref) const {
533  return (edges_[edge_ref] & (MARKER_FLAG << flag_start_bit_)) != 0;
534  }
535 
537  inT32 num_forward_edges(NODE_REF node) const;
538 
540  bool read_squished_dawg(TFile *file);
541 
543  void print_edge(EDGE_REF edge) const;
544 
546  void print_all(const char* msg) {
547  tprintf("\n__________________________\n%s\n", msg);
548  for (int i = 0; i < num_edges_; ++i) print_edge(i);
549  tprintf("__________________________\n");
550  }
552  NODE_MAP build_node_map(inT32 *num_nodes) const;
553 
554 
555  // Member variables.
556  EDGE_ARRAY edges_;
557  inT32 num_edges_;
558  int num_forward_edges_in_node0;
559 };
560 
561 } // namespace tesseract
562 
563 #endif // DICT_DAWG_H_
virtual ~Dawg()
Definition: dawg.h:131
EDGE_REF * NODE_MAP
Definition: dawg.h:56
int debug_level_
Definition: dawg.h:315
bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg)
Definition: dawg.h:388
EDGE_REF dawg_ref
Definition: dawg.h:373
UNICHAR_ID unichar_id
Definition: dawg.h:61
int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the direction flag of this edge.
Definition: dawg.h:220
int64_t inT64
Definition: host.h:40
#define FORWARD_EDGE
Definition: dawg.h:84
uinT64 flags_mask_
Definition: dawg.h:312
int32_t inT32
Definition: host.h:38
int UNICHAR_ID
Definition: unichar.h:33
int next_node_start_bit_
Definition: dawg.h:310
#define BACKWARD_EDGE
Definition: dawg.h:85
uint64_t uinT64
Definition: host.h:41
bool end_of_word(EDGE_REF edge_ref) const
Definition: dawg.h:472
DawgType type() const
Definition: dawg.h:127
NODE_REF next_node(EDGE_REF edge) const
Definition: dawg.h:466
int push_back(T object)
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
Definition: dawg.h:225
SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
Definition: dawg.h:425
void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value)
Sets the next node link for this edge in the Dawg.
Definition: dawg.h:234
GenericVector< NodeChild > NodeChildVector
Definition: dawg.h:67
#define tprintf(...)
Definition: tprintf.h:31
Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
Definition: dawg.h:204
GenericVector< int > SuccessorList
Definition: dawg.h:68
GenericVector< SuccessorList * > SuccessorListsVector
Definition: dawg.h:69
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
Definition: dawg.h:212
uinT64 next_node_mask_
Definition: dawg.h:311
#define REFFORMAT
Definition: dawg.h:92
int flag_start_bit_
Definition: dawg.h:309
int16_t inT16
Definition: host.h:36
void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const
Definition: dawg.h:452
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT64 NODE_REF
Definition: dawg.h:55
bool operator==(const DawgPosition &other)
Definition: dawg.h:364
PermuterType permuter() const
Definition: dawg.h:129
bool Load(TFile *fp)
Definition: dawg.h:438
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
Definition: dawg.h:477
#define WERD_END_FLAG
Definition: dawg.h:89
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
Definition: dawg.h:195
Definition: strngs.h:45
const STRING & lang() const
Definition: dawg.h:128
DawgType type_
Definition: dawg.h:300
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
Definition: dawg.h:216
DawgType
Definition: dawg.h:71
SquishedDawg(const char *filename, DawgType type, const STRING &lang, PermuterType perm, int debug_level)
Definition: dawg.h:417
EDGE_REF punc_ref
Definition: dawg.h:375
int unicharset_size_
Definition: dawg.h:308
EDGE_REF edge_ref
Definition: dawg.h:62
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
Definition: dawg.h:250
NodeChild(UNICHAR_ID id, EDGE_REF ref)
Definition: dawg.h:63
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
Definition: dawg.h:271
int8_t inT8
Definition: host.h:34
const char * filename
Definition: ioapi.h:38
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
Definition: dawg.h:240
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:38
STRING lang_
Definition: dawg.h:301
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
Definition: dawg.h:184
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
Definition: dawg.h:303
#define DIRECTION_FLAG
Definition: dawg.h:88
uinT64 EDGE_RECORD
Definition: dawg.h:50
SquishedDawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
Definition: dawg.h:414
inT64 EDGE_REF
Definition: dawg.h:54
EDGE_RECORD * EDGE_ARRAY
Definition: dawg.h:53
PermuterType
Definition: ratngs.h:240
uinT64 letter_mask_
Definition: dawg.h:313
#define MARKER_FLAG
Definition: dawg.h:87
DawgPosition(int dawg_idx, EDGE_REF dawgref, int punc_idx, EDGE_REF puncref, bool backtopunc)
Definition: dawg.h:357
#define LETTER_START_BIT
Definition: dawg.h:90
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
Definition: dawg.h:229
void write_squished_dawg(const char *filename)
Definition: dawg.h:490