41 #define NO_EDGE (inT64) 0xffffffffffffffffi64 44 #define NO_EDGE (inT64) 0xffffffffffffffffll 64 NodeChild(): unichar_id(INVALID_UNICHAR_ID), edge_ref(NO_EDGE) {}
84 #define FORWARD_EDGE (inT32) 0 85 #define BACKWARD_EDGE (inT32) 1 86 #define MAX_NODE_EDGES_DISPLAY (inT64) 100 87 #define MARKER_FLAG (inT64) 1 88 #define DIRECTION_FLAG (inT64) 2 89 #define WERD_END_FLAG (inT64) 4 90 #define LETTER_START_BIT 0 91 #define NUM_FLAG_BITS 3 92 #define REFFORMAT "%" PRId64 101 static const char kWildcard[] =
"*";
121 static const inT16 kDawgMagicNumber = 42;
138 bool prefix_in_dawg(
const WERD_CHOICE &prefix,
bool requires_complete)
const;
142 int check_for_words(
const char *
filename,
144 bool enable_wildcard)
const;
148 void iterate_words(
const UNICHARSET &unicharset,
153 void iterate_words(
const UNICHARSET &unicharset,
160 bool word_end)
const = 0;
164 virtual void unichar_ids_of(
NODE_REF node, NodeChildVector *vec,
165 bool word_end)
const = 0;
173 virtual bool end_of_word(
EDGE_REF edge_ref)
const = 0;
180 virtual void print_node(
NODE_REF node,
int max_num_edges)
const = 0;
209 debug_level_(debug_level) {}
213 return ((edge_rec & next_node_mask_) >> next_node_start_bit_);
217 return (edge_rec & (
MARKER_FLAG << flag_start_bit_)) != 0;
236 *edge_rec &= (~next_node_mask_);
237 *edge_rec |= ((value << next_node_start_bit_) & next_node_mask_);
254 UNICHAR_ID curr_unichar_id = unichar_id_from_edge_rec(edge_rec);
255 NODE_REF curr_next_node = next_node_from_edge_rec(edge_rec);
256 bool curr_word_end = end_of_word_from_edge_rec(edge_rec);
257 if (edge_rec_match(next_node, word_end, unichar_id, curr_next_node,
258 curr_word_end, curr_unichar_id))
return 0;
259 if (unichar_id > curr_unichar_id)
return 1;
260 if (unichar_id == curr_unichar_id) {
261 if (next_node > curr_next_node)
return 1;
262 if (next_node == curr_next_node) {
263 if (word_end > curr_word_end)
return 1;
277 return ((unichar_id == other_unichar_id) &&
278 (next_node == NO_EDGE || next_node == other_next_node) &&
279 (!word_end || (word_end == other_word_end)));
284 void init(
int unicharset_size);
295 void iterate_words_rec(
const WERD_CHOICE &word_so_far,
355 : dawg_index(-1), dawg_ref(NO_EDGE), punc_ref(NO_EDGE),
356 back_to_punc(false) {}
360 : dawg_index(dawg_idx), dawg_ref(dawgref),
361 punc_index(punc_idx), punc_ref(puncref),
362 back_to_punc(backtopunc) {
390 const char *debug_msg) {
391 for (
int i = 0; i < size_used_; ++i) {
392 if (data_[i] == new_pos)
return false;
416 :
Dawg(type, lang, perm, debug_level) {}
419 :
Dawg(type, lang, perm, debug_level) {
423 num_forward_edges_in_node0 = num_forward_edges(0);
428 :
Dawg(type, lang, perm, debug_level),
430 num_edges_(num_edges) {
431 init(unicharset_size);
432 num_forward_edges_in_node0 = num_forward_edges(0);
433 if (debug_level > 3) print_all(
"SquishedDawg:");
439 if (!read_squished_dawg(fp))
return false;
440 num_forward_edges_in_node0 = num_forward_edges(0);
448 bool word_end)
const;
453 bool word_end)
const {
455 if (!edge_occupied(edge) || edge == NO_EDGE)
return;
456 assert(forward_edge(edge));
458 if (!word_end || end_of_word_from_edge_rec(edges_[edge])) {
461 }
while (!last_edge(edge++));
467 return next_node_from_edge_rec((edges_[edge]));
473 return end_of_word_from_edge_rec((edges_[edge_ref]));
478 return unichar_id_from_edge_rec((edges_[edge_ref]));
483 void print_node(
NODE_REF node,
int max_num_edges)
const;
486 void write_squished_dawg(FILE *file);
491 FILE *file = fopen(filename,
"wb");
493 tprintf(
"Error opening %s\n", filename);
496 this->write_squished_dawg(file);
503 set_next_node_in_edge_rec(&(edges_[edge_ref]), value);
506 inline void set_empty_edge(
EDGE_REF edge_ref) {
507 (edges_[
edge_ref] = next_node_mask_);
510 inline void clear_all_edges() {
511 for (
int edge = 0; edge < num_edges_; edge++) set_empty_edge(edge);
514 inline void clear_marker_flag(
EDGE_REF edge_ref) {
518 inline bool forward_edge(
EDGE_REF edge_ref)
const {
519 return (edge_occupied(edge_ref) &&
520 (
FORWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
523 inline bool backward_edge(
EDGE_REF edge_ref)
const {
524 return (edge_occupied(edge_ref) &&
525 (
BACKWARD_EDGE == direction_from_edge_rec(edges_[edge_ref])));
528 inline bool edge_occupied(
EDGE_REF edge_ref)
const {
529 return (edges_[edge_ref] != next_node_mask_);
532 inline bool last_edge(
EDGE_REF edge_ref)
const {
533 return (edges_[edge_ref] & (
MARKER_FLAG << flag_start_bit_)) != 0;
540 bool read_squished_dawg(
TFile *file);
543 void print_edge(
EDGE_REF edge)
const;
546 void print_all(
const char* msg) {
547 tprintf(
"\n__________________________\n%s\n", msg);
548 for (
int i = 0; i < num_edges_; ++i) print_edge(i);
549 tprintf(
"__________________________\n");
558 int num_forward_edges_in_node0;
563 #endif // DICT_DAWG_H_
bool add_unique(const DawgPosition &new_pos, bool debug, const char *debug_msg)
int direction_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the direction flag of this edge.
bool end_of_word(EDGE_REF edge_ref) const
NODE_REF next_node(EDGE_REF edge) const
bool end_of_word_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns true if this edge marks the end of a word.
SquishedDawg(EDGE_ARRAY edges, int num_edges, DawgType type, const STRING &lang, PermuterType perm, int unicharset_size, int debug_level)
void set_next_node_in_edge_rec(EDGE_RECORD *edge_rec, EDGE_REF value)
Sets the next node link for this edge in the Dawg.
GenericVector< NodeChild > NodeChildVector
Dawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
GenericVector< int > SuccessorList
GenericVector< SuccessorList * > SuccessorListsVector
NODE_REF next_node_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the next node visited by following this edge.
void unichar_ids_of(NODE_REF node, NodeChildVector *vec, bool word_end) const
bool operator==(const DawgPosition &other)
PermuterType permuter() const
UNICHAR_ID edge_letter(EDGE_REF edge_ref) const
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
virtual EDGE_REF pattern_loop_edge(EDGE_REF edge_ref, UNICHAR_ID unichar_id, bool word_end) const
const STRING & lang() const
bool marker_flag_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns the marker flag of this edge.
SquishedDawg(const char *filename, DawgType type, const STRING &lang, PermuterType perm, int debug_level)
int given_greater_than_edge_rec(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, const EDGE_RECORD &edge_rec) const
NodeChild(UNICHAR_ID id, EDGE_REF ref)
bool edge_rec_match(NODE_REF next_node, bool word_end, UNICHAR_ID unichar_id, NODE_REF other_next_node, bool other_word_end, UNICHAR_ID other_unichar_id) const
void set_marker_flag_in_edge_rec(EDGE_RECORD *edge_rec)
Sets this edge record to be the last one in a sequence of edges.
bool Open(const STRING &filename, FileReader reader)
virtual void unichar_id_to_patterns(UNICHAR_ID unichar_id, const UNICHARSET &unicharset, GenericVector< UNICHAR_ID > *vec) const
PermuterType perm_
Permuter code that should be used if the word is found in this Dawg.
SquishedDawg(DawgType type, const STRING &lang, PermuterType perm, int debug_level)
DawgPosition(int dawg_idx, EDGE_REF dawgref, int punc_idx, EDGE_REF puncref, bool backtopunc)
UNICHAR_ID unichar_id_from_edge_rec(const EDGE_RECORD &edge_rec) const
Returns UNICHAR_ID recorded in this edge.
void write_squished_dawg(const char *filename)