tesseract  4.00.00dev
recodebeam.h
Go to the documentation of this file.
1 // File: recodebeam.h
3 // Description: Beam search to decode from the re-encoded CJK as a sequence of
4 // smaller numbers in place of a single large code.
5 // Author: Ray Smith
6 // Created: Fri Mar 13 09:12:01 PDT 2015
7 //
8 // (C) Copyright 2015, Google Inc.
9 // Licensed under the Apache License, Version 2.0 (the "License");
10 // you may not use this file except in compliance with the License.
11 // You may obtain a copy of the License at
12 // http://www.apache.org/licenses/LICENSE-2.0
13 // Unless required by applicable law or agreed to in writing, software
14 // distributed under the License is distributed on an "AS IS" BASIS,
15 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16 // See the License for the specific language governing permissions and
17 // limitations under the License.
18 //
20 
21 #ifndef THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_
22 #define THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_
23 
24 #include "dawg.h"
25 #include "dict.h"
26 #include "genericheap.h"
27 #include "kdpair.h"
28 #include "networkio.h"
29 #include "ratngs.h"
30 #include "unicharcompress.h"
31 
32 namespace tesseract {
33 
34 // Enum describing what can follow the current node.
35 // Consider the following softmax outputs:
36 // Timestep 0 1 2 3 4 5 6 7 8
37 // X-score 0.01 0.55 0.98 0.42 0.01 0.01 0.40 0.95 0.01
38 // Y-score 0.00 0.01 0.01 0.01 0.01 0.97 0.59 0.04 0.01
39 // Null-score 0.99 0.44 0.01 0.57 0.98 0.02 0.01 0.01 0.98
40 // Then the correct CTC decoding (in which adjacent equal classes are folded,
41 // and then all nulls are dropped) is clearly XYX, but simple decoding (taking
42 // the max at each timestep) leads to:
43 // Null@0.99 X@0.55 X@0.98 Null@0.57 Null@0.98 Y@0.97 Y@0.59 X@0.95 Null@0.98,
44 // which folds to the correct XYX. The conversion to Tesseract rating and
45 // certainty uses the sum of the log probs (log of the product of probabilities)
46 // for the Rating and the minimum log prob for the certainty, but that yields a
47 // minimum certainty of log(0.55), which is poor for such an obvious case.
48 // CTC says that the probability of the result is the SUM of the products of the
49 // probabilities over ALL PATHS that decode to the same result, which includes:
50 // NXXNNYYXN, NNXNNYYN, NXXXNYYXN, NNXXNYXXN, and others including XXXXXYYXX.
51 // That is intractable, so some compromise between simple and ideal is needed.
52 // Observing that evenly split timesteps rarely happen next to each other, we
53 // allow scores at a transition between classes to be added for decoding thus:
54 // N@0.99 (N+X)@0.99 X@0.98 (N+X)@0.99 N@0.98 Y@0.97 (X+Y+N)@1.00 X@0.95 N@0.98.
55 // This works because NNX and NXX both decode to X, so in the middle we can use
56 // N+X. Note that the classes either side of a sum must stand alone, i.e. use a
57 // single score, to force all paths to pass through them and decode to the same
58 // result. Also in the special case of a transition from X to Y, with only one
59 // timestep between, it is possible to add X+Y+N, since XXY, XYY, and XNY all
60 // decode to XY.
61 // An important condition is that we cannot combine X and Null between two
62 // stand-alone Xs, since that can decode as XNX->XX or XXX->X, so the scores for
63 // X and Null have to go in separate paths. Combining scores in this way
64 // provides a much better minimum certainty of log(0.95).
65 // In the implementation of the beam search, we have to place the possibilities
66 // X, X+N and X+Y+N in the beam under appropriate conditions of the previous
67 // node, and constrain what can follow, to enforce the rules explained above.
68 // We therefore have 3 different types of node determined by what can follow:
70  NC_ANYTHING, // This node used just its own score, so anything can follow.
71  NC_ONLY_DUP, // The current node combined another score with the score for
72  // itself, without a stand-alone duplicate before, so must be
73  // followed by a stand-alone duplicate.
74  NC_NO_DUP, // The current node combined another score with the score for
75  // itself, after a stand-alone, so can only be followed by
76  // something other than a duplicate of the current node.
78 };
79 
80 // Enum describing the top-n status of a code.
81 enum TopNState {
82  TN_TOP2, // Winner or 2nd.
83  TN_TOPN, // Runner up in top-n, but not 1st or 2nd.
84  TN_ALSO_RAN, // Not in the top-n.
86 };
87 
88 // Lattice element for Re-encode beam search.
89 struct RecodeNode {
91  : code(-1),
92  unichar_id(INVALID_UNICHAR_ID),
94  start_of_dawg(false),
95  start_of_word(false),
96  end_of_word(false),
97  duplicate(false),
98  certainty(0.0f),
99  score(0.0f),
100  prev(NULL),
101  dawgs(NULL),
102  code_hash(0) {}
103  RecodeNode(int c, int uni_id, PermuterType perm, bool dawg_start,
104  bool word_start, bool end, bool dup, float cert, float s,
105  const RecodeNode* p, DawgPositionVector* d, uinT64 hash)
106  : code(c),
107  unichar_id(uni_id),
108  permuter(perm),
109  start_of_dawg(dawg_start),
110  start_of_word(word_start),
111  end_of_word(end),
112  duplicate(dup),
113  certainty(cert),
114  score(s),
115  prev(p),
116  dawgs(d),
117  code_hash(hash) {}
118  // NOTE: If we could use C++11, then this would be a move constructor.
119  // Instead we have copy constructor that does a move!! This is because we
120  // don't want to copy the whole DawgPositionVector each time, and true
121  // copying isn't necessary for this struct. It does get moved around a lot
122  // though inside the heap and during heap push, hence the move semantics.
123  RecodeNode(RecodeNode& src) : dawgs(NULL) {
124  *this = src;
125  ASSERT_HOST(src.dawgs == NULL);
126  }
128  delete dawgs;
129  memcpy(this, &src, sizeof(src));
130  src.dawgs = NULL;
131  return *this;
132  }
133  ~RecodeNode() { delete dawgs; }
134  // Prints details of the node.
135  void Print(int null_char, const UNICHARSET& unicharset, int depth) const;
136 
137  // The re-encoded code here = index to network output.
138  int code;
139  // The decoded unichar_id is only valid for the final code of a sequence.
141  // The type of permuter active at this point. Intervals between start_of_word
142  // and end_of_word make valid words of type given by permuter where
143  // end_of_word is true. These aren't necessarily delimited by spaces.
145  // True if this is the initial dawg state. May be attached to a space or,
146  // in a non-space-delimited lang, the end of the previous word.
148  // True if this is the first node in a dictionary word.
150  // True if this represents a valid candidate end of word position. Does not
151  // necessarily mark the end of a word, since a word can be extended beyond a
152  // candidate end by a continuation, eg 'the' continues to 'these'.
154  // True if this->code is a duplicate of prev->code. Some training modes
155  // allow the network to output duplicate characters and crush them with CTC,
156  // but that would mess up the dictionary search, so we just smash them
157  // together on the fly using the duplicate flag.
158  bool duplicate;
159  // Certainty (log prob) of (just) this position.
160  float certainty;
161  // Total certainty of the path to this position.
162  float score;
163  // The previous node in this chain. Borrowed pointer.
164  const RecodeNode* prev;
165  // The currently active dawgs at this position. Owned pointer.
167  // A hash of all codes in the prefix and this->code as well. Used for
168  // duplicate path removal.
170 };
171 
174 
175 // Class that holds the entire beam search for recognition of a text line.
177  public:
178  // Borrows the pointer, which is expected to survive until *this is deleted.
179  RecodeBeamSearch(const UnicharCompress& recoder, int null_char,
180  bool simple_text, Dict* dict);
181 
182  // Decodes the set of network outputs, storing the lattice internally.
183  // If charset is not null, it enables detailed debugging of the beam search.
184  void Decode(const NetworkIO& output, double dict_ratio, double cert_offset,
185  double worst_dict_cert, const UNICHARSET* charset);
186  void Decode(const GENERIC_2D_ARRAY<float>& output, double dict_ratio,
187  double cert_offset, double worst_dict_cert,
188  const UNICHARSET* charset);
189 
190  // Returns the best path as labels/scores/xcoords similar to simple CTC.
191  void ExtractBestPathAsLabels(GenericVector<int>* labels,
192  GenericVector<int>* xcoords) const;
193  // Returns the best path as unichar-ids/certs/ratings/xcoords skipping
194  // duplicates, nulls and intermediate parts.
195  void ExtractBestPathAsUnicharIds(bool debug, const UNICHARSET* unicharset,
196  GenericVector<int>* unichar_ids,
197  GenericVector<float>* certs,
198  GenericVector<float>* ratings,
199  GenericVector<int>* xcoords) const;
200 
201  // Returns the best path as a set of WERD_RES.
202  void ExtractBestPathAsWords(const TBOX& line_box, float scale_factor,
203  bool debug, const UNICHARSET* unicharset,
204  PointerVector<WERD_RES>* words);
205 
206  // Generates debug output of the content of the beams after a Decode.
207  void DebugBeams(const UNICHARSET& unicharset) const;
208 
209  // Clipping value for certainty inside Tesseract. Reflects the minimum value
210  // of certainty that will be returned by ExtractBestPathAsUnicharIds.
211  // Supposedly on a uniform scale that can be compared across languages and
212  // engines.
213  static const float kMinCertainty;
214  // Number of different code lengths for which we have a separate beam.
215  static const int kNumLengths = RecodedCharID::kMaxCodeLen + 1;
216  // Total number of beams: dawg/nodawg * number of NodeContinuation * number
217  // of different lengths.
218  static const int kNumBeams = 2 * NC_COUNT * kNumLengths;
219  // Returns the relevant factor in the beams_ index.
220  static int LengthFromBeamsIndex(int index) { return index % kNumLengths; }
222  return static_cast<NodeContinuation>((index / kNumLengths) % NC_COUNT);
223  }
224  static bool IsDawgFromBeamsIndex(int index) {
225  return index / (kNumLengths * NC_COUNT) > 0;
226  }
227  // Computes a beams_ index from the given factors.
228  static int BeamIndex(bool is_dawg, NodeContinuation cont, int length) {
229  return (is_dawg * NC_COUNT + cont) * kNumLengths + length;
230  }
231 
232  private:
233  // Struct for the Re-encode beam search. This struct holds the data for
234  // a single time-step position of the output. Use a PointerVector<RecodeBeam>
235  // to hold all the timesteps and prevent reallocation of the individual heaps.
236  struct RecodeBeam {
237  // Resets to the initial state without deleting all the memory.
238  void Clear() {
239  for (int i = 0; i < kNumBeams; ++i) {
240  beams_[i].clear();
241  }
242  RecodeNode empty;
243  for (int i = 0; i < NC_COUNT; ++i) {
244  best_initial_dawgs_[i] = empty;
245  }
246  }
247 
248  // A separate beam for each combination of code length,
249  // NodeContinuation, and dictionary flag. Separating out all these types
250  // allows the beam to be quite narrow, and yet still have a low chance of
251  // losing the best path.
252  // We have to keep all these beams separate, since the highest scoring paths
253  // come from the paths that are most likely to dead-end at any time, like
254  // dawg paths, NC_ONLY_DUP etc.
255  // Each heap is stored with the WORST result at the top, so we can quickly
256  // get the top-n values.
257  RecodeHeap beams_[kNumBeams];
258  // While the language model is only a single word dictionary, we can use
259  // word starts as a choke point in the beam, and keep only a single dict
260  // start node at each step (for each NodeContinuation type), so we find the
261  // best one here and push it on the heap, if it qualifies, after processing
262  // all of the step.
263  RecodeNode best_initial_dawgs_[NC_COUNT];
264  };
266 
267  // Generates debug output of the content of a single beam position.
268  void DebugBeamPos(const UNICHARSET& unicharset, const RecodeHeap& heap) const;
269 
270  // Returns the given best_nodes as unichar-ids/certs/ratings/xcoords skipping
271  // duplicates, nulls and intermediate parts.
272  static void ExtractPathAsUnicharIds(
273  const GenericVector<const RecodeNode*>& best_nodes,
274  GenericVector<int>* unichar_ids, GenericVector<float>* certs,
275  GenericVector<float>* ratings, GenericVector<int>* xcoords);
276 
277  // Sets up a word with the ratings matrix and fake blobs with boxes in the
278  // right places.
279  WERD_RES* InitializeWord(bool leading_space, const TBOX& line_box,
280  int word_start, int word_end, float space_certainty,
281  const UNICHARSET* unicharset,
282  const GenericVector<int>& xcoords,
283  float scale_factor);
284 
285  // Fills top_n_flags_ with bools that are true iff the corresponding output
286  // is one of the top_n.
287  void ComputeTopN(const float* outputs, int num_outputs, int top_n);
288 
289  // Adds the computation for the current time-step to the beam. Call at each
290  // time-step in sequence from left to right. outputs is the activation vector
291  // for the current timestep.
292  void DecodeStep(const float* outputs, int t, double dict_ratio,
293  double cert_offset, double worst_dict_cert,
294  const UNICHARSET* charset);
295 
296  // Adds to the appropriate beams the legal (according to recoder)
297  // continuations of context prev, which is from the given index to beams_,
298  // using the given network outputs to provide scores to the choices. Uses only
299  // those choices for which top_n_flags[code] == top_n_flag.
300  void ContinueContext(const RecodeNode* prev, int index, const float* outputs,
301  TopNState top_n_flag, double dict_ratio,
302  double cert_offset, double worst_dict_cert,
303  RecodeBeam* step);
304  // Continues for a new unichar, using dawg or non-dawg as per flag.
305  void ContinueUnichar(int code, int unichar_id, float cert,
306  float worst_dict_cert, float dict_ratio, bool use_dawgs,
307  NodeContinuation cont, const RecodeNode* prev,
308  RecodeBeam* step);
309  // Adds a RecodeNode composed of the args to the correct heap in step if
310  // unichar_id is a valid dictionary continuation of whatever is in prev.
311  void ContinueDawg(int code, int unichar_id, float cert, NodeContinuation cont,
312  const RecodeNode* prev, RecodeBeam* step);
313  // Sets the correct best_initial_dawgs_ with a RecodeNode composed of the args
314  // if better than what is already there.
315  void PushInitialDawgIfBetter(int code, int unichar_id, PermuterType permuter,
316  bool start, bool end, float cert,
317  NodeContinuation cont, const RecodeNode* prev,
318  RecodeBeam* step);
319  // Adds a RecodeNode composed of the args to the correct heap in step for
320  // partial unichar or duplicate if there is room or if better than the
321  // current worst element if already full.
322  void PushDupOrNoDawgIfBetter(int length, bool dup, int code, int unichar_id,
323  float cert, float worst_dict_cert,
324  float dict_ratio, bool use_dawgs,
325  NodeContinuation cont, const RecodeNode* prev,
326  RecodeBeam* step);
327  // Adds a RecodeNode composed of the args to the correct heap in step if there
328  // is room or if better than the current worst element if already full.
329  void PushHeapIfBetter(int max_size, int code, int unichar_id,
330  PermuterType permuter, bool dawg_start, bool word_start,
331  bool end, bool dup, float cert, const RecodeNode* prev,
332  DawgPositionVector* d, RecodeHeap* heap);
333  // Adds a RecodeNode to heap if there is room
334  // or if better than the current worst element if already full.
335  void PushHeapIfBetter(int max_size, RecodeNode* node, RecodeHeap* heap);
336  // Searches the heap for an entry matching new_node, and updates the entry
337  // with reshuffle if needed. Returns true if there was a match.
338  bool UpdateHeapIfMatched(RecodeNode* new_node, RecodeHeap* heap);
339  // Computes and returns the code-hash for the given code and prev.
340  uinT64 ComputeCodeHash(int code, bool dup, const RecodeNode* prev) const;
341  // Backtracks to extract the best path through the lattice that was built
342  // during Decode. On return the best_nodes vector essentially contains the set
343  // of code, score pairs that make the optimal path with the constraint that
344  // the recoder can decode the code sequence back to a sequence of unichar-ids.
345  void ExtractBestPaths(GenericVector<const RecodeNode*>* best_nodes,
346  GenericVector<const RecodeNode*>* second_nodes) const;
347  // Helper backtracks through the lattice from the given node, storing the
348  // path and reversing it.
349  void ExtractPath(const RecodeNode* node,
351  // Helper prints debug information on the given lattice path.
352  void DebugPath(const UNICHARSET* unicharset,
353  const GenericVector<const RecodeNode*>& path) const;
354  // Helper prints debug information on the given unichar path.
355  void DebugUnicharPath(const UNICHARSET* unicharset,
357  const GenericVector<int>& unichar_ids,
358  const GenericVector<float>& certs,
359  const GenericVector<float>& ratings,
360  const GenericVector<int>& xcoords) const;
361 
362  static const int kBeamWidths[RecodedCharID::kMaxCodeLen + 1];
363 
364  // The encoder/decoder that we will be using.
365  const UnicharCompress& recoder_;
366  // The beam for each timestep in the output.
368  // The number of timesteps valid in beam_;
369  int beam_size_;
370  // A flag to indicate which outputs are the top-n choices. Current timestep
371  // only.
372  GenericVector<TopNState> top_n_flags_;
373  // A record of the highest and second scoring codes.
374  int top_code_;
375  int second_code_;
376  // Heap used to compute the top_n_flags_.
377  GenericHeap<TopPair> top_heap_;
378  // Borrowed pointer to the dictionary to use in the search.
379  Dict* dict_;
380  // True if the language is space-delimited, which is true for most languages
381  // except chi*, jpn, tha.
382  bool space_delimited_;
383  // True if the input is simple text, ie adjacent equal chars are not to be
384  // eliminated.
385  bool is_simple_text_;
386  // The encoded (class label) of the null/reject character.
387  int null_char_;
388 };
389 
390 } // namespace tesseract.
391 
392 #endif // THIRD_PARTY_TESSERACT_LSTM_RECODEBEAM_H_
static bool IsDawgFromBeamsIndex(int index)
Definition: recodebeam.h:224
RecodeNode(int c, int uni_id, PermuterType perm, bool dawg_start, bool word_start, bool end, bool dup, float cert, float s, const RecodeNode *p, DawgPositionVector *d, uinT64 hash)
Definition: recodebeam.h:103
static const float kMinCertainty
Definition: recodebeam.h:213
uint64_t uinT64
Definition: host.h:41
static const int kMaxCodeLen
void Print(int null_char, const UNICHARSET &unicharset, int depth) const
Definition: recodebeam.cpp:42
static int LengthFromBeamsIndex(int index)
Definition: recodebeam.h:220
GenericHeap< RecodePair > RecodeHeap
Definition: recodebeam.h:173
#define ASSERT_HOST(x)
Definition: errcode.h:84
RecodeNode(RecodeNode &src)
Definition: recodebeam.h:123
RecodeNode & operator=(RecodeNode &src)
Definition: recodebeam.h:127
const RecodeNode * prev
Definition: recodebeam.h:164
static NodeContinuation ContinuationFromBeamsIndex(int index)
Definition: recodebeam.h:221
static int BeamIndex(bool is_dawg, NodeContinuation cont, int length)
Definition: recodebeam.h:228
Definition: rect.h:30
KDPairInc< double, RecodeNode > RecodePair
Definition: recodebeam.h:172
NodeContinuation
Definition: recodebeam.h:69
PermuterType permuter
Definition: recodebeam.h:144
DawgPositionVector * dawgs
Definition: recodebeam.h:166
PermuterType
Definition: ratngs.h:240