tesseract  4.00.00dev
control.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: control.cpp (Formerly control.c)
3  * Description: Module-independent matcher controller.
4  * Author: Ray Smith
5  * Created: Thu Apr 23 11:09:58 BST 1992
6  * ReHacked: Tue Sep 22 08:42:49 BST 1992 Phil Cheatle
7  *
8  * (C) Copyright 1992, Hewlett-Packard Ltd.
9  ** Licensed under the Apache License, Version 2.0 (the "License");
10  ** you may not use this file except in compliance with the License.
11  ** You may obtain a copy of the License at
12  ** http://www.apache.org/licenses/LICENSE-2.0
13  ** Unless required by applicable law or agreed to in writing, software
14  ** distributed under the License is distributed on an "AS IS" BASIS,
15  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  ** See the License for the specific language governing permissions and
17  ** limitations under the License.
18  *
19  **********************************************************************/
20 
21 // Include automatically generated configuration file if running autoconf.
22 #ifdef HAVE_CONFIG_H
23 #include "config_auto.h"
24 #endif
25 
26 #include <string.h>
27 #include <math.h>
28 #ifdef __UNIX__
29 #include <assert.h>
30 #include <unistd.h>
31 #include <errno.h>
32 #endif
33 #include <ctype.h>
34 #include "callcpp.h"
35 #include "control.h"
36 #include "docqual.h"
37 #include "drawfx.h"
38 #include "fixspace.h"
39 #include "globals.h"
40 #include "lstmrecognizer.h"
41 #include "ocrclass.h"
42 #include "output.h"
43 #include "pgedit.h"
44 #include "reject.h"
45 #include "sorthelper.h"
46 #include "tessbox.h"
47 #include "tesseractclass.h"
48 #include "tessvars.h"
49 #include "werdit.h"
50 
51 #define MIN_FONT_ROW_COUNT 8
52 #define MAX_XHEIGHT_DIFF 3
53 
54 const char* const kBackUpConfigFile = "tempconfigdata.config";
55 // Min believable x-height for any text when refitting as a fraction of
56 // original x-height
57 const double kMinRefitXHeightFraction = 0.5;
58 
59 
66 namespace tesseract {
68  TBOX &selection_box) {
69  PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
70  if (it != NULL) {
72  it->DeleteCurrentWord();
73  delete it;
74  }
75 }
76 
83  inT16 char_qual;
84  inT16 good_char_qual;
85 
86  WordData word_data(*pr_it);
87  SetupWordPassN(2, &word_data);
88  // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
89  if (lstm_recognizer_ == NULL) {
90  classify_word_and_language(2, pr_it, &word_data);
91  } else {
92  classify_word_and_language(1, pr_it, &word_data);
93  }
95  WERD_RES* word_res = pr_it->word();
96  word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual);
97  tprintf("\n%d chars; word_blob_quality: %d; outline_errs: %d; "
98  "char_quality: %d; good_char_quality: %d\n",
99  word_res->reject_map.length(),
100  word_blob_quality(word_res, pr_it->row()->row),
101  word_outline_errs(word_res), char_qual, good_char_qual);
102  }
103  return TRUE;
104 }
105 
106 // Helper function to check for a target word and handle it appropriately.
107 // Inspired by Jetsoft's requirement to process only single words on pass2
108 // and beyond.
109 // If word_config is not null:
110 // If the word_box and target_word_box overlap, read the word_config file
111 // else reset to previous config data.
112 // return true.
113 // else
114 // If the word_box and target_word_box overlap or pass <= 1, return true.
115 // Note that this function uses a fixed temporary file for storing the previous
116 // configs, so it is neither thread-safe, nor process-safe, but the assumption
117 // is that it will only be used for one debug window at a time.
118 //
119 // Since this function is used for debugging (and not to change OCR results)
120 // set only debug params from the word config file.
//
// Parameters:
//   word_box        - bounding box of the word currently being considered.
//   target_word_box - bounding box of the word the caller is interested in.
//   word_config     - params file to apply while on the target word, or NULL.
//   pass            - pass number; only consulted when word_config is NULL.
// Returns false only when word_config is NULL, pass > 1 and the two boxes do
// not have a major overlap; returns true in every other case.
// backup_config_file_ doubles as the "override is active" flag: it is non-NULL
// exactly while the backed-up params are waiting to be restored.
121 bool Tesseract::ProcessTargetWord(const TBOX& word_box,
122  const TBOX& target_word_box,
123  const char* word_config,
124  int pass) {
125  if (word_config != NULL) {
126  if (word_box.major_overlap(target_word_box)) {
    // Only the first overlapping word saves the params and reads word_config;
    // while the override is already active this branch does nothing.
127  if (backup_config_file_ == NULL) {
128  backup_config_file_ = kBackUpConfigFile;
    // NOTE(review): the fopen() result is passed to PrintParams() and fclose()
    // without a NULL check, so an unwritable working directory is not handled.
129  FILE* config_fp = fopen(backup_config_file_, "wb");
130  ParamUtils::PrintParams(config_fp, params());
131  fclose(config_fp);
    // NOTE(review): the listing numbering jumps from 132 to 134 here, so one
    // argument line of this call is not visible in this view.
132  ParamUtils::ReadParamsFile(word_config,
134  params());
135  }
136  } else {
    // Left the target word: restore the saved params and clear the flag.
137  if (backup_config_file_ != NULL) {
    // NOTE(review): the listing numbering jumps from 138 to 140 here as well.
138  ParamUtils::ReadParamsFile(backup_config_file_,
140  params());
141  backup_config_file_ = NULL;
142  }
143  }
144  } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
    // No config file: beyond pass 1, reject every word but the target.
145  return false;
146  }
147  return true;
148 }
149 
152  const TBOX* target_word_box,
153  const char* word_config,
154  PAGE_RES* page_res,
155  GenericVector<WordData>* words) {
156  // Prepare all the words.
157  PAGE_RES_IT page_res_it(page_res);
158  for (page_res_it.restart_page(); page_res_it.word() != NULL;
159  page_res_it.forward()) {
160  if (target_word_box == NULL ||
161  ProcessTargetWord(page_res_it.word()->word->bounding_box(),
162  *target_word_box, word_config, 1)) {
163  words->push_back(WordData(page_res_it));
164  }
165  }
166  // Setup all the words for recognition with polygonal approximation.
167  for (int w = 0; w < words->size(); ++w) {
168  SetupWordPassN(pass_n, &(*words)[w]);
169  if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];
170  }
171 }
172 
173 // Sets up the single word ready for whichever engine is to be run.
// Parameters:
//   pass_n - recognition pass being prepared (1 and 2 get special handling).
//   word   - the WordData whose word and lang_words are (re)initialised.
// Nothing is done when pass_n != 1 and the word is already marked done.
// On entry to the work, word->lang_words is emptied and refilled with one
// fresh WERD_RES per sub-language plus a final one for this (master) language.
174 void Tesseract::SetupWordPassN(int pass_n, WordData* word) {
175  if (pass_n == 1 || !word->word->done) {
176  if (pass_n == 1) {
    // NOTE(review): the listing numbering jumps from 177 to 182, so the middle
    // arguments of this call are not visible in this view.
177  word->word->SetupForRecognition(unicharset, this, BestPix(),
182  word->row, word->block);
183  } else if (pass_n == 2) {
184  // TODO(rays) Should we do this on pass1 too?
185  word->word->caps_height = 0.0;
    // Fall back to the row's x-height when the word has none of its own.
186  if (word->word->x_height == 0.0f)
187  word->word->x_height = word->row->x_height();
188  }
    // Presumably truncate(0) also releases the WERD_RES objects left from an
    // earlier pass - TODO confirm against the lang_words container type.
189  word->lang_words.truncate(0);
    // Note the <= : the loop runs sub_langs_.size() + 1 times.
190  for (int s = 0; s <= sub_langs_.size(); ++s) {
191  // The sub_langs_.size() entry is for the master language.
192  Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
    // The raw pointer is handed straight to lang_words below; no other owner
    // is visible in this function.
193  WERD_RES* word_res = new WERD_RES;
194  word_res->InitForRetryRecognition(*word->word);
195  word->lang_words.push_back(word_res);
196  // LSTM doesn't get setup for pass2.
197  if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
    // NOTE(review): listing line 201 (one argument line) is not visible here.
198  word_res->SetupForRecognition(
199  lang_t->unicharset, lang_t, BestPix(),
200  lang_t->tessedit_ocr_engine_mode, NULL,
202  lang_t->textord_use_cjk_fp_model,
203  lang_t->poly_allow_detailed_fx, word->row, word->block);
204  }
205  }
206  }
207 }
208 
209 // Runs word recognition on all the words.
// Parameters:
//   pass_n  - pass number; forwarded to the per-word calls and used to pick
//             the progress range reported to the monitor.
//   monitor - optional progress/cancel/deadline interface, may be NULL.
//   pr_it   - page iterator; restarted here and advanced in step with words.
//   words   - the prepared WordData entries to recognise, in page order.
// Returns false if the monitor's deadline was exceeded or its cancel callback
// returned true (the remaining words are then set up as fakes); returns true
// once every word has been visited.
210 bool Tesseract::RecogAllWordsPassN(int pass_n, ETEXT_DESC* monitor,
211  PAGE_RES_IT* pr_it,
212  GenericVector<WordData>* words) {
213  // TODO(rays) Before this loop can be parallelized (it would yield a massive
214  // speed-up) all remaining member globals need to be converted to local/heap
215  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
216  // added. The results will be significantly different with adaption on, and
217  // deterioration will need investigation.
218  pr_it->restart_page();
219  for (int w = 0; w < words->size(); ++w) {
220  WordData* word = &(*words)[w];
221  if (w > 0) word->prev_word = &(*words)[w - 1];
222  if (monitor != NULL) {
223  monitor->ocr_alive = TRUE;
    // Pass 1 reports progress in [0, 70); any other pass reports [70, 100).
224  if (pass_n == 1) {
225  monitor->progress = 70 * w / words->size();
226  if (monitor->progress_callback != NULL) {
    // The box comes from pr_it's current word; pr_it is only synced to
    // words[w] further down, after this callback has been made.
227  TBOX box = pr_it->word()->word->bounding_box();
228  (*monitor->progress_callback)(monitor->progress, box.left(),
229  box.right(), box.top(), box.bottom());
230  }
231  } else {
232  monitor->progress = 70 + 30 * w / words->size();
233  if (monitor->progress_callback != NULL) {
234  (*monitor->progress_callback)(monitor->progress, 0, 0, 0, 0);
235  }
236  }
    // The cancel callback is given cancel_this and the total word count.
237  if (monitor->deadline_exceeded() ||
238  (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
239  words->size()))) {
240  // Timeout. Fake out the rest of the words.
241  for (; w < words->size(); ++w) {
242  (*words)[w].word->SetupFake(unicharset);
243  }
244  return false;
245  }
246  }
247  if (word->word->tess_failed) {
    // s ends up as the index of the first language result that did not fail,
    // or lang_words.size() when every one of them failed.
248  int s;
249  for (s = 0; s < word->lang_words.size() &&
250  word->lang_words[s]->tess_failed; ++s) {}
251  // If all are failed, skip it. Image words are skipped by this test.
    // NOTE(review): the loop above leaves s <= lang_words.size(), so the '>'
    // test below can never be true and the skip never happens; '==' looks
    // like what the comment above describes - confirm before changing.
252  if (s > word->lang_words.size()) continue;
253  }
254  // Sync pr_it with the wth WordData.
255  while (pr_it->word() != NULL && pr_it->word() != word->word)
256  pr_it->forward();
257  ASSERT_HOST(pr_it->word() != NULL);
258  bool make_next_word_fuzzy = false;
    // Diacritic reassignment is only attempted when no language uses LSTM.
259  if (!AnyLSTMLang() &&
260  ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
261  // Needs to be setup again to see the new outlines in the chopped_word.
262  SetupWordPassN(pass_n, word);
263  }
264 
265  classify_word_and_language(pass_n, pr_it, word);
    // NOTE(review): listing lines 266 and 268 are not visible in this view;
    // the '}' after the tprintf presumably closes a debug 'if' opened on the
    // missing line 266, and one tprintf argument line is missing too.
267  tprintf("Pass%d: %s [%s]\n", pass_n,
269  word->word->best_choice->debug_string().string());
270  }
271  pr_it->forward();
    // Propagate the "blobs were appended at the end" signal to the next word.
272  if (make_next_word_fuzzy && pr_it->word() != NULL) {
273  pr_it->MakeCurrentWordFuzzy();
274  }
275  }
276  return true;
277 }
278 
301  ETEXT_DESC* monitor,
302  const TBOX* target_word_box,
303  const char* word_config,
304  int dopasses) {
305  PAGE_RES_IT page_res_it(page_res);
306 
308  tessedit_test_adaption.set_value (TRUE);
309  tessedit_minimal_rejection.set_value (TRUE);
310  }
311 
312  if (dopasses==0 || dopasses==1) {
313  page_res_it.restart_page();
314  // ****************** Pass 1 *******************
315 
316  // If the adaptive classifier is full switch to one we prepared earlier,
317  // ie on the previous page. If the current adaptive classifier is non-empty,
318  // prepare a backup starting at this page, in case it fills up. Do all this
319  // independently for each language.
320  if (AdaptiveClassifierIsFull()) {
322  } else if (!AdaptiveClassifierIsEmpty()) {
324  }
325  // Now check the sub-langs as well.
326  for (int i = 0; i < sub_langs_.size(); ++i) {
327  if (sub_langs_[i]->AdaptiveClassifierIsFull()) {
328  sub_langs_[i]->SwitchAdaptiveClassifier();
329  } else if (!sub_langs_[i]->AdaptiveClassifierIsEmpty()) {
330  sub_langs_[i]->StartBackupAdaptiveClassifier();
331  }
332  }
333  // Set up all words ready for recognition, so that if parallelism is on
334  // all the input and output classes are ready to run the classifier.
336  SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
337  if (tessedit_parallelize) {
338  PrerecAllWordsPar(words);
339  }
340 
341  stats_.word_count = words.size();
342 
343  stats_.dict_words = 0;
344  stats_.doc_blob_quality = 0;
345  stats_.doc_outline_errs = 0;
346  stats_.doc_char_quality = 0;
347  stats_.good_char_count = 0;
348  stats_.doc_good_char_quality = 0;
349 
350  most_recently_used_ = this;
351  // Run pass 1 word recognition.
352  if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false;
353  // Pass 1 post-processing.
354  for (page_res_it.restart_page(); page_res_it.word() != NULL;
355  page_res_it.forward()) {
356  if (page_res_it.word()->word->flag(W_REP_CHAR)) {
357  fix_rep_char(&page_res_it);
358  continue;
359  }
360 
361  // Count dict words.
362  if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
363  ++(stats_.dict_words);
364 
365  // Update misadaption log (we only need to do it on pass 1, since
366  // adaption only happens on this pass).
367  if (page_res_it.word()->blamer_bundle != NULL &&
368  page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
369  page_res->misadaption_log.push_back(
370  page_res_it.word()->blamer_bundle->misadaption_debug());
371  }
372  }
373  }
374 
375  if (dopasses == 1) return true;
376 
377  // ****************** Pass 2 *******************
379  AnyTessLang()) {
380  page_res_it.restart_page();
382  SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
383  if (tessedit_parallelize) {
384  PrerecAllWordsPar(words);
385  }
386  most_recently_used_ = this;
387  // Run pass 2 word recognition.
388  if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
389  }
390 
391  // The next passes are only required for Tess-only.
392  if (AnyTessLang() && !AnyLSTMLang()) {
393  // ****************** Pass 3 *******************
394  // Fix fuzzy spaces.
396 
399  fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
400 
401  // ****************** Pass 4 *******************
404 
405  // ****************** Pass 5,6 *******************
406  rejection_passes(page_res, monitor, target_word_box, word_config);
407 
408  // ****************** Pass 8 *******************
409  font_recognition_pass(page_res);
410 
411  // ****************** Pass 9 *******************
412  // Check the correctness of the final results.
413  blamer_pass(page_res);
414  script_pos_pass(page_res);
415  }
416 
417  // Write results pass.
419  // This is now redundant, but retained commented so show how to obtain
420  // bounding boxes and style information.
421 
422  // changed by jetsoft
423  // needed for dll to output memory structure
424  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
425  output_pass(page_res_it, target_word_box);
426  // end jetsoft
427  PageSegMode pageseg_mode = static_cast<PageSegMode>(
428  static_cast<int>(tessedit_pageseg_mode));
429  textord_.CleanupSingleRowResult(pageseg_mode, page_res);
430 
431  // Remove empty words, as these mess up the result iterators.
432  for (page_res_it.restart_page(); page_res_it.word() != NULL;
433  page_res_it.forward()) {
434  WERD_RES* word = page_res_it.word();
435  POLY_BLOCK* pb = page_res_it.block()->block != NULL
436  ? page_res_it.block()->block->poly_block()
437  : NULL;
438  if (word->best_choice == NULL || word->best_choice->length() == 0 ||
439  (word->best_choice->IsAllSpaces() && (pb == NULL || pb->IsText()))) {
440  page_res_it.DeleteCurrentWord();
441  }
442  }
443 
444  if (monitor != NULL) {
445  monitor->progress = 100;
446  }
447  return true;
448 }
449 
451  PAGE_RES_IT word_it(page_res);
452 
453  WERD_RES *w_prev = NULL;
454  WERD_RES *w = word_it.word();
455  while (1) {
456  w_prev = w;
457  while (word_it.forward() != NULL &&
458  (!word_it.word() || word_it.word()->part_of_combo)) {
459  // advance word_it, skipping over parts of combos
460  }
461  if (!word_it.word()) break;
462  w = word_it.word();
463  if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
464  continue;
465  }
466  if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
467  if (tessedit_bigram_debug) {
468  tprintf("Skipping because one of the words is W_REP_CHAR\n");
469  }
470  continue;
471  }
472  // Two words sharing the same language model, excellent!
473  GenericVector<WERD_CHOICE *> overrides_word1;
474  GenericVector<WERD_CHOICE *> overrides_word2;
475 
476  STRING orig_w1_str = w_prev->best_choice->unichar_string();
477  STRING orig_w2_str = w->best_choice->unichar_string();
478  WERD_CHOICE prev_best(w->uch_set);
479  {
480  int w1start, w1end;
481  w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
482  prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
483  }
484  WERD_CHOICE this_best(w->uch_set);
485  {
486  int w2start, w2end;
487  w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
488  this_best = w->best_choice->shallow_copy(w2start, w2end);
489  }
490 
491  if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
492  if (tessedit_bigram_debug) {
493  tprintf("Top choice \"%s %s\" verified by bigram model.\n",
494  orig_w1_str.string(), orig_w2_str.string());
495  }
496  continue;
497  }
498  if (tessedit_bigram_debug > 2) {
499  tprintf("Examining alt choices for \"%s %s\".\n",
500  orig_w1_str.string(), orig_w2_str.string());
501  }
502  if (tessedit_bigram_debug > 1) {
503  if (!w_prev->best_choices.singleton()) {
504  w_prev->PrintBestChoices();
505  }
506  if (!w->best_choices.singleton()) {
507  w->PrintBestChoices();
508  }
509  }
510  float best_rating = 0.0;
511  int best_idx = 0;
512  WERD_CHOICE_IT prev_it(&w_prev->best_choices);
513  for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
514  WERD_CHOICE *p1 = prev_it.data();
515  WERD_CHOICE strip1(w->uch_set);
516  {
517  int p1start, p1end;
518  p1->GetNonSuperscriptSpan(&p1start, &p1end);
519  strip1 = p1->shallow_copy(p1start, p1end);
520  }
521  WERD_CHOICE_IT w_it(&w->best_choices);
522  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
523  WERD_CHOICE *p2 = w_it.data();
524  WERD_CHOICE strip2(w->uch_set);
525  {
526  int p2start, p2end;
527  p2->GetNonSuperscriptSpan(&p2start, &p2end);
528  strip2 = p2->shallow_copy(p2start, p2end);
529  }
530  if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
531  overrides_word1.push_back(p1);
532  overrides_word2.push_back(p2);
533  if (overrides_word1.size() == 1 ||
534  p1->rating() + p2->rating() < best_rating) {
535  best_rating = p1->rating() + p2->rating();
536  best_idx = overrides_word1.size() - 1;
537  }
538  }
539  }
540  }
541  if (!overrides_word1.empty()) {
542  // Excellent, we have some bigram matches.
544  *overrides_word1[best_idx]) &&
546  *overrides_word2[best_idx])) {
547  if (tessedit_bigram_debug > 1) {
548  tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
549  "model.\n", orig_w1_str.string(), orig_w2_str.string());
550  }
551  continue;
552  }
553  STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
554  STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
555  if (new_w1_str != orig_w1_str) {
556  w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
557  }
558  if (new_w2_str != orig_w2_str) {
559  w->ReplaceBestChoice(overrides_word2[best_idx]);
560  }
561  if (tessedit_bigram_debug > 0) {
562  STRING choices_description;
563  int num_bigram_choices
564  = overrides_word1.size() * overrides_word2.size();
565  if (num_bigram_choices == 1) {
566  choices_description = "This was the unique bigram choice.";
567  } else {
568  if (tessedit_bigram_debug > 1) {
569  STRING bigrams_list;
570  const int kMaxChoicesToPrint = 20;
571  for (int i = 0; i < overrides_word1.size() &&
572  i < kMaxChoicesToPrint; i++) {
573  if (i > 0) { bigrams_list += ", "; }
574  WERD_CHOICE *p1 = overrides_word1[i];
575  WERD_CHOICE *p2 = overrides_word2[i];
576  bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
577  if (i == kMaxChoicesToPrint) {
578  bigrams_list += " ...";
579  }
580  }
581  choices_description = "There were many choices: {";
582  choices_description += bigrams_list;
583  choices_description += "}";
584  } else {
585  choices_description.add_str_int("There were ", num_bigram_choices);
586  choices_description += " compatible bigrams.";
587  }
588  }
589  tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
590  orig_w1_str.string(), orig_w2_str.string(),
591  new_w1_str.string(), new_w2_str.string(),
592  choices_description.string());
593  }
594  }
595  }
596 }
597 
599  ETEXT_DESC* monitor,
600  const TBOX* target_word_box,
601  const char* word_config) {
602  PAGE_RES_IT page_res_it(page_res);
603  // ****************** Pass 5 *******************
604  // Gather statistics on rejects.
605  int word_index = 0;
606  while (!tessedit_test_adaption && page_res_it.word() != NULL) {
608  WERD_RES* word = page_res_it.word();
609  word_index++;
610  if (monitor != NULL) {
611  monitor->ocr_alive = TRUE;
612  monitor->progress = 95 + 5 * word_index / stats_.word_count;
613  }
614  if (word->rebuild_word == NULL) {
615  // Word was not processed by tesseract.
616  page_res_it.forward();
617  continue;
618  }
619  check_debug_pt(word, 70);
620 
621  // changed by jetsoft
622  // specific to its needs to extract one word when need
623  if (target_word_box &&
625  *target_word_box, word_config, 4)) {
626  page_res_it.forward();
627  continue;
628  }
629  // end jetsoft
630 
631  page_res_it.rej_stat_word();
632  int chars_in_word = word->reject_map.length();
633  int rejects_in_word = word->reject_map.reject_count();
634 
635  int blob_quality = word_blob_quality(word, page_res_it.row()->row);
636  stats_.doc_blob_quality += blob_quality;
637  int outline_errs = word_outline_errs(word);
638  stats_.doc_outline_errs += outline_errs;
639  inT16 all_char_quality;
640  inT16 accepted_all_char_quality;
641  word_char_quality(word, page_res_it.row()->row,
642  &all_char_quality, &accepted_all_char_quality);
643  stats_.doc_char_quality += all_char_quality;
644  uint8_t permuter_type = word->best_choice->permuter();
645  if ((permuter_type == SYSTEM_DAWG_PERM) ||
646  (permuter_type == FREQ_DAWG_PERM) ||
647  (permuter_type == USER_DAWG_PERM)) {
648  stats_.good_char_count += chars_in_word - rejects_in_word;
649  stats_.doc_good_char_quality += accepted_all_char_quality;
650  }
651  check_debug_pt(word, 80);
653  (blob_quality == 0) && (outline_errs >= chars_in_word))
655  check_debug_pt(word, 90);
656  page_res_it.forward();
657  }
658 
660  tprintf
661  ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
662  " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
663  page_res->char_count, page_res->rej_count,
664  page_res->rej_count / static_cast<float>(page_res->char_count),
665  stats_.doc_blob_quality,
666  stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
667  stats_.doc_outline_errs,
668  stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
669  stats_.doc_char_quality,
670  stats_.doc_char_quality / static_cast<float>(page_res->char_count),
671  stats_.doc_good_char_quality,
672  (stats_.good_char_count > 0) ?
673  (stats_.doc_good_char_quality /
674  static_cast<float>(stats_.good_char_count)) : 0.0);
675  }
676  BOOL8 good_quality_doc =
677  ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
678  quality_rej_pc) &&
679  (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
680  quality_blob_pc) &&
681  (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
683  (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
685 
686  // ****************** Pass 6 *******************
687  // Do whole document or whole block rejection pass
688  if (!tessedit_test_adaption) {
690  quality_based_rejection(page_res_it, good_quality_doc);
691  }
692 }
693 
695  if (!wordrec_run_blamer) return;
696  PAGE_RES_IT page_res_it(page_res);
697  for (page_res_it.restart_page(); page_res_it.word() != NULL;
698  page_res_it.forward()) {
699  WERD_RES *word = page_res_it.word();
702  }
703  tprintf("Blame reasons:\n");
704  for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
706  static_cast<IncorrectResultReason>(bl)),
707  page_res->blame_reasons[bl]);
708  }
709  if (page_res->misadaption_log.length() > 0) {
710  tprintf("Misadaption log:\n");
711  for (int i = 0; i < page_res->misadaption_log.length(); ++i) {
712  tprintf("%s\n", page_res->misadaption_log[i].string());
713  }
714  }
715 }
716 
717 // Sets script positions and detects smallcaps on all output words.
719  PAGE_RES_IT page_res_it(page_res);
720  for (page_res_it.restart_page(); page_res_it.word() != NULL;
721  page_res_it.forward()) {
722  WERD_RES* word = page_res_it.word();
723  if (word->word->flag(W_REP_CHAR)) {
724  page_res_it.forward();
725  continue;
726  }
727  float x_height = page_res_it.block()->block->x_height();
728  float word_x_height = word->x_height;
729  if (word_x_height < word->best_choice->min_x_height() ||
730  word_x_height > word->best_choice->max_x_height()) {
731  word_x_height = (word->best_choice->min_x_height() +
732  word->best_choice->max_x_height()) / 2.0f;
733  }
734  // Test for small caps. Word capheight must be close to block xheight,
735  // and word must contain no lower case letters, and at least one upper case.
736  double small_cap_xheight = x_height * kXHeightCapRatio;
737  double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
738  if (word->uch_set->script_has_xheight() &&
739  small_cap_xheight - small_cap_delta <= word_x_height &&
740  word_x_height <= small_cap_xheight + small_cap_delta) {
741  // Scan for upper/lower.
742  int num_upper = 0;
743  int num_lower = 0;
744  for (int i = 0; i < word->best_choice->length(); ++i) {
745  if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
746  ++num_upper;
747  else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
748  ++num_lower;
749  }
750  if (num_upper > 0 && num_lower == 0)
751  word->small_caps = true;
752  }
753  word->SetScriptPositions();
754  }
755 }
756 
757 // Helper finds the gap between the index word and the next.
758 static void WordGap(const PointerVector<WERD_RES>& words, int index, int* right,
759  int* next_left) {
760  *right = -MAX_INT32;
761  *next_left = MAX_INT32;
762  if (index < words.size()) {
763  *right = words[index]->word->bounding_box().right();
764  if (index + 1 < words.size())
765  *next_left = words[index + 1]->word->bounding_box().left();
766  }
767 }
768 
769 // Factored helper computes the rating, certainty, badness and validity of
770 // the permuter of the words in [first_index, end_index).
771 static void EvaluateWordSpan(const PointerVector<WERD_RES>& words,
772  int first_index, int end_index, float* rating,
773  float* certainty, bool* bad,
774  bool* valid_permuter) {
775  if (end_index <= first_index) {
776  *bad = true;
777  *valid_permuter = false;
778  }
779  for (int index = first_index; index < end_index && index < words.size();
780  ++index) {
781  WERD_CHOICE* choice = words[index]->best_choice;
782  if (choice == nullptr) {
783  *bad = true;
784  } else {
785  *rating += choice->rating();
786  *certainty = MIN(*certainty, choice->certainty());
787  if (!Dict::valid_word_permuter(choice->permuter(), false))
788  *valid_permuter = false;
789  }
790  }
791 }
792 
793 // Helper chooses the best combination of words, transferring good ones from
794 // new_words to best_words. To win, a new word must have (better rating and
795 // certainty) or (better permuter status and rating within rating ratio and
796 // certainty within certainty margin) than current best.
797 // All the new_words are consumed (moved to best_words or deleted.)
798 // The return value is the number of new_words used minus the number of
799 // best_words that remain in the output.
800 static int SelectBestWords(double rating_ratio,
801  double certainty_margin,
802  bool debug,
803  PointerVector<WERD_RES>* new_words,
804  PointerVector<WERD_RES>* best_words) {
805  // Process the smallest groups of words that have an overlapping word
806  // boundary at the end.
807  GenericVector<WERD_RES*> out_words;
808  // Index into each word vector (best, new).
809  int b = 0, n = 0;
810  int num_best = 0, num_new = 0;
811  while (b < best_words->size() || n < new_words->size()) {
812  // Start of the current run in each.
813  int start_b = b, start_n = n;
814  while (b < best_words->size() || n < new_words->size()) {
815  int b_right = -MAX_INT32;
816  int next_b_left = MAX_INT32;
817  WordGap(*best_words, b, &b_right, &next_b_left);
818  int n_right = -MAX_INT32;
819  int next_n_left = MAX_INT32;
820  WordGap(*new_words, n, &n_right, &next_n_left);
821  if (MAX(b_right, n_right) < MIN(next_b_left, next_n_left)) {
822  // The word breaks overlap. [start_b,b] and [start_n, n] match.
823  break;
824  }
825  // Keep searching for the matching word break.
826  if ((b_right < n_right && b < best_words->size()) ||
827  n == new_words->size())
828  ++b;
829  else
830  ++n;
831  }
832  // Rating of the current run in each.
833  float b_rating = 0.0f, n_rating = 0.0f;
834  // Certainty of the current run in each.
835  float b_certainty = 0.0f, n_certainty = 0.0f;
836  // True if any word is missing its best choice.
837  bool b_bad = false, n_bad = false;
838  // True if all words have a valid permuter.
839  bool b_valid_permuter = true, n_valid_permuter = true;
840  int end_b = b < best_words->size() ? b + 1 : b;
841  int end_n = n < new_words->size() ? n + 1 : n;
842  EvaluateWordSpan(*best_words, start_b, end_b, &b_rating, &b_certainty,
843  &b_bad, &b_valid_permuter);
844  EvaluateWordSpan(*new_words, start_n, end_n, &n_rating, &n_certainty,
845  &n_bad, &n_valid_permuter);
846  bool new_better = false;
847  if (!n_bad && (b_bad || (n_certainty > b_certainty &&
848  n_rating < b_rating) ||
849  (!b_valid_permuter && n_valid_permuter &&
850  n_rating < b_rating * rating_ratio &&
851  n_certainty > b_certainty - certainty_margin))) {
852  // New is better.
853  for (int i = start_n; i < end_n; ++i) {
854  out_words.push_back((*new_words)[i]);
855  (*new_words)[i] = NULL;
856  ++num_new;
857  }
858  new_better = true;
859  } else if (!b_bad) {
860  // Current best is better.
861  for (int i = start_b; i < end_b; ++i) {
862  out_words.push_back((*best_words)[i]);
863  (*best_words)[i] = NULL;
864  ++num_best;
865  }
866  }
867  if (debug) {
868  tprintf("%d new words %s than %d old words: r: %g v %g c: %g v %g"
869  " valid dict: %d v %d\n",
870  end_n - start_n, new_better ? "better" : "worse",
871  end_b - start_b, n_rating, b_rating,
872  n_certainty, b_certainty, n_valid_permuter, b_valid_permuter);
873  }
874  // Move on to the next group.
875  b = end_b;
876  n = end_n;
877  }
878  // Transfer from out_words to best_words.
879  best_words->clear();
880  for (int i = 0; i < out_words.size(); ++i)
881  best_words->push_back(out_words[i]);
882  return num_new - num_best;
883 }
884 
885 // Helper to recognize the word using the given (language-specific) tesseract.
886 // Returns positive if this recognizer found more new best words than the
887 // number kept from best_words.
889  WordRecognizer recognizer, bool debug,
890  WERD_RES** in_word,
891  PointerVector<WERD_RES>* best_words) {
892  if (debug) {
893  tprintf("Trying word using lang %s, oem %d\n",
894  lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
895  }
896  // Run the recognizer on the word.
897  PointerVector<WERD_RES> new_words;
898  (this->*recognizer)(word_data, in_word, &new_words);
899  if (new_words.empty()) {
900  // Transfer input word to new_words, as the classifier must have put
901  // the result back in the input.
902  new_words.push_back(*in_word);
903  *in_word = NULL;
904  }
905  if (debug) {
906  for (int i = 0; i < new_words.size(); ++i)
907  new_words[i]->DebugTopChoice("Lang result");
908  }
909  // Initial version is a bit of a hack based on better certainty and rating
910  // or a dictionary vs non-dictionary word.
911  return SelectBestWords(classify_max_rating_ratio,
913  debug, &new_words, best_words);
914 }
915 
916 // Helper returns true if all the words are acceptable.
917 static bool WordsAcceptable(const PointerVector<WERD_RES>& words) {
918  for (int w = 0; w < words.size(); ++w) {
919  if (words[w]->tess_failed || !words[w]->tess_accepted) return false;
920  }
921  return true;
922 }
923 
924 // Moves good-looking "noise"/diacritics from the reject list to the main
925 // blob list on the current word. Returns true if anything was done, and
926 // sets make_next_word_fuzzy if blob(s) were added to the end of the word.
928  bool* make_next_word_fuzzy) {
929  *make_next_word_fuzzy = false;
930  WERD* real_word = pr_it->word()->word;
931  if (real_word->rej_cblob_list()->empty() ||
932  real_word->cblob_list()->empty() ||
933  real_word->rej_cblob_list()->length() > noise_maxperword)
934  return false;
935  real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
936  // Get the noise outlines into a vector with matching bool map.
937  GenericVector<C_OUTLINE*> outlines;
938  real_word->GetNoiseOutlines(&outlines);
939  GenericVector<bool> word_wanted;
940  GenericVector<bool> overlapped_any_blob;
941  GenericVector<C_BLOB*> target_blobs;
942  AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
943  &word_wanted, &overlapped_any_blob,
944  &target_blobs);
945  // Filter the outlines that overlapped any blob and put them into the word
946  // now. This simplifies the remaining task and also makes it more accurate
947  // as it has more completed blobs to work on.
948  GenericVector<bool> wanted;
949  GenericVector<C_BLOB*> wanted_blobs;
950  GenericVector<C_OUTLINE*> wanted_outlines;
951  int num_overlapped = 0;
952  int num_overlapped_used = 0;
953  for (int i = 0; i < overlapped_any_blob.size(); ++i) {
954  if (overlapped_any_blob[i]) {
955  ++num_overlapped;
956  if (word_wanted[i]) ++num_overlapped_used;
957  wanted.push_back(word_wanted[i]);
958  wanted_blobs.push_back(target_blobs[i]);
959  wanted_outlines.push_back(outlines[i]);
960  outlines[i] = NULL;
961  }
962  }
963  real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, NULL);
964  AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
965  &target_blobs);
966  int non_overlapped = 0;
967  int non_overlapped_used = 0;
968  for (int i = 0; i < word_wanted.size(); ++i) {
969  if (word_wanted[i]) ++non_overlapped_used;
970  if (outlines[i] != NULL) ++non_overlapped_used;
971  }
972  if (debug_noise_removal) {
973  tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
974  num_overlapped_used, num_overlapped, non_overlapped_used,
975  non_overlapped);
976  real_word->bounding_box().print();
977  }
978  // Now we have decided which outlines we want, put them into the real_word.
979  if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
980  make_next_word_fuzzy)) {
981  pr_it->MakeCurrentWordFuzzy();
982  }
983  // TODO(rays) Parts of combos have a deep copy of the real word, and need
984  // to have their noise outlines moved/assigned in the same way!!
985  return num_overlapped_used != 0 || non_overlapped_used != 0;
986 }
987 
988 // Attempts to put noise/diacritic outlines into the blobs that they overlap.
989 // Input: a set of noisy outlines that probably belong to the real_word.
990 // Output: word_wanted indicates which outlines are to be assigned to a blob,
991 // target_blobs indicates which to assign to, and overlapped_any_blob is
992 // true for all outlines that overlapped a blob.
// All three output vectors are (re)initialised here to outlines.size()
// entries, so index i in each of them refers to outlines[i].
// NOTE(review): the declaration line that names this function (listing line
// 993) is absent from this copy; callers in this file invoke it as
// AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it, ...).
994  const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
995  PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
996  GenericVector<bool>* overlapped_any_blob,
997  GenericVector<C_BLOB*>* target_blobs) {
// Per-blob scratch map: which outlines are candidates for the current blob.
998  GenericVector<bool> blob_wanted;
999  word_wanted->init_to_size(outlines.size(), false);
1000  overlapped_any_blob->init_to_size(outlines.size(), false);
1001  target_blobs->init_to_size(outlines.size(), NULL);
1002  // For each real blob, find the outlines that seriously overlap it.
1003  // A single blob could be several merged characters, so there can be quite
1004  // a few outlines overlapping, and the full engine needs to be used to chop
1005  // and join to get a sensible result.
1006  C_BLOB_IT blob_it(real_word->cblob_list());
1007  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1008  C_BLOB* blob = blob_it.data();
1009  TBOX blob_box = blob->bounding_box();
1010  blob_wanted.init_to_size(outlines.size(), false);
1011  int num_blob_outlines = 0;
1012  for (int i = 0; i < outlines.size(); ++i) {
// An outline already claimed by an earlier blob (word_wanted[i] set) is not
// offered again to this blob.
1013  if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
1014  !(*word_wanted)[i]) {
1015  blob_wanted[i] = true;
1016  (*overlapped_any_blob)[i] = true;
1017  ++num_blob_outlines;
1018  }
1019  }
1020  if (debug_noise_removal) {
1021  tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1022  blob_box.print();
1023  }
1024  // If any outlines overlap the blob, and not too many, classify the blob
1025  // (using the full engine, languages and all), and choose the maximal
1026  // combination of outlines that doesn't hurt the end-result classification
1027  // by too much. Mark them as wanted.
1028  if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
// On success blob_wanted has been narrowed to the outlines worth keeping.
1029  if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
1030  outlines, num_blob_outlines,
1031  &blob_wanted)) {
1032  for (int i = 0; i < blob_wanted.size(); ++i) {
1033  if (blob_wanted[i]) {
1034  // Claim the outline and record where it is going.
1035  (*word_wanted)[i] = true;
1036  (*target_blobs)[i] = blob;
1037  }
1038  }
1039  }
1040  }
1041  }
1042 }
1043 
1044 // Attempts to assign non-overlapping outlines to their nearest blobs or
1045 // make new blobs out of them.
// Entries of outlines that are NULL are skipped. word_wanted and target_blobs
// are reset to outlines.size() entries before being filled in, so any earlier
// contents are discarded. A wanted outline whose target_blobs entry is NULL
// is the "fitted between blobs" case below.
// NOTE(review): the declaration line naming this function (listing line 1046)
// and the first line of the right-blob SelectGoodDiacriticOutlines call
// (listing line 1095) are absent from this copy.
1047  const GenericVector<C_OUTLINE*>& outlines, int pass, WERD* real_word,
1048  PAGE_RES_IT* pr_it, GenericVector<bool>* word_wanted,
1049  GenericVector<C_BLOB*>* target_blobs) {
1050  GenericVector<bool> blob_wanted;
1051  word_wanted->init_to_size(outlines.size(), false);
1052  target_blobs->init_to_size(outlines.size(), NULL);
1053  // Check for outlines that need to be turned into stand-alone blobs.
1054  for (int i = 0; i < outlines.size(); ++i) {
1055  if (outlines[i] == NULL) continue;
1056  // Get a set of adjacent outlines that don't overlap any existing blob.
1057  blob_wanted.init_to_size(outlines.size(), false);
1058  int num_blob_outlines = 0;
1059  TBOX total_ol_box(outlines[i]->bounding_box());
// Consume the whole run of consecutive non-NULL outlines, growing the
// combined box. This advances the outer loop index i past the run.
1060  while (i < outlines.size() && outlines[i] != NULL) {
1061  blob_wanted[i] = true;
1062  total_ol_box += outlines[i]->bounding_box();
1063  ++i;
1064  ++num_blob_outlines;
1065  }
1066  // Find the insertion point.
1067  C_BLOB_IT blob_it(real_word->cblob_list());
// Stop on the last blob whose successor does not start at or left of the
// run's left edge.
1068  while (!blob_it.at_last() &&
1069  blob_it.data_relative(1)->bounding_box().left() <=
1070  total_ol_box.left()) {
1071  blob_it.forward();
1072  }
1073  // Choose which combination of them we actually want and where to put
1074  // them.
1075  if (debug_noise_removal)
1076  tprintf("Num blobless outlines = %d\n", num_blob_outlines);
1077  C_BLOB* left_blob = blob_it.data();
1078  TBOX left_box = left_blob->bounding_box();
// right_blob is NULL when the insertion point is the last blob of the word.
1079  C_BLOB* right_blob = blob_it.at_last() ? NULL : blob_it.data_relative(1);
// First preference: attach to the left blob, tried when the run overlaps it
// in x, or when there is no right blob, or the right blob does not overlap.
1080  if ((left_box.x_overlap(total_ol_box) || right_blob == NULL ||
1081  !right_blob->bounding_box().x_overlap(total_ol_box)) &&
1082  SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
1083  outlines, num_blob_outlines,
1084  &blob_wanted)) {
1085  if (debug_noise_removal) tprintf("Added to left blob\n");
1086  for (int j = 0; j < blob_wanted.size(); ++j) {
1087  if (blob_wanted[j]) {
1088  (*word_wanted)[j] = true;
1089  (*target_blobs)[j] = left_blob;
1090  }
1091  }
// Second preference: attach to the right blob.
1092  } else if (right_blob != NULL &&
1093  (!left_box.x_overlap(total_ol_box) ||
1094  right_blob->bounding_box().x_overlap(total_ol_box)) &&
1096  right_blob, outlines,
1097  num_blob_outlines, &blob_wanted)) {
1098  if (debug_noise_removal) tprintf("Added to right blob\n");
1099  for (int j = 0; j < blob_wanted.size(); ++j) {
1100  if (blob_wanted[j]) {
1101  (*word_wanted)[j] = true;
1102  (*target_blobs)[j] = right_blob;
1103  }
1104  }
// Last resort: classify the run with no host blob (NULL), using the
// noise_cert_punc threshold; the target stays NULL.
1105  } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, NULL,
1106  outlines, num_blob_outlines,
1107  &blob_wanted)) {
1108  if (debug_noise_removal) tprintf("Fitted between blobs\n");
1109  for (int j = 0; j < blob_wanted.size(); ++j) {
1110  if (blob_wanted[j]) {
1111  (*word_wanted)[j] = true;
1112  (*target_blobs)[j] = NULL;
1113  }
1114  }
1115  }
1116  }
1117 }
1118 
1119 // Starting with ok_outlines set to indicate which outlines overlap the blob,
1120 // chooses the optimal set (approximately) and returns true if any outlines
1121 // are desired, in which case ok_outlines indicates which ones.
1123  int pass, float certainty_threshold, PAGE_RES_IT* pr_it, C_BLOB* blob,
1124  const GenericVector<C_OUTLINE*>& outlines, int num_outlines,
1125  GenericVector<bool>* ok_outlines) {
1126  STRING best_str;
1127  float target_cert = certainty_threshold;
1128  if (blob != NULL) {
1129  float target_c2;
1130  target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
1131  if (debug_noise_removal) {
1132  tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(),
1133  target_cert, target_c2);
1134  blob->bounding_box().print();
1135  }
1136  target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1137  }
1138  GenericVector<bool> test_outlines = *ok_outlines;
1139  // Start with all the outlines in.
1140  STRING all_str;
1141  GenericVector<bool> best_outlines = *ok_outlines;
1142  float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1143  pr_it, blob, &all_str);
1144  if (debug_noise_removal) {
1145  TBOX ol_box;
1146  for (int i = 0; i < test_outlines.size(); ++i) {
1147  if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
1148  }
1149  tprintf("All Noise blob classified as %s=%g, delta=%g at:",
1150  all_str.string(), best_cert, best_cert - target_cert);
1151  ol_box.print();
1152  }
1153  // Iteratively zero out the bit that improves the certainty the most, until
1154  // we get past the threshold, have zero bits, or fail to improve.
1155  int best_index = 0; // To zero out.
1156  while (num_outlines > 1 && best_index >= 0 &&
1157  (blob == NULL || best_cert < target_cert || blob != NULL)) {
1158  // Find the best bit to zero out.
1159  best_index = -1;
1160  for (int i = 0; i < outlines.size(); ++i) {
1161  if (test_outlines[i]) {
1162  test_outlines[i] = false;
1163  STRING str;
1164  float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1165  pr_it, blob, &str);
1166  if (debug_noise_removal) {
1167  TBOX ol_box;
1168  for (int j = 0; j < outlines.size(); ++j) {
1169  if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
1170  tprintf("%d", test_outlines[j]);
1171  }
1172  tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(),
1173  cert, cert - target_cert);
1174  ol_box.print();
1175  }
1176  if (cert > best_cert) {
1177  best_cert = cert;
1178  best_index = i;
1179  best_outlines = test_outlines;
1180  }
1181  test_outlines[i] = true;
1182  }
1183  }
1184  if (best_index >= 0) {
1185  test_outlines[best_index] = false;
1186  --num_outlines;
1187  }
1188  }
1189  if (best_cert >= target_cert) {
1190  // Save the best combination.
1191  *ok_outlines = best_outlines;
1192  if (debug_noise_removal) {
1193  tprintf("%s noise combination ", blob ? "Adding" : "New");
1194  for (int i = 0; i < best_outlines.size(); ++i) {
1195  tprintf("%d", best_outlines[i]);
1196  }
1197  tprintf(" yields certainty %g, beating target of %g\n", best_cert,
1198  target_cert);
1199  }
1200  return true;
1201  }
1202  return false;
1203 }
1204 
1205 // Classifies the given blob plus the outlines flagged by ok_outlines, undoes
1206 // the inclusion of the outlines, and returns the certainty of the raw choice.
// When blob is NULL a temporary blob is built from the flagged outlines and
// the returned value is -c2 (the negated second output of ClassifyBlobAsWord)
// instead of the raw certainty. best_str receives the recognized string.
// NOTE(review): the declaration line naming this function (listing line 1207)
// is absent from this copy; callers invoke it as ClassifyBlobPlusOutlines(...).
// NOTE(review): if blob is NULL and no entry of ok_outlines is set, blob is
// still NULL when handed to ClassifyBlobAsWord - confirm callers always flag
// at least one outline.
1208  const GenericVector<bool>& ok_outlines,
1209  const GenericVector<C_OUTLINE*>& outlines, int pass_n, PAGE_RES_IT* pr_it,
1210  C_BLOB* blob, STRING* best_str) {
1211  C_OUTLINE_IT ol_it;
// Remembers the blob's first own outline; the added outlines are inserted
// before it, so it marks where the clean-up loop below must stop.
1212  C_OUTLINE* first_to_keep = NULL;
1213  if (blob != NULL) {
1214  // Add the required outlines to the blob.
1215  ol_it.set_to_list(blob->out_list());
1216  first_to_keep = ol_it.data();
1217  }
1218  for (int i = 0; i < ok_outlines.size(); ++i) {
1219  if (ok_outlines[i]) {
1220  // This outline is to be added.
1221  if (blob == NULL) {
// First flagged outline with no host blob: start a new temporary blob.
1222  blob = new C_BLOB(outlines[i]);
1223  ol_it.set_to_list(blob->out_list());
1224  } else {
1225  ol_it.add_before_stay_put(outlines[i]);
1226  }
1227  }
1228  }
1229  float c2;
1230  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
1231  ol_it.move_to_first();
1232  if (first_to_keep == NULL) {
1233  // We created blob. Empty its outlines and delete it.
// extract() unlinks each outline without deleting it, so the outlines in
// the caller's vector stay valid after the blob is deleted.
1234  for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
1235  delete blob;
1236  cert = -c2;
1237  } else {
1238  // Remove the outlines that we put in.
1239  for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1240  ol_it.extract();
1241  }
1242  }
1243  return cert;
1244 }
1245 
1246 // Classifies the given blob (part of word_data->word->word) as an individual
1247 // word, using languages, chopper etc, returning only the certainty of the
1248 // best raw choice, and undoing all the work done to fake out the word.
// Outputs: *best_str receives the raw choice's string; *c2 receives
// certainty * certainty / rating when the rating is positive, else 0.
// NOTE(review): the declaration line naming this function (listing line 1249)
// is absent from this copy; callers invoke it as
// ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2).
1250  C_BLOB* blob, STRING* best_str, float* c2) {
1251  WERD* real_word = pr_it->word()->word;
// Build a temporary one-blob word from a deep copy of blob, keeping the real
// word's begin/end-of-line flags.
1252  WERD* word = real_word->ConstructFromSingleBlob(
1253  real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
1254  WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1255  // Get a new iterator that points to the new word.
1256  PAGE_RES_IT it(pr_it->page_res);
1257  while (it.word() != word_res && it.word() != NULL) it.forward();
1258  ASSERT_HOST(it.word() == word_res);
1259  WordData wd(it);
1260  // Force full initialization.
1261  SetupWordPassN(1, &wd);
1262  classify_word_and_language(pass_n, &it, &wd);
1263  if (debug_noise_removal) {
1264  tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
1265  wd.row->x_height(), wd.word->raw_choice->min_x_height(),
1266  wd.word->raw_choice->max_x_height());
1267  }
1268  float cert = wd.word->raw_choice->certainty();
1269  float rat = wd.word->raw_choice->rating();
1270  *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1271  *best_str = wd.word->raw_choice->unichar_string();
// Remove the temporary word again and reset the caller's iterator.
1272  it.DeleteCurrentWord();
1273  pr_it->ResetWordIterator();
1274  return cert;
1275 }
1276 
1277 // Generic function for classifying a word. Can be used either for pass1 or
1278 // pass2 according to the function passed to recognizer.
1279 // word_data holds the word to be recognized, and its block and row, and
1280 // pr_it points to the word as well, in case we are running LSTM and it wants
1281 // to output multiple words.
1282 // Recognizes in the current language, and if successful that is all.
1283 // If recognition was not successful, tries all available languages until
1284 // it gets a successful result or runs out of languages. Keeps the best result.
1286  WordData* word_data) {
1287  WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
1289  // Best result so far.
1290  PointerVector<WERD_RES> best_words;
1291  // Points to the best result. May be word or in lang_words.
1292  WERD_RES* word = word_data->word;
1293  clock_t start_t = clock();
1294  bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
1295  if (debug) {
1296  tprintf("%s word with lang %s at:",
1297  word->done ? "Already done" : "Processing",
1298  most_recently_used_->lang.string());
1299  word->word->bounding_box().print();
1300  }
1301  if (word->done) {
1302  // If done on pass1, leave it as-is.
1303  if (!word->tess_failed)
1304  most_recently_used_ = word->tesseract;
1305  return;
1306  }
1307  int sub = sub_langs_.size();
1308  if (most_recently_used_ != this) {
1309  // Get the index of the most_recently_used_.
1310  for (sub = 0; sub < sub_langs_.size() &&
1311  most_recently_used_ != sub_langs_[sub]; ++sub) {}
1312  }
1313  most_recently_used_->RetryWithLanguage(
1314  *word_data, recognizer, debug, &word_data->lang_words[sub], &best_words);
1315  Tesseract* best_lang_tess = most_recently_used_;
1316  if (!WordsAcceptable(best_words)) {
1317  // Try all the other languages to see if they are any better.
1318  if (most_recently_used_ != this &&
1319  this->RetryWithLanguage(*word_data, recognizer, debug,
1320  &word_data->lang_words[sub_langs_.size()],
1321  &best_words) > 0) {
1322  best_lang_tess = this;
1323  }
1324  for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
1325  ++i) {
1326  if (most_recently_used_ != sub_langs_[i] &&
1327  sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug,
1328  &word_data->lang_words[i],
1329  &best_words) > 0) {
1330  best_lang_tess = sub_langs_[i];
1331  }
1332  }
1333  }
1334  most_recently_used_ = best_lang_tess;
1335  if (!best_words.empty()) {
1336  if (best_words.size() == 1 && !best_words[0]->combination) {
1337  // Move the best single result to the main word.
1338  word_data->word->ConsumeWordResults(best_words[0]);
1339  } else {
1340  // Words came from LSTM, and must be moved to the PAGE_RES properly.
1341  word_data->word = best_words.back();
1342  pr_it->ReplaceCurrentWord(&best_words);
1343  }
1344  ASSERT_HOST(word_data->word->box_word != NULL);
1345  } else {
1346  tprintf("no best words!!\n");
1347  }
1348  clock_t ocr_t = clock();
1349  if (tessedit_timing_debug) {
1350  tprintf("%s (ocr took %.2f sec)\n",
1351  word->best_choice->unichar_string().string(),
1352  static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
1353  }
1354 }
1355 
// Pass-1 word recognizer. Where the build and engine mode allow, LSTM
// recognition is tried first and its results are returned through out_words;
// otherwise the word is matched with match_word_pass_n(1, ...) and, if
// suitable, fed to the adaptive classifier for training.
// NOTE(review): this copy of the listing omits the declaration line and
// several statement lines of this function (the embedded listing numbers jump
// 1355->1363, 1369->1372, 1376->1378, 1384->1387, 1401->1404, 1407->1409), so
// some conditions and calls below appear truncated.
1363  WERD_RES** in_word,
1364  PointerVector<WERD_RES>* out_words) {
1365  ROW* row = word_data.row;
1366  BLOCK* block = word_data.block;
// Remember the previous word's best choice (NULL if there is none).
1367  prev_word_best_choice_ = word_data.prev_word != NULL
1368  ? word_data.prev_word->word->best_choice : NULL;
1369 #ifndef ANDROID_BUILD
// Odd-sized words skip LSTM unless the engine mode is LSTM-only.
1372  if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1373  LSTMRecognizeWord(*block, row, *in_word, out_words);
1374  if (!out_words->empty())
1375  return; // Successful lstm recognition.
1376  }
1378  // No fallback allowed, so use a fake.
1379  (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
1380  return;
1381  }
1382  // Fall back to tesseract for failed words or odd words.
1383  (*in_word)->SetupForRecognition(unicharset, this, BestPix(),
1384  OEM_TESSERACT_ONLY, NULL,
1387  poly_allow_detailed_fx, row, block);
1388  }
1389 #endif
1390  WERD_RES* word = *in_word;
1391  match_word_pass_n(1, word, row, block);
// Adaption is not considered for failed words or repeated-character words.
1392  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1393  word->tess_would_adapt = AdaptableWord(word);
1394  bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
1395
1396  if (adapt_ok) {
1397  // Send word to adaptive classifier for training.
1398  word->BestChoiceToCorrectText();
1399  LearnWord(NULL, word);
1400  // Mark misadaptions if running blamer.
1401  if (word->blamer_bundle != NULL) {
1404  }
1405  }
1406
1407  if (tessedit_enable_doc_dict && !word->IsAmbiguous())
1409  }
1410 }
1411 
1412 // Helper to report the result of the xheight fix.
1413 void Tesseract::ReportXhtFixResult(bool accept_new_word, float new_x_ht,
1414  WERD_RES* word, WERD_RES* new_word) {
1415  tprintf("New XHT Match:%s = %s ",
1416  word->best_choice->unichar_string().string(),
1417  word->best_choice->debug_string().string());
1418  word->reject_map.print(debug_fp);
1419  tprintf(" -> %s = %s ",
1420  new_word->best_choice->unichar_string().string(),
1421  new_word->best_choice->debug_string().string());
1422  new_word->reject_map.print(debug_fp);
1423  tprintf(" %s->%s %s %s\n",
1424  word->guessed_x_ht ? "GUESS" : "CERT",
1425  new_word->guessed_x_ht ? "GUESS" : "CERT",
1426  new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
1427  accept_new_word ? "ACCEPTED" : "");
1428 }
1429 
1430 // Run the x-height fix-up, based on min/max top/bottom information in
1431 // unicharset.
1432 // Returns true if the word was changed.
1433 // See the comment in fixxht.cpp for a description of the overall process.
1434 bool Tesseract::TrainedXheightFix(WERD_RES *word, BLOCK* block, ROW *row) {
1435  int original_misfits = CountMisfitTops(word);
1436  if (original_misfits == 0)
1437  return false;
1438  float baseline_shift = 0.0f;
1439  float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
1440  if (baseline_shift != 0.0f) {
1441  // Try the shift on its own first.
1442  if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height,
1443  word, block, row))
1444  return false;
1445  original_misfits = CountMisfitTops(word);
1446  if (original_misfits > 0) {
1447  float new_baseline_shift;
1448  // Now recompute the new x_height.
1449  new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
1450  if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1451  // No test of return value here, as we are definitely making a change
1452  // to the word by shifting the baseline.
1453  TestNewNormalization(original_misfits, baseline_shift, new_x_ht,
1454  word, block, row);
1455  }
1456  }
1457  return true;
1458  } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1459  return TestNewNormalization(original_misfits, 0.0f, new_x_ht,
1460  word, block, row);
1461  } else {
1462  return false;
1463  }
1464 }
1465 
1466 // Runs recognition with the test baseline shift and x-height and returns true
1467 // if there was an improvement in recognition result.
// On acceptance the new results are moved into *word via ConsumeWordResults;
// otherwise *word is left untouched.
// NOTE(review): the first arguments of the SetupForRecognition call (listing
// lines 1481-1482) are absent from this copy.
1468 bool Tesseract::TestNewNormalization(int original_misfits,
1469  float baseline_shift, float new_x_ht,
1470  WERD_RES *word, BLOCK* block, ROW *row) {
1471  bool accept_new_x_ht = false;
// Trial word built on the same underlying WERD as the original.
1472  WERD_RES new_x_ht_word(word->word);
1473  if (word->blamer_bundle != NULL) {
1474  new_x_ht_word.blamer_bundle = new BlamerBundle();
1475  new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
1476  }
1477  new_x_ht_word.x_height = new_x_ht;
1478  new_x_ht_word.baseline_shift = baseline_shift;
1479  new_x_ht_word.caps_height = 0.0;
1480  new_x_ht_word.SetupForRecognition(
1483  poly_allow_detailed_fx, row, block);
1484  match_word_pass_n(2, &new_x_ht_word, row, block);
1485  if (!new_x_ht_word.tess_failed) {
1486  int new_misfits = CountMisfitTops(&new_x_ht_word);
1487  if (debug_x_ht_level >= 1) {
1488  tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
1489  original_misfits, word->x_height,
1490  new_misfits, new_x_ht);
1491  tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
1492  word->best_choice->rating(), word->best_choice->certainty(),
1493  new_x_ht_word.best_choice->rating(),
1494  new_x_ht_word.best_choice->certainty());
1495  }
1496  // The misfits must improve and either the rating or certainty.
// Higher certainty counts as better; lower rating counts as better.
1497  accept_new_x_ht = new_misfits < original_misfits &&
1498  (new_x_ht_word.best_choice->certainty() >
1499  word->best_choice->certainty() ||
1500  new_x_ht_word.best_choice->rating() <
1501  word->best_choice->rating());
1502  if (debug_x_ht_level >= 1) {
1503  ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1504  }
1505  }
1506  if (accept_new_x_ht) {
1507  word->ConsumeWordResults(&new_x_ht_word);
1508  return true;
1509  }
1510  return false;
1511 }
1512 
// Pass-2 word recognizer: re-matches words not yet marked done, applies the
// sub/superscript fix and (under the visible rotation check) the trained
// x-height fix, and optionally plots the rebuilt word.
// NOTE(review): this copy of the listing omits the declaration line and
// several condition/statement lines of this function (the embedded listing
// numbers jump, e.g. 1512->1520, 1522->1524, 1531->1533, 1544->1546,
// 1550->1552, 1553->1555, 1561->1563, 1564->1566), so some `if` headers below
// are missing.
1520  WERD_RES** in_word,
1521  PointerVector<WERD_RES>* out_words) {
1522  // Return if we do not want to run Tesseract.
1524  return;
1525  }
1526  ROW* row = word_data.row;
1527  BLOCK* block = word_data.block;
1528  WERD_RES* word = *in_word;
// Remember the previous word's best choice (NULL if there is none).
1529  prev_word_best_choice_ = word_data.prev_word != NULL
1530  ? word_data.prev_word->word->best_choice : NULL;
1531
// Debug checkpoint 30: printed by check_debug_pt as "classify_word_pass2 - START".
1533  check_debug_pt(word, 30);
1534  if (!word->done) {
1535  word->caps_height = 0.0;
// Fall back to the row's x-height when the word has none of its own.
1536  if (word->x_height == 0.0f)
1537  word->x_height = row->x_height();
1538  match_word_pass_n(2, word, row, block);
1539  check_debug_pt(word, 40);
1540  }
1541
1542  SubAndSuperscriptFix(word);
1543
1544  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1546  block->classify_rotation().y() == 0.0f) {
1547  // Use the tops and bottoms since they are available.
1548  TrainedXheightFix(word, block, row);
1549  }
1550
1552  }
1553 #ifndef GRAPHICS_DISABLED
// Lazily create the debug window, then draw the rebuilt word and zoom to it.
1555  if (fx_win == NULL)
1556  create_fx_win();
1557  clear_fx_win();
1558  word->rebuild_word->plot(fx_win);
1559  TBOX wbox = word->rebuild_word->bounding_box();
1560  fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
1561  wbox.right(), wbox.bottom());
1563  }
1564 #endif
1566  check_debug_pt(word, 50);
1567 }
1568 
1569 
// Runs segmentation/recognition for the given pass on one word, then applies
// the quote/hyphen fix-ups, decides acceptance and builds the reject map.
// Words already marked tess_failed are left untouched.
// NOTE(review): the declaration line of this function and listing line 1584
// are absent from this copy; callers invoke it as
// match_word_pass_n(pass_n, word, row, block).
1577  ROW *row, BLOCK* block) {
1578  if (word->tess_failed) return;
1579  tess_segment_pass_n(pass_n, word);
1580
1581  if (!word->tess_failed) {
// Repeated-character words skip the fix-ups and the reject map.
1582  if (!word->word->flag (W_REP_CHAR)) {
1583  word->fix_quotes();
1585  word->fix_hyphens();
1586  /* Don't trust fix_quotes! - though I think I've fixed the bug */
// Diagnostic only: a length mismatch is reported but processing continues.
1587  if (word->best_choice->length() != word->box_word->length()) {
1588  tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1589  " #Blobs=%d\n",
1590  word->best_choice->debug_string().string(),
1591  word->best_choice->length(),
1592  word->box_word->length());
1593
1594  }
1595  word->tess_accepted = tess_acceptable_word(word);
1596
1597  // Also sets word->done flag
1598  make_reject_map(word, row, pass_n);
1599  }
1600  }
1601  set_word_fonts(word);
1602
1603  ASSERT_HOST(word->raw_choice != NULL);
1604 }
1605 
1606 // Helper to return the best rated BLOB_CHOICE in the whole word that matches
1607 // the given char_id, or NULL if none can be found.
1608 static BLOB_CHOICE* FindBestMatchingChoice(UNICHAR_ID char_id,
1609  WERD_RES* word_res) {
1610  // Find the corresponding best BLOB_CHOICE from any position in the word_res.
1611  BLOB_CHOICE* best_choice = NULL;
1612  for (int i = 0; i < word_res->best_choice->length(); ++i) {
1613  BLOB_CHOICE* choice = FindMatchingChoice(char_id,
1614  word_res->GetBlobChoices(i));
1615  if (choice != NULL) {
1616  if (best_choice == NULL || choice->rating() < best_choice->rating())
1617  best_choice = choice;
1618  }
1619  }
1620  return best_choice;
1621 }
1622 
1623 // Helper to insert blob_choice in each location in the leader word if there is
1624 // no matching BLOB_CHOICE there already, and correct any incorrect results
1625 // in the best_choice.
1626 static void CorrectRepcharChoices(BLOB_CHOICE* blob_choice,
1627  WERD_RES* word_res) {
1628  WERD_CHOICE* word = word_res->best_choice;
1629  for (int i = 0; i < word_res->best_choice->length(); ++i) {
1630  BLOB_CHOICE* choice = FindMatchingChoice(blob_choice->unichar_id(),
1631  word_res->GetBlobChoices(i));
1632  if (choice == NULL) {
1633  BLOB_CHOICE_IT choice_it(word_res->GetBlobChoices(i));
1634  choice_it.add_before_stay_put(new BLOB_CHOICE(*blob_choice));
1635  }
1636  }
1637  // Correct any incorrect results in word.
1638  for (int i = 0; i < word->length(); ++i) {
1639  if (word->unichar_id(i) != blob_choice->unichar_id())
1640  word->set_unichar_id(blob_choice->unichar_id(), i);
1641  }
1642 }
1643 
// Repeated-character fix-up for the word under page_res_it: finds the most
// frequent character in the word's best choice, picks the best-rated
// classifier choice for it, and rewrites the whole word to that character.
// If no classifier choice for that character exists anywhere in the word, a
// message is printed and the word is left unchanged (and not marked done).
// NOTE(review): the declaration line of this function (and its original
// header comment, listing lines 1644-1651) is absent from this copy.
1652  WERD_RES *word_res = page_res_it->word();
1653  const WERD_CHOICE &word = *(word_res->best_choice);
1654
1655  // Find the frequency of each unique character in the word.
1656  SortHelper<UNICHAR_ID> rep_ch(word.length());
1657  for (int i = 0; i < word.length(); ++i) {
1658  rep_ch.Add(word.unichar_id(i), 1);
1659  }
1660
1661  // Find the most frequent result.
1662  UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1663  int max_count = rep_ch.MaxCount(&maxch_id);
1664  // Find the best exemplar of a classifier result for maxch_id.
1665  BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
1666  if (best_choice == NULL) {
1667  tprintf("Failed to find a choice for %s, occurring %d times\n",
1668  word_res->uch_set->debug_str(maxch_id).string(), max_count);
1669  return;
1670  }
1671  word_res->done = TRUE;
1672
1673  // Measure the mean space.
// NOTE(review): gap and gap_count are computed in this loop but never read
// afterwards within this function, so the measurement currently has no
// effect on the result.
1674  int gap_count = 0;
1675  WERD* werd = word_res->word;
1676  C_BLOB_IT blob_it(werd->cblob_list());
1677  C_BLOB* prev_blob = blob_it.data();
1678  for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1679  C_BLOB* blob = blob_it.data();
1680  int gap = blob->bounding_box().left();
1681  gap -= prev_blob->bounding_box().right();
1682  ++gap_count;
1683  prev_blob = blob;
1684  }
1685  // Just correct existing classification.
1686  CorrectRepcharChoices(best_choice, word_res);
// Start again with a fresh reject map, one entry per character of the word.
1687  word_res->reject_map.initialise(word.length());
1688 }
1689 
// Classifies the string s (with per-character byte lengths in `lengths`) as
// one of the AC_* word types: upper case, initial cap, lower case, upper- or
// lower-case abbreviation, or AC_UNACCEPTABLE.
// The accepted shape is: optional single leading punctuation, a run of
// letters (optionally with one hyphen or a trailing "'s" for lower-case
// words), then up to two trailing punctuation characters. Anything that does
// not fit is re-examined as an abbreviation of the form "X.Y.Z.".
// NOTE(review): the declaration line of this function and the declaration of
// word_type (listing lines 1690 and 1697) are absent from this copy; from the
// uses below word_type presumably starts as AC_UNACCEPTABLE - confirm.
1691  const UNICHARSET& char_set, const char *s, const char *lengths) {
// i indexes characters (entries of lengths); offset indexes bytes of s.
1692  int i = 0;
1693  int offset = 0;
1694  int leading_punct_count;
1695  int upper_count = 0;
1696  int hyphen_pos = -1;
1698
// Words of more than 20 characters are returned with the initial word_type.
1699  if (strlen (lengths) > 20)
1700  return word_type;
1701
1702  /* Single Leading punctuation char*/
1703
1704  if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
1705  offset += lengths[i++];
1706  leading_punct_count = i;
1707
1708  /* Initial cap */
1709  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
1710  offset += lengths[i++];
1711  upper_count++;
1712  }
1713  if (upper_count > 1) {
1714  word_type = AC_UPPER_CASE;
1715  } else {
1716  /* Lower case word, possibly with an initial cap */
1717  while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
1718  offset += lengths[i++];
1719  }
// Too few leading alphabetic characters: go straight to the abbreviation test.
1720  if (i - leading_punct_count < quality_min_initial_alphas_reqd)
1721  goto not_a_word;
1722  /*
1723  Allow a single hyphen in a lower case word
1724  - don't trust upper case - I've seen several cases of "H" -> "I-I"
1725  */
1726  if (lengths[i] == 1 && s[offset] == '-') {
1727  hyphen_pos = i;
1728  offset += lengths[i++];
1729  if (s[offset] != '\0') {
1730  while ((s[offset] != '\0') &&
1731  char_set.get_islower(s + offset, lengths[i])) {
1732  offset += lengths[i++];
1733  }
// Require at least two lower-case characters after the hyphen.
1734  if (i < hyphen_pos + 3)
1735  goto not_a_word;
1736  }
1737  } else {
1738  /* Allow "'s" in NON hyphenated lower case words */
1739  if (lengths[i] == 1 && (s[offset] == '\'') &&
1740  lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
1741  offset += lengths[i++];
1742  offset += lengths[i++];
1743  }
1744  }
1745  if (upper_count > 0)
1746  word_type = AC_INITIAL_CAP;
1747  else
1748  word_type = AC_LOWER_CASE;
1749  }
1750
1751  /* Up to two different, constrained trailing punctuation chars */
1752  if (lengths[i] == 1 && s[offset] != '\0' &&
1753  STRING(chs_trailing_punct1).contains(s[offset]))
1754  offset += lengths[i++];
// The second trailing punctuation must differ from the character before it.
1755  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
1756  s[offset - lengths[i - 1]] != s[offset] &&
1757  STRING(chs_trailing_punct2).contains (s[offset]))
1758  offset += lengths[i++];
1759
// Anything left over after the allowed pattern makes the word unacceptable.
1760  if (s[offset] != '\0')
1761  word_type = AC_UNACCEPTABLE;
1762
1763  not_a_word:
1764
1765  if (word_type == AC_UNACCEPTABLE) {
1766  /* Look for abbreviation string */
1767  i = 0;
1768  offset = 0;
1769  if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
1770  word_type = AC_UC_ABBREV;
// Consume repeated <upper-case letter><'.'> pairs.
1771  while (s[offset] != '\0' &&
1772  char_set.get_isupper(s + offset, lengths[i]) &&
1773  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1774  offset += lengths[i++];
1775  offset += lengths[i++];
1776  }
1777  }
1778  else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
1779  word_type = AC_LC_ABBREV;
// Consume repeated <lower-case letter><'.'> pairs.
1780  while (s[offset] != '\0' &&
1781  char_set.get_islower(s + offset, lengths[i]) &&
1782  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1783  offset += lengths[i++];
1784  offset += lengths[i++];
1785  }
1786  }
// The abbreviation pattern must cover the whole string.
1787  if (s[offset] != '\0')
1788  word_type = AC_UNACCEPTABLE;
1789  }
1790
1791  return word_type;
1792 }
1793 
// Debug checkpoint. If test_pt is set and the word's bounding box contains
// (test_pt_x, test_pt_y), turns on rejection and x-height debugging, prints a
// label for `location` plus the word's current state, and returns TRUE.
// Otherwise returns FALSE, leaving both debug settings switched off.
// A negative location returns TRUE for a matching word without printing
// anything or enabling the debug settings.
// NOTE(review): the declaration line of this function (listing line 1794) is
// absent from this copy; callers invoke it as check_debug_pt(word, location).
1795  BOOL8 show_map_detail = FALSE;
1796  inT16 i;
1797
1798  if (!test_pt)
1799  return FALSE;
1800
// Debug output is off by default; it is re-enabled below only for the word
// that contains the test point.
1801  tessedit_rejection_debug.set_value (FALSE);
1802  debug_x_ht_level.set_value(0);
1803
1804  if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
1805  if (location < 0)
1806  return TRUE; // For breakpoint use
1807  tessedit_rejection_debug.set_value (TRUE);
1808  debug_x_ht_level.set_value(2);
1809  tprintf ("\n\nTESTWD::");
// Each location code labels one stage of the processing pipeline; codes not
// listed here print no label.
1810  switch (location) {
1811  case 0:
1812  tprintf ("classify_word_pass1 start\n");
1813  word->word->print();
1814  break;
1815  case 10:
1816  tprintf ("make_reject_map: initial map");
1817  break;
1818  case 20:
1819  tprintf ("make_reject_map: after NN");
1820  break;
1821  case 30:
1822  tprintf ("classify_word_pass2 - START");
1823  break;
1824  case 40:
1825  tprintf ("classify_word_pass2 - Pre Xht");
1826  break;
1827  case 50:
1828  tprintf ("classify_word_pass2 - END");
1829  show_map_detail = TRUE;
1830  break;
1831  case 60:
1832  tprintf ("fixspace");
1833  break;
1834  case 70:
1835  tprintf ("MM pass START");
1836  break;
1837  case 80:
1838  tprintf ("MM pass END");
1839  break;
1840  case 90:
1841  tprintf ("After Poor quality rejection");
1842  break;
1843  case 100:
1844  tprintf ("unrej_good_quality_words - START");
1845  break;
1846  case 110:
1847  tprintf ("unrej_good_quality_words - END");
1848  break;
1849  case 120:
1850  tprintf ("Write results pass");
1851  show_map_detail = TRUE;
1852  break;
1853  }
1854  if (word->best_choice != NULL) {
1855  tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
1856  word->reject_map.print(debug_fp);
1857  tprintf("\n");
1858  if (show_map_detail) {
1859  tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
// NOTE(review): i walks the bytes of unichar_string() and is also used to
// index reject_map; if reject_map holds one entry per character rather than
// per byte, the two diverge for multi-byte text - confirm.
1860  for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1861  tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1862  word->reject_map[i].full_print(debug_fp);
1863  }
1864  }
1865  } else {
1866  tprintf("null best choice\n");
1867  }
1868  tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1869  tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1870  return TRUE;
1871  } else {
1872  return FALSE;
1873  }
1874 }
1875 
1881 static void find_modal_font( // find the modal font and subtract its reported count from the stats
1882  STATS *fonts, // font stats; modified when non-empty
1883  inT16 *font_out, // output: modal font, or -1 if the stats are empty
1884  int8_t *font_count // output: pile count of the modal font, capped at MAX_INT8; 0 if empty
1885  ) {
1886  inT16 font; // font index
1887  inT32 count; // pile count
1888 
1889  if (fonts->get_total () > 0) {
1890  font = (inT16) fonts->mode ();
1891  *font_out = font;
1892  count = fonts->pile_count (font);
1893  *font_count = count < MAX_INT8 ? count : MAX_INT8; // clamp so the count fits in int8_t
1894  fonts->add (font, -*font_count); // subtract the (possibly clamped) count from the mode's pile
1895  }
1896  else {
1897  *font_out = -1; // no entries: report "no font"
1898  *font_count = 0;
1899  }
1900 }
1901 
// Body of set_word_fonts(WERD_RES *word); the signature line (1907) is not
// present in this listing. Sums the per-font scores over the blob choices of
// the word's best choice, then records the two highest-scoring fonts, their
// vote counts and the italic/bold indicators on the word.
1908  // Don't try to set the word fonts for an lstm word, as the configs
1909  // will be meaningless.
1910  if (word->chopped_word == NULL) return;
1911  ASSERT_HOST(word->best_choice != NULL);
1912 
1913  int fontinfo_size = get_fontinfo_table().size();
1914  if (fontinfo_size == 0) return; // no fonts known: leave the word untouched
1915  GenericVector<int> font_total_score; // one accumulator per font id, all starting at 0
1916  font_total_score.init_to_size(fontinfo_size, 0);
1917 
1918  word->italic = 0;
1919  word->bold = 0;
1920  // Compute the font scores for the word
1921  if (tessedit_debug_fonts) {
1922  tprintf("Examining fonts in %s\n",
1923  word->best_choice->debug_string().string());
1924  }
1925  for (int b = 0; b < word->best_choice->length(); ++b) {
1926  BLOB_CHOICE* choice = word->GetBlobChoice(b);
1927  if (choice == NULL) continue; // nothing to score at this position
1928  const GenericVector<ScoredFont>& fonts = choice->fonts();
1929  for (int f = 0; f < fonts.size(); ++f) {
1930  int fontinfo_id = fonts[f].fontinfo_id;
1931  if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) { // ignore ids outside the table
1932  font_total_score[fontinfo_id] += fonts[f].score;
1933  }
1934  }
1935  }
1936  // Find the top and 2nd choice for the word.
1937  int score1 = 0, score2 = 0; // only fonts with a strictly positive total can be selected
1938  inT16 font_id1 = -1, font_id2 = -1; // -1 means no such choice
1939  for (int f = 0; f < fontinfo_size; ++f) {
1940  if (tessedit_debug_fonts && font_total_score[f] > 0) {
1941  tprintf("Font %s, total score = %d\n",
1942  fontinfo_table_.get(f).name, font_total_score[f]);
1943  }
1944  if (font_total_score[f] > score1) { // new best: the previous best becomes the 2nd choice
1945  score2 = score1;
1946  font_id2 = font_id1;
1947  score1 = font_total_score[f];
1948  font_id1 = f;
1949  } else if (font_total_score[f] > score2) {
1950  score2 = font_total_score[f];
1951  font_id2 = f;
1952  }
1953  }
1954  word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : NULL;
1955  word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL;
1956  // Each score has a limit of MAX_UINT16, so divide by that to get the number
1957  // of "votes" for that font, ie number of perfect scores.
1958  word->fontinfo_id_count = ClipToRange(score1 / MAX_UINT16, 1, MAX_INT8); // lower bound is 1, even when score1 is 0
1959  word->fontinfo_id2_count = ClipToRange(score2 / MAX_UINT16, 0, MAX_INT8);
1960  if (score1 > 0) {
1961  FontInfo fi = fontinfo_table_.get(font_id1);
1962  if (tessedit_debug_fonts) {
1963  if (word->fontinfo_id2_count > 0) {
1964  tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n", // the values printed as "score" are the vote counts
1965  fi.name, word->fontinfo_id_count,
1966  fontinfo_table_.get(font_id2).name,
1967  word->fontinfo_id2_count);
1968  } else {
1969  tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
1970  fi.name, word->fontinfo_id_count);
1971  }
1972  }
1973  word->italic = (fi.is_italic() ? 1 : -1) * word->fontinfo_id_count; // sign encodes italic/not, magnitude is the vote count
1974  word->bold = (fi.is_bold() ? 1 : -1) * word->fontinfo_id_count; // same encoding for bold
1975  }
1976 }
1977 
1978 
// Body of font_recognition_pass(PAGE_RES *page_res); the signature line (1985)
// is not present in this listing. Collects font id statistics over all words,
// finds the modal font, and assigns it to every word whose own font count is
// weak relative to the word length.
1986  PAGE_RES_IT page_res_it(page_res);
1987  WERD_RES *word; // current word
1988  STATS doc_fonts(0, font_table_size_); // font counters
1989 
1990  // Gather font id statistics.
1991  for (page_res_it.restart_page(); page_res_it.word() != NULL;
1992  page_res_it.forward()) {
1993  word = page_res_it.word();
1994  if (word->fontinfo != NULL) {
1995  doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count); // weight by the word's vote count
1996  }
1997  if (word->fontinfo2 != NULL) {
1998  doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
1999  }
2000  }
2001  inT16 doc_font; // modal font
2002  int8_t doc_font_count; // modal font count
2003  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2004  if (doc_font_count == 0) // no modal font found: leave all words as they are
2005  return;
2006  // Get the modal font pointer.
2007  const FontInfo* modal_font = NULL;
2008  for (page_res_it.restart_page(); page_res_it.word() != NULL;
2009  page_res_it.forward()) {
2010  word = page_res_it.word();
2011  if (word->fontinfo != NULL && word->fontinfo->universal_id == doc_font) {
2012  modal_font = word->fontinfo;
2013  break;
2014  }
2015  if (word->fontinfo2 != NULL && word->fontinfo2->universal_id == doc_font) {
2016  modal_font = word->fontinfo2;
2017  break;
2018  }
2019  }
2020  ASSERT_HOST(modal_font != NULL); // the modal id was gathered from these same words, so one is expected to match
2021 
2022  // Assign modal font to weak words.
2023  for (page_res_it.restart_page(); page_res_it.word() != NULL;
2024  page_res_it.forward()) {
2025  word = page_res_it.word();
2026  int length = word->best_choice->length(); // NOTE(review): best_choice is dereferenced without a null check - confirm it is always set here
2027 
2028  int count = word->fontinfo_id_count;
2029  if (!(count == length || (length > 3 && count >= length * 3 / 4))) { // keep the word's font only if count equals length, or length > 3 and count >= length * 3 / 4 (integer arithmetic)
2030  word->fontinfo = modal_font;
2031  // Counts only get 1 as it came from the doc.
2032  word->fontinfo_id_count = 1;
2033  word->italic = modal_font->is_italic() ? 1 : -1;
2034  word->bold = modal_font->is_bold() ? 1 : -1;
2035  }
2036  }
2037 }
2038 
2039 // If a word has multiple alternates check if the best choice is in the
2040 // dictionary. If not, replace it with an alternate that exists in the
2041 // dictionary.
// Body of dictionary_correction_pass(PAGE_RES *page_res); the signature line
// (2042) is not present in this listing. For each word that has more than one
// choice and whose best choice fails the dictionary check, the first choice in
// best_choices that passes the check replaces the best choice.
2043  PAGE_RES_IT word_it(page_res);
2044  for (WERD_RES* word = word_it.word(); word != NULL;
2045  word = word_it.forward()) {
2046  if (word->best_choices.singleton())
2047  continue; // There are no alternates.
2048 
2049  WERD_CHOICE* best = word->best_choice;
2050  if (word->tesseract->getDict().valid_word(*best) != 0)
2051  continue; // The best choice is in the dictionary.
2052 
2053  WERD_CHOICE_IT choice_it(&word->best_choices);
2054  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list(); // one full cycle over the choices list
2055  choice_it.forward()) {
2056  WERD_CHOICE* alternate = choice_it.data();
2057  if (word->tesseract->getDict().valid_word(*alternate)) {
2058  // The alternate choice is in the dictionary.
2059  if (tessedit_bigram_debug) { // the replacement is reported under the bigram debug flag
2060  tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
2061  best->unichar_string().string(),
2062  alternate->unichar_string().string());
2063  }
2064  // Replace the 'best' choice with a better choice.
2065  word->ReplaceBestChoice(alternate);
2066  break; // stop at the first valid alternate
2067  }
2068  }
2069  }
2070 }
2071 
2072 } // namespace tesseract
void add_str_int(const char *str, int number)
Definition: strngs.cpp:381
bool deadline_exceeded() const
Definition: ocrclass.h:158
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:269
void CopyTruth(const BlamerBundle &other)
Definition: blamer.h:187
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:64
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1362
inT32 char_count
Definition: pageres.h:60
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:787
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:68
#define LOC_WRITE_RESULTS
Definition: errcode.h:54
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
Definition: control.cpp:210
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1468
Unacceptable word.
Definition: control.h:36
#define TRUE
Definition: capi.h:45
const GenericVector< tesseract::ScoredFont > & fonts() const
Definition: ratngs.h:91
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
Definition: points.h:189
inT32 get_total() const
Definition: statistc.h:86
float max_x_height() const
Definition: ratngs.h:337
int32_t inT32
Definition: host.h:38
volatile inT8 ocr_alive
true if not last
Definition: ocrclass.h:123
bool classify_bln_numeric_mode
Definition: classify.h:499
void plot(ScrollView *window)
Definition: blobs.cpp:916
float min_x_height() const
Definition: ratngs.h:334
void init_to_size(int size, T t)
void print(FILE *fp)
Definition: rejctmap.cpp:391
int UNICHAR_ID
Definition: unichar.h:33
void fix_fuzzy_spaces(ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:48
ParamsVectors * params()
Definition: ccutil.h:62
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE *> &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1122
float baseline_shift
Definition: pageres.h:297
BOOL8 tess_failed
Definition: pageres.h:272
static const double kXHeightCapRatio
Definition: ccstruct.h:37
const char *const kBackUpConfigFile
Definition: control.cpp:54
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:79
const FontInfo * fontinfo2
Definition: pageres.h:289
WERD_CHOICE * best_choice
Definition: pageres.h:219
Dict & getDict()
Definition: classify.h:65
int length() const
Definition: ratngs.h:301
inT8 bold
Definition: pageres.h:286
ACCEPTABLE_WERD_TYPE
Definition: control.h:34
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1285
void SetScriptPositions()
Definition: pageres.cpp:853
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
Definition: textord.cpp:325
BlamerBundle * blamer_bundle
Definition: pageres.h:230
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:455
#define MAX_INT32
Definition: host.h:62
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB *> *target_blobs)
Definition: control.cpp:1046
bool IsAllSpaces() const
Definition: ratngs.h:519
voidpf void uLong size
Definition: ioapi.h:39
bool is_bold() const
Definition: fontinfo.h:112
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:1985
bool x_overlap(const TBOX &box) const
Definition: rect.h:391
void script_pos_pass(PAGE_RES *page_res)
Definition: control.cpp:718
Definition: werd.h:36
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:101
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1323
void print()
Definition: werd.cpp:266
const double kMinRefitXHeightFraction
Definition: control.cpp:57
int push_back(T object)
float rating() const
Definition: ratngs.h:79
const UNICHARSET & GetUnicharset() const
inT32 rej_count
Definition: pageres.h:61
const STRING debug_string() const
Definition: ratngs.h:503
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1794
TWERD * rebuild_word
Definition: pageres.h:244
#define tprintf(...)
Definition: tprintf.h:31
void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:143
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:836
BLOCK * block
Definition: pageres.h:99
void rej_stat_word()
Definition: pageres.cpp:1675
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
Definition: linerec.cpp:224
a.b.c.
Definition: control.h:40
void BestChoiceToCorrectText()
Definition: pageres.cpp:918
const char * string() const
Definition: strngs.cpp:198
inT8 italic
Definition: pageres.h:285
void * cancel_this
called whenever progress increases
Definition: ocrclass.h:127
ROW * row
Definition: pageres.h:127
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
Definition: control.cpp:927
bool AdaptiveClassifierIsFull() const
Definition: classify.h:283
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1434
voidpf uLong offset
Definition: ioapi.h:42
bool empty() const
Definition: genericvector.h:90
#define SUBLOC_NORM
Definition: errcode.h:59
const STRING & misadaption_debug() const
Definition: blamer.h:119
float x_height() const
Definition: ocrrow.h:61
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:344
#define LOC_MM_ADAPT
Definition: errcode.h:52
inT32 length() const
Definition: strngs.cpp:193
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
inT16 word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:65
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:1413
inT32 pile_count(inT32 value) const
Definition: statistc.h:78
int size() const
Definition: genericvector.h:72
FILE * debug_fp
Definition: tessvars.cpp:24
TBOX bounding_box() const
Definition: werd.cpp:160
tesseract::Tesseract * tesseract
Definition: pageres.h:266
void fix_hyphens()
Definition: pageres.cpp:1042
int16_t inT16
Definition: host.h:36
tesseract::BoxWord * box_word
Definition: pageres.h:250
BOOL8 guessed_x_ht
Definition: pageres.h:292
ROW_RES * row() const
Definition: pageres.h:739
void set_word_fonts(WERD_RES *word)
Definition: control.cpp:1907
#define ASSERT_HOST(x)
Definition: errcode.h:84
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
inT16 left() const
Definition: rect.h:68
STRING lang
Definition: ccutil.h:66
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:106
inT32 length() const
Definition: rejctmap.h:235
uinT8 permuter() const
Definition: ratngs.h:344
WERD_RES * restart_page()
Definition: pageres.h:683
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:450
bool IsText() const
Definition: polyblk.h:52
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:783
void dictionary_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:2042
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:119
void fix_quotes()
Definition: pageres.cpp:1013
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:113
PROGRESS_FUNC progress_callback
returns true to cancel
Definition: ocrclass.h:126
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:284
PAGE_RES * page_res
Definition: pageres.h:661
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1519
WERD_RES * forward()
Definition: pageres.h:716
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:694
inT16 reject_count()
Definition: rejctmap.h:241
inT32 x_height() const
return xheight
Definition: ocrblock.h:110
bool tessedit_enable_bigram_correction
unsigned char BOOL8
Definition: host.h:44
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:31
BOOL8 recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:82
Definition: strngs.h:45
float caps_height
Definition: pageres.h:296
WERD_CHOICE_LIST best_choices
Definition: pageres.h:227
#define FALSE
Definition: capi.h:46
void Add(T value, int count)
Definition: sorthelper.h:65
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE *> &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
Definition: control.cpp:1207
void set_global_subloc_code(int loc_code)
Definition: globaloc.cpp:85
const FontInfo * fontinfo
Definition: pageres.h:288
static void Update()
Definition: scrollview.cpp:715
BOOL8 contains(const char c) const
Definition: strngs.cpp:189
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:69
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:375
Pix * BestPix() const
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:122
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:888
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:598
WERD_CHOICE * raw_choice
Definition: pageres.h:224
BOOL8 tess_would_adapt
Definition: pageres.h:281
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:630
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:765
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:40
BOOL8 tess_accepted
Definition: pageres.h:280
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:69
EXTERN ScrollView * fx_win
Definition: drawfx.cpp:51
bool contains(const FCOORD pt) const
Definition: rect.h:323
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1249
UNICHARSET unicharset
Definition: ccutil.h:68
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
void DeleteCurrentWord()
Definition: pageres.cpp:1451
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:392
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:125
float certainty() const
Definition: ratngs.h:328
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:97
Definition: werd.h:35
BOOL8 part_of_combo
Definition: pageres.h:319
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:547
inT8 fontinfo_id2_count
Definition: pageres.h:291
void ResetWordIterator()
Definition: pageres.cpp:1534
void clear_fx_win()
Definition: drawfx.cpp:73
int length() const
Definition: genericvector.h:85
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB *> *target_blobs)
Definition: control.cpp:993
void add(inT32 value, inT32 count)
Definition: statistc.cpp:101
inT16 top() const
Definition: rect.h:54
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:791
void GetNoiseOutlines(GenericVector< C_OUTLINE *> *outlines)
Definition: werd.cpp:530
void rej_word_bad_quality()
Definition: rejctmap.cpp:485
#define MAX(x, y)
Definition: ndminx.h:24
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:79
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:357
inT32 mode() const
Definition: statistc.cpp:115
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1651
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
Definition: control.cpp:151
A.B.C.
Definition: control.h:41
const STRING & unichar_string() const
Definition: ratngs.h:539
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:614
#define LOC_DOC_BLK_REJ
Definition: errcode.h:53
Definition: rect.h:30
T & back() const
bool AddSelectedOutlines(const GenericVector< bool > &wanted, const GenericVector< C_BLOB *> &target_blobs, const GenericVector< C_OUTLINE *> &outlines, bool *make_next_word_fuzzy)
Definition: werd.cpp:548
#define MIN(x, y)
Definition: ndminx.h:28
POLY_BLOCK * poly_block() const
Definition: pdblock.h:55
inT16 word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:77
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1690
ALL but initial lc.
Definition: control.h:39
#define LOC_FUZZY_SPACE
Definition: errcode.h:50
TBOX bounding_box() const
Definition: stepblob.cpp:250
void full_print(FILE *fp)
Definition: rejctmap.cpp:403
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
double classify_max_rating_ratio
Definition: classify.h:401
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1576
void PrintBestChoices() const
Definition: pageres.cpp:709
inT8 fontinfo_id_count
Definition: pageres.h:290
float y() const
Definition: points.h:212
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:742
bool small_caps
Definition: pageres.h:283
WERD * word
Definition: pageres.h:175
inT16 right() const
Definition: rect.h:75
PointerVector< WERD_RES > lang_words
void tess_segment_pass_n(int pass_n, WERD_RES *word)
Definition: tessbox.cpp:39
BLOB_CHOICE_LIST * GetBlobChoices(int index) const
Definition: pageres.cpp:751
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
Definition: statistc.h:33
bool is_italic() const
Definition: fontinfo.h:111
void print() const
Definition: rect.h:270
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:294
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:95
#define MAX_INT8
Definition: host.h:60
WERD_RES * word() const
Definition: pageres.h:736
bool AnyTessLang() const
bool wordrec_run_blamer
Definition: wordrec.h:168
inT16 bottom() const
Definition: rect.h:61
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:757
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:174
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:67
ALL lower case.
Definition: control.h:37
void(Tesseract::* WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:465
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1270
bool IsAmbiguous()
Definition: pageres.cpp:444
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:160
ALL upper case.
Definition: control.h:38
bool wordrec_debug_blamer
Definition: wordrec.h:167
const UNICHARSET * uch_set
Definition: pageres.h:192
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:458
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:56
double classify_max_certainty_margin
Definition: classify.h:403
inT16 progress
chars in this buffer(0)
Definition: ocrclass.h:118
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:244
void initialise(inT16 length)
Definition: rejctmap.cpp:318
bool right_to_left() const
Definition: werd.h:60
TWERD * chopped_word
Definition: pageres.h:201
REJMAP reject_map
Definition: pageres.h:271
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:487
bool SubAndSuperscriptFix(WERD_RES *word_res)
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:300
bool top_bottom_useful() const
Definition: unicharset.h:497
GenericVector< STRING > misadaption_log
Definition: pageres.h:73
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:574
Definition: ocrrow.h:32
int count(LIST var_list)
Definition: oldlist.cpp:103
GenericVector< int > blame_reasons
Definition: pageres.h:68
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:173
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
Definition: werd.cpp:137
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1484
FCOORD classify_rotation() const
Definition: ocrblock.h:144
void create_fx_win()
Definition: drawfx.cpp:60
float x_height
Definition: pageres.h:295
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:121
Definition: ocrblock.h:30
#define MAX_UINT16
Definition: host.h:64
bool AnyLSTMLang() const
bool script_has_xheight() const
Definition: unicharset.h:863
TBOX bounding_box() const
Definition: blobs.cpp:879
void PrerecAllWordsPar(const GenericVector< WordData > &words)
Definition: par_control.cpp:39
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:318
BLOCK_RES * block() const
Definition: pageres.h:742
void make_reject_map(WERD_RES *word, ROW *row, inT16 pass)
BOOL8 done
Definition: pageres.h:282
int length() const
Definition: boxword.h:85
BOOL8 word_adaptable(WERD_RES *word, uinT16 mode)
Definition: adaptions.cpp:45
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:415
float rating() const
Definition: ratngs.h:325