tesseract  4.00.00dev
output.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: output.cpp (Formerly output.c)
3  * Description: Output pass
4  * Author: Phil Cheatle
5  * Created: Thu Aug 4 10:56:08 BST 1994
6  *
7  * (C) Copyright 1994, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #include <string.h>
25 #include <ctype.h>
26 #ifdef __UNIX__
27 #include <assert.h>
28 #include <unistd.h>
29 #include <errno.h>
30 #endif
31 #include "helpers.h"
32 #include "tessvars.h"
33 #include "control.h"
34 #include "reject.h"
35 #include "docqual.h"
36 #include "output.h"
37 #include "globals.h"
38 #include "tesseractclass.h"
39 
40 #define EPAPER_EXT ".ep"
41 #define PAGE_YSIZE 3508
42 #define CTRL_INSET '\024' //dc4=text inset
43 #define CTRL_FONT '\016' //so=font change
44 #define CTRL_DEFAULT '\017' //si=default font
45 #define CTRL_SHIFT '\022' //dc2=x shift
46 #define CTRL_TAB '\011' //tab
47 #define CTRL_NEWLINE '\012' //newline
48 #define CTRL_HARDLINE '\015' //cr
49 
50 /**********************************************************************
51  * pixels_to_pts
52  *
53  * Convert an integer number of pixels to the nearest integer
54  * number of points.
55  **********************************************************************/
56 
// Converts a length in pixels to the nearest whole number of points
// (1 point = 1/72 inch) for an image resolution of pix_res pixels per inch.
//
//   pixels   length in pixels; may be negative.
//   pix_res  resolution in pixels per inch; must be non-zero (not checked).
//
// Returns the value rounded to the nearest integer, ties away from zero.
inT32 pixels_to_pts(  // convert coords
    inT32 pixels,
    inT32 pix_res  // resolution
    ) {
  // Work in double: a float's 24-bit mantissa cannot represent every inT32
  // exactly, so large pixel counts used to come back off by a few points.
  const double pts = pixels * 72.0 / pix_res;

  // The integer conversion truncates toward zero, so adding 0.5 only rounds
  // correctly for non-negative values (-10.0 + 0.5 would truncate to -9).
  // Push negative values the other way instead.
  return static_cast<inT32>(pts < 0.0 ? pts - 0.5 : pts + 0.5);
}
66 
67 namespace tesseract {
68 void Tesseract::output_pass( //Tess output pass //send to api
69  PAGE_RES_IT &page_res_it,
70  const TBOX *target_word_box) {
71  BLOCK_RES *block_of_last_word;
72  BOOL8 force_eol; //During output
73  BLOCK *nextblock; //block of next word
74  WERD *nextword; //next word
75 
76  page_res_it.restart_page ();
77  block_of_last_word = NULL;
78  while (page_res_it.word () != NULL) {
79  check_debug_pt (page_res_it.word (), 120);
80 
81  if (target_word_box) {
82  TBOX current_word_box = page_res_it.word()->word->bounding_box();
83  FCOORD center_pt(
84  (current_word_box.right() + current_word_box.left()) / 2,
85  (current_word_box.bottom() + current_word_box.top()) / 2);
86  if (!target_word_box->contains(center_pt)) {
87  page_res_it.forward();
88  continue;
89  }
90  }
92  block_of_last_word != page_res_it.block ()) {
93  block_of_last_word = page_res_it.block ();
94  }
95 
96  force_eol = (tessedit_write_block_separators &&
97  (page_res_it.block () != page_res_it.next_block ())) ||
98  (page_res_it.next_word () == NULL);
99 
100  if (page_res_it.next_word () != NULL)
101  nextword = page_res_it.next_word ()->word;
102  else
103  nextword = NULL;
104  if (page_res_it.next_block () != NULL)
105  nextblock = page_res_it.next_block ()->block;
106  else
107  nextblock = NULL;
108  //regardless of tilde crunching
109  write_results(page_res_it,
110  determine_newline_type(page_res_it.word()->word,
111  page_res_it.block()->block,
112  nextword, nextblock), force_eol);
113  page_res_it.forward();
114  }
115 }
116 
117 
118 /*************************************************************************
119  * write_results()
120  *
121  * All recognition and rejection has now been done. Generate the following:
122  * .txt file - giving the final best choices with NO highlighting
123  * .raw file - giving the tesseract top choice output for each word
124  * .map file - showing how the .txt file has been rejected in the .ep file
125  * epchoice list - a list of one element per word, containing the text for the
126  * epaper. Reject strings are inserted.
127  * inset list - a list of bounding boxes of reject insets - indexed by the
128  * reject strings in the epchoice text.
129  *************************************************************************/
// write_results(): per-word bookkeeping of the output pass. The visible code
// only updates the stats_ flags and the word's reject_map; it writes no text.
//
// NOTE(review): this listing has lost several lines of the function (the
// embedded listing numbers skip 130, 141, 188, 195, 213, 219 and 226), so the
// block does not compile as shown. The cross-reference index at the end of
// the file gives the signature (listing line 130) as
//   void write_results(PAGE_RES_IT &page_res_it, char newline_type,
//                      BOOL8 force_eol)
// The other missing lines are condition fragments and must be restored from
// the upstream source; each gap is marked below.
131  char newline_type, // type of newline
132  BOOL8 force_eol) { // override tilde crunch?
133  WERD_RES *word = page_res_it.word();
134  const UNICHARSET &uchset = *word->uch_set;
135  int i;
136  BOOL8 need_reject = FALSE;
// Id of the " " unichar; compared against best_choice entries below.
137  UNICHAR_ID space = uchset.unichar_to_id(" ");
138 
// Early-exit path: entered for words that are crunched
// (unlv_crunch_mode != CR_NONE) or whose best_choice is empty, subject to the
// further condition on the missing line.
139  if ((word->unlv_crunch_mode != CR_NONE ||
140  word->best_choice->length() == 0) &&
// NOTE(review): listing line 141 (the remainder of this condition and the
// opening brace of the branch that ends at listing line 174) is missing here.
142  if ((word->unlv_crunch_mode != CR_DELETE) &&
143  (!stats_.tilde_crunch_written ||
144  ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
145  (word->word->space () > 0) &&
146  !word->word->flag (W_FUZZY_NON) &&
147  !word->word->flag (W_FUZZY_SP)))) {
// A non-fuzzy space before a word that is not at the beginning of a line
// clears last_char_was_tilde.
148  if (!word->word->flag (W_BOL) &&
149  (word->word->space () > 0) &&
150  !word->word->flag (W_FUZZY_NON) &&
151  !word->word->flag (W_FUZZY_SP)) {
152  stats_.last_char_was_tilde = false;
153  }
154  need_reject = TRUE;
155  }
// Record that a reject marker is due. Only the flags are updated here.
156  if ((need_reject && !stats_.last_char_was_tilde) ||
157  (force_eol && stats_.write_results_empty_block)) {
158  /* Write a reject char - mark as rejected unless zero_rejection mode */
159  stats_.last_char_was_tilde = TRUE;
160  stats_.tilde_crunch_written = true;
161  stats_.last_char_was_newline = false;
162  stats_.write_results_empty_block = false;
163  }
164 
// End of line (or forced end of line): reset the per-line flags.
165  if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
166  stats_.tilde_crunch_written = false;
167  stats_.last_char_was_newline = true;
168  stats_.last_char_was_tilde = false;
169  }
170 
171  if (force_eol)
172  stats_.write_results_empty_block = true;
173  return;
174  }
175 
176  /* NORMAL PROCESSING of non tilde crunched words */
177 
178  stats_.tilde_crunch_written = false;
// newline_type is non-zero when the word ends a line.
179  if (newline_type)
180  stats_.last_char_was_newline = true;
181  else
182  stats_.last_char_was_newline = false;
183  stats_.write_results_empty_block = force_eol; // about to write a real word
184 
185  if (unlv_tilde_crunching &&
186  stats_.last_char_was_tilde &&
187  (word->word->space() == 0) &&
// NOTE(review): listing line 188 (one more conjunct of this condition) is
// missing here.
189  (word->best_choice->unichar_id(0) == space)) {
190  /* Prevent adjacent tilde across words - we know that adjacent tildes within
191  words have been removed */
192  word->MergeAdjacentBlobs(0);
193  }
194  if (newline_type ||
// NOTE(review): listing line 195 (the second operand of this `||` and the
// closing parenthesis) is missing here.
196  stats_.last_char_was_tilde = false;
197  else {
// last_char_was_tilde becomes true when the best_choice entry at index
// reject_map.length() - 1 is the space unichar.
198  if (word->reject_map.length () > 0) {
199  if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
200  stats_.last_char_was_tilde = true;
201  else
202  stats_.last_char_was_tilde = false;
203  }
204  else if (word->word->space () > 0)
205  stats_.last_char_was_tilde = false;
206  /* else it is unchanged as there are no output chars */
207  }
208 
// The choice and its reject map must have one entry per character.
209  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
210 
211  set_unlv_suspects(word);
212  check_debug_pt (word, 120);
// NOTE(review): listing line 213 (the statement opening the block that is
// closed at listing line 217) is missing here.
214  tprintf ("Dict word: \"%s\": %d\n",
215  word->best_choice->debug_string().string(),
216  dict_word(*(word->best_choice)));
217  }
// The rejection overrides below are skipped for repeated-character words
// when tessedit_write_rep_codes is set.
218  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
// NOTE(review): listing line 219 (the condition guarding the first override
// loop) is missing here.
220  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
221  for (i = 0; i < word->best_choice->length(); ++i) {
222  if (word->reject_map[i].rejected())
223  word->reject_map[i].setrej_minimal_rej_accept();
224  }
225  }
// NOTE(review): listing line 226 (the condition guarding the second override
// loop) is missing here. This loop differs from the first only in leaving
// characters equal to the space unichar rejected.
227  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
228  for (i = 0; i < word->best_choice->length(); ++i) {
229  if ((word->best_choice->unichar_id(i) != space) &&
230  word->reject_map[i].rejected())
231  word->reject_map[i].setrej_minimal_rej_accept();
232  }
233  }
234  }
235 }
236 } // namespace tesseract
237 
238 /**********************************************************************
239  * determine_newline_type
240  *
241  * Find whether we have a wrapping or hard newline.
242  * Return FALSE if not at end of line.
243  **********************************************************************/
244 
245 char determine_newline_type( //test line ends
246  WERD *word, //word to do
247  BLOCK *block, //current block
248  WERD *next_word, //next word
249  BLOCK *next_block //block of next word
250  ) {
251  inT16 end_gap; //to right edge
252  inT16 width; //of next word
253  TBOX word_box; //bounding
254  TBOX next_box; //next word
255  TBOX block_box; //block bounding
256 
257  if (!word->flag (W_EOL))
258  return FALSE; //not end of line
259  if (next_word == NULL || next_block == NULL || block != next_block)
260  return CTRL_NEWLINE;
261  if (next_word->space () > 0)
262  return CTRL_HARDLINE; //it is tabbed
263  word_box = word->bounding_box ();
264  next_box = next_word->bounding_box ();
265  block_box = block->bounding_box ();
266  //gap to eol
267  end_gap = block_box.right () - word_box.right ();
268  end_gap -= (inT32) block->space ();
269  width = next_box.right () - next_box.left ();
270  // tprintf("end_gap=%d-%d=%d, width=%d-%d=%d, nl=%d\n",
271  // block_box.right(),word_box.right(),end_gap,
272  // next_box.right(),next_box.left(),width,
273  // end_gap>width ? CTRL_HARDLINE : CTRL_NEWLINE);
274  return end_gap > width ? CTRL_HARDLINE : CTRL_NEWLINE;
275 }
276 
277 /*************************************************************************
278  * get_rep_char()
279  * Return the first accepted character from the repetition string. This is the
280  * character which is repeated - as determined earlier by fix_rep_char()
281  *************************************************************************/
282 namespace tesseract {
283 UNICHAR_ID Tesseract::get_rep_char(WERD_RES *word) { // what char is repeated?
284  int i;
285  for (i = 0; ((i < word->reject_map.length()) &&
286  (word->reject_map[i].rejected())); ++i);
287 
288  if (i < word->reject_map.length()) {
289  return word->best_choice->unichar_id(i);
290  } else {
291  return word->uch_set->unichar_to_id(unrecognised_char.string());
292  }
293 }
294 
295 /*************************************************************************
296  * SUSPECT LEVELS
297  *
298  * 0 - don't reject ANYTHING
299  * 1,2 - partial rejection
300  * 3 - BEST
301  *
302  * NOTE: to reject JUST tess failures in the .map file set suspect_level 3 and
303  * tessedit_minimal_rejection.
304  *************************************************************************/
306  int len = word_res->reject_map.length();
307  const WERD_CHOICE &word = *(word_res->best_choice);
308  const UNICHARSET &uchset = *word.unicharset();
309  int i;
310  float rating_per_ch;
311 
312  if (suspect_level == 0) {
313  for (i = 0; i < len; i++) {
314  if (word_res->reject_map[i].rejected())
315  word_res->reject_map[i].setrej_minimal_rej_accept();
316  }
317  return;
318  }
319 
320  if (suspect_level >= 3)
321  return; //Use defaults
322 
323  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
324 
325  if (safe_dict_word(word_res) &&
326  (count_alphas(word) > suspect_short_words)) {
327  /* Unreject alphas in dictionary words */
328  for (i = 0; i < len; ++i) {
329  if (word_res->reject_map[i].rejected() &&
330  uchset.get_isalpha(word.unichar_id(i)))
331  word_res->reject_map[i].setrej_minimal_rej_accept();
332  }
333  }
334 
335  rating_per_ch = word.rating() / word_res->reject_map.length();
336 
337  if (rating_per_ch >= suspect_rating_per_ch)
338  return; // Don't touch bad ratings
339 
340  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
341  /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
342  for (i = 0; i < len; ++i) {
343  if (word_res->reject_map[i].rejected() &&
344  (!uchset.eq(word.unichar_id(i), " ")))
345  word_res->reject_map[i].setrej_minimal_rej_accept();
346  }
347  }
348 
349  for (i = 0; i < len; i++) {
350  if (word_res->reject_map[i].rejected()) {
351  if (word_res->reject_map[i].flag(R_DOC_REJ))
352  word_res->reject_map[i].setrej_minimal_rej_accept();
353  if (word_res->reject_map[i].flag(R_BLOCK_REJ))
354  word_res->reject_map[i].setrej_minimal_rej_accept();
355  if (word_res->reject_map[i].flag(R_ROW_REJ))
356  word_res->reject_map[i].setrej_minimal_rej_accept();
357  }
358  }
359 
360  if (suspect_level == 2)
361  return;
362 
363  if (!suspect_constrain_1Il ||
364  (word_res->reject_map.length() <= suspect_short_words)) {
365  for (i = 0; i < len; i++) {
366  if (word_res->reject_map[i].rejected()) {
367  if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
368  word_res->reject_map[i].flag(R_POSTNN_1IL)))
369  word_res->reject_map[i].setrej_minimal_rej_accept();
370 
371  if (!suspect_constrain_1Il &&
372  word_res->reject_map[i].flag(R_MM_REJECT))
373  word_res->reject_map[i].setrej_minimal_rej_accept();
374  }
375  }
376  }
377 
378  if (acceptable_word_string(*word_res->uch_set,
379  word.unichar_string().string(),
380  word.unichar_lengths().string()) !=
381  AC_UNACCEPTABLE ||
383  word.unichar_lengths().string())) {
384  if (word_res->reject_map.length() > suspect_short_words) {
385  for (i = 0; i < len; i++) {
386  if (word_res->reject_map[i].rejected() &&
387  (!word_res->reject_map[i].perm_rejected() ||
388  word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
389  word_res->reject_map[i].flag (R_POSTNN_1IL) ||
390  word_res->reject_map[i].flag (R_MM_REJECT))) {
391  word_res->reject_map[i].setrej_minimal_rej_accept();
392  }
393  }
394  }
395  }
396 }
397 
399  int count = 0;
400  for (int i = 0; i < word.length(); ++i) {
401  if (word.unicharset()->get_isalpha(word.unichar_id(i)))
402  count++;
403  }
404  return count;
405 }
406 
407 
409  int count = 0;
410  for (int i = 0; i < word.length(); ++i) {
411  if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
412  word.unicharset()->get_isdigit(word.unichar_id(i)))
413  count++;
414  }
415  return count;
416 }
417 
418 
420  const char *lengths) {
421  BOOL8 prev_digit = FALSE;
422 
423  if (*lengths == 1 && *s == '(')
424  s++;
425 
426  if (*lengths == 1 &&
427  ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
428  s++;
429 
430  for (; *s != '\0'; s += *(lengths++)) {
431  if (unicharset.get_isdigit(s, *lengths))
432  prev_digit = TRUE;
433  else if (prev_digit &&
434  (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
435  prev_digit = FALSE;
436  else if (prev_digit && *lengths == 1 &&
437  (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
438  return TRUE;
439  else if (prev_digit &&
440  *lengths == 1 && (*s == '%') &&
441  (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
442  (*(s + *lengths + *(lengths + 1)) == '\0'))
443  return TRUE;
444  else
445  return FALSE;
446  }
447  return TRUE;
448 }
449 } // namespace tesseract
#define CTRL_HARDLINE
Definition: output.cpp:48
inT16 count_alphas(const WERD_CHOICE &word)
Definition: output.cpp:398
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:68
UNICHAR_ID get_rep_char(WERD_RES *word)
Definition: output.cpp:283
WERD_RES * next_word() const
Definition: pageres.h:745
Unacceptable word.
Definition: control.h:36
#define TRUE
Definition: capi.h:45
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
Definition: points.h:189
int32_t inT32
Definition: host.h:38
int UNICHAR_ID
Definition: unichar.h:33
WERD_CHOICE * best_choice
Definition: pageres.h:219
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59
int length() const
Definition: ratngs.h:301
const STRING & unichar_lengths() const
Definition: ratngs.h:546
Definition: werd.h:36
inT32 pixels_to_pts(inT32 pixels, inT32 pix_res)
Definition: output.cpp:57
const STRING debug_string() const
Definition: ratngs.h:503
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1794
#define tprintf(...)
Definition: tprintf.h:31
BLOCK * block
Definition: pageres.h:99
const char * string() const
Definition: strngs.cpp:198
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:305
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:969
TBOX bounding_box() const
Definition: werd.cpp:160
int16_t inT16
Definition: host.h:36
#define ASSERT_HOST(x)
Definition: errcode.h:84
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
inT16 left() const
Definition: rect.h:68
inT32 length() const
Definition: rejctmap.h:235
WERD_RES * restart_page()
Definition: pageres.h:683
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:607
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
WERD_RES * forward()
Definition: pageres.h:716
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
unsigned char BOOL8
Definition: host.h:44
#define FALSE
Definition: capi.h:46
inT16 count_alphanums(const WERD_CHOICE &word)
Definition: output.cpp:408
BOOL8 tess_accepted
Definition: pageres.h:280
void write_results(PAGE_RES_IT &page_res_it, char newline_type, BOOL8 force_eol)
Definition: output.cpp:130
bool contains(const FCOORD pt) const
Definition: rect.h:323
UNICHARSET unicharset
Definition: ccutil.h:68
Definition: werd.h:35
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294
inT16 top() const
Definition: rect.h:54
BLOCK_RES * next_block() const
Definition: pageres.h:751
const UNICHARSET * unicharset() const
Definition: ratngs.h:298
BOOL8 acceptable_number_string(const char *s, const char *lengths)
Definition: output.cpp:419
const STRING & unichar_string() const
Definition: ratngs.h:539
Definition: rect.h:30
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1690
#define CTRL_NEWLINE
Definition: output.cpp:47
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:656
WERD * word
Definition: pageres.h:175
inT16 right() const
Definition: rect.h:75
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:128
WERD_RES * word() const
Definition: pageres.h:736
inT16 bottom() const
Definition: rect.h:61
const UNICHARSET * uch_set
Definition: pageres.h:192
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
Definition: werd.h:60
REJMAP reject_map
Definition: pageres.h:271
char determine_newline_type(WERD *word, BLOCK *block, WERD *next_word, BLOCK *next_block)
Definition: output.cpp:245
uinT8 space()
Definition: werd.h:104
int count(LIST var_list)
Definition: oldlist.cpp:103
inT16 space() const
return spacing
Definition: ocrblock.h:102
Definition: ocrblock.h:30
BLOCK_RES * block() const
Definition: pageres.h:742
float rating() const
Definition: ratngs.h:325