tesseract  4.00.00dev
fixxht.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: fixxht.cpp (Formerly fixxht.c)
3  * Description: Improve x_ht and look out for case inconsistencies
4  * Author: Phil Cheatle
5  * Created: Thu Aug 5 14:11:08 BST 1993
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <string.h>
21 #include <ctype.h>
22 #include "params.h"
23 #include "float2int.h"
24 #include "tesseractclass.h"
25 
26 namespace tesseract {
27 
28 // Fixxht overview.
29 // Premise: Initial estimate of x-height is adequate most of the time, but
30 // occasionally it is incorrect. Most notable causes of failure are:
31 // 1. Small caps, where the top of the caps is the same as the body text
32 // xheight. For small caps words the xheight needs to be reduced to correctly
33 // recognize the caps in the small caps word.
34 // 2. All xheight lines, such as summer. Here the initial estimate will have
35 // guessed that the blob tops are caps and will have placed the xheight too low.
36 // 3. Noise/logos beside words, or changes in font size on a line. Such
37 // things can blow the statistics and cause an incorrect estimate.
38 // 4. Incorrect baseline. Can happen when 2 columns are incorrectly merged.
39 // In this case the x-height is often still correct.
40 //
41 // Algorithm.
42 // Compare the vertical position (top only) of alphanumerics in a word with
43 // the range of positions in training data (in the unicharset).
44 // See CountMisfitTops. If any characters disagree sufficiently with the
45 // initial xheight estimate, then recalculate the xheight, re-run OCR on
46 // the word, and if the number of vertical misfits goes down, along with
47 // either the word rating or certainty, then keep the new xheight.
48 // The new xheight is calculated as follows (see ComputeCompatibleXheight):
49 // For each alphanumeric character that has a vertically misplaced top
50 // (a misfit), yet its bottom is within the acceptable range (ie it is not
51 // likely a sub-or super-script) calculate the range of acceptable xheight
52 // positions from its range of tops, and give each value in the range a
53 // number of votes equal to the distance of its top from its acceptance range.
54 // The x-height position with the median of the votes becomes the new
55 // x-height. This assumes that most characters will be correctly recognized
56 // even if the x-height is incorrect. This is not a terrible assumption, but
57 // it is not great. An improvement would be to use a classifier that does
58 // not care about vertical position or scaling at all.
59 // Separately collect stats on shifted baselines and apply the same logic to
60 // computing a best-fit shift to fix the error. If the baseline needs to be
61 // shifted, but the x-height is OK, returns the original x-height along with
62 // the baseline shift to indicate that recognition needs to re-run.
63 
64 // If the spread (max_top - min_top) of a unicharset char's top range exceeds
65 // kMaxCharTopRange, that char is skipped: it neither counts as a misfit nor
65 // votes for a new x-height or baseline shift.
66 const int kMaxCharTopRange = 48;
67 
68 // Returns the number of misfit blob tops in this word.
70  int bad_blobs = 0;
71  int num_blobs = word_res->rebuild_word->NumBlobs();
72  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
73  TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
74  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
75  if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
76  int top = blob->bounding_box().top();
77  if (top >= INT_FEAT_RANGE)
78  top = INT_FEAT_RANGE - 1;
79  int min_bottom, max_bottom, min_top, max_top;
80  unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
81  &min_top, &max_top);
82  if (max_top - min_top > kMaxCharTopRange)
83  continue;
84  bool bad = top < min_top - x_ht_acceptance_tolerance ||
85  top > max_top + x_ht_acceptance_tolerance;
86  if (bad)
87  ++bad_blobs;
88  if (debug_x_ht_level >= 1) {
89  tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
90  unicharset.id_to_unichar(class_id),
91  bad ? "Misfit" : "OK", top, min_top, max_top,
92  static_cast<int>(x_ht_acceptance_tolerance));
93  }
94  }
95  }
96  return bad_blobs;
97 }
98 
99 // Returns a new x-height maximally compatible with the result in word_res.
100 // See comment above for overall algorithm.
102  float* baseline_shift) {
103  STATS top_stats(0, MAX_UINT8);
104  STATS shift_stats(-MAX_UINT8, MAX_UINT8);
105  int bottom_shift = 0;
106  int num_blobs = word_res->rebuild_word->NumBlobs();
107  do {
108  top_stats.clear();
109  shift_stats.clear();
110  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
111  TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
112  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
113  if (unicharset.get_isalpha(class_id) ||
114  unicharset.get_isdigit(class_id)) {
115  int top = blob->bounding_box().top() + bottom_shift;
116  // Clip the top to the limit of normalized feature space.
117  if (top >= INT_FEAT_RANGE)
118  top = INT_FEAT_RANGE - 1;
119  int bottom = blob->bounding_box().bottom() + bottom_shift;
120  int min_bottom, max_bottom, min_top, max_top;
121  unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
122  &min_top, &max_top);
123  // Chars with a wild top range would mess up the result so ignore them.
124  if (max_top - min_top > kMaxCharTopRange)
125  continue;
126  int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
127  top - (max_top + x_ht_acceptance_tolerance));
128  int height = top - kBlnBaselineOffset;
129  if (debug_x_ht_level >= 2) {
130  tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
131  unicharset.id_to_unichar(class_id),
132  height, min_bottom, max_bottom, min_top, max_top,
133  bottom, top);
134  }
135  // Use only chars that fit in the expected bottom range, and where
136  // the range of tops is sensibly near the xheight.
137  if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
138  bottom - x_ht_acceptance_tolerance <= max_bottom &&
139  min_top > kBlnBaselineOffset &&
140  max_top - kBlnBaselineOffset >= kBlnXHeight &&
141  misfit_dist > 0) {
142  // Compute the x-height position using proportionality between the
143  // actual height and expected height.
144  int min_xht = DivRounded(height * kBlnXHeight,
145  max_top - kBlnBaselineOffset);
146  int max_xht = DivRounded(height * kBlnXHeight,
147  min_top - kBlnBaselineOffset);
148  if (debug_x_ht_level >= 2) {
149  tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
150  }
151  // The range of expected heights gets a vote equal to the distance
152  // of the actual top from the expected top.
153  for (int y = min_xht; y <= max_xht; ++y)
154  top_stats.add(y, misfit_dist);
155  } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
156  bottom - x_ht_acceptance_tolerance > max_bottom) &&
157  bottom_shift == 0) {
158  // Get the range of required bottom shift.
159  int min_shift = min_bottom - bottom;
160  int max_shift = max_bottom - bottom;
161  if (debug_x_ht_level >= 2) {
162  tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
163  }
164  // The range of expected shifts gets a vote equal to the min distance
165  // of the actual bottom from the expected bottom, spread over the
166  // range of its acceptance.
167  int misfit_weight = abs(min_shift);
168  if (max_shift > min_shift)
169  misfit_weight /= max_shift - min_shift;
170  for (int y = min_shift; y <= max_shift; ++y)
171  shift_stats.add(y, misfit_weight);
172  } else {
173  if (bottom_shift == 0) {
174  // Things with bottoms that are already ok need to say so, on the
175  // 1st iteration only.
176  shift_stats.add(0, kBlnBaselineOffset);
177  }
178  if (debug_x_ht_level >= 2) {
179  tprintf(" already OK\n");
180  }
181  }
182  }
183  }
184  if (shift_stats.get_total() > top_stats.get_total()) {
185  bottom_shift = IntCastRounded(shift_stats.median());
186  if (debug_x_ht_level >= 2) {
187  tprintf("Applying bottom shift=%d\n", bottom_shift);
188  }
189  }
190  } while (bottom_shift != 0 &&
191  top_stats.get_total() < shift_stats.get_total());
192  // Baseline shift is opposite sign to the bottom shift.
193  *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
194  if (debug_x_ht_level >= 2) {
195  tprintf("baseline shift=%g\n", *baseline_shift);
196  }
197  if (top_stats.get_total() == 0)
198  return bottom_shift != 0 ? word_res->x_height : 0.0f;
199  // The new xheight is just the median vote, which is then scaled out
200  // of BLN space back to pixel space to get the x-height in pixel space.
201  float new_xht = top_stats.median();
202  if (debug_x_ht_level >= 2) {
203  tprintf("Median xht=%f\n", new_xht);
204  tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
205  new_xht, new_xht / word_res->denorm.y_scale());
206  }
207  // The xheight must change by at least x_ht_min_change to be used.
208  if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
209  return new_xht / word_res->denorm.y_scale();
210  else
211  return bottom_shift != 0 ? word_res->x_height : 0.0f;
212 }
213 
214 } // namespace tesseract
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
inT32 get_total() const
Definition: statistc.h:86
int UNICHAR_ID
Definition: unichar.h:33
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define MAX_UINT8
Definition: host.h:63
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:101
const int kBlnXHeight
Definition: normalis.h:28
TWERD * rebuild_word
Definition: pageres.h:244
#define tprintf(...)
Definition: tprintf.h:31
const int kBlnBaselineOffset
Definition: normalis.h:29
float y_scale() const
Definition: normalis.h:272
int IntCastRounded(double x)
Definition: helpers.h:179
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:528
double median() const
Definition: statistc.cpp:239
const int kMaxCharTopRange
Definition: fixxht.cpp:66
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:69
UNICHARSET unicharset
Definition: ccutil.h:68
void add(inT32 value, inT32 count)
Definition: statistc.cpp:101
int NumBlobs() const
Definition: blobs.h:425
inT16 top() const
Definition: rect.h:54
#define MAX(x, y)
Definition: ndminx.h:24
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
Definition: blobs.h:261
DENORM denorm
Definition: pageres.h:190
#define INT_FEAT_RANGE
Definition: float2int.h:27
Definition: statistc.h:33
inT16 bottom() const
Definition: rect.h:61
TBOX bounding_box() const
Definition: blobs.cpp:482
int DivRounded(int a, int b)
Definition: helpers.h:173
float x_height
Definition: pageres.h:295
void clear()
Definition: statistc.cpp:81