tesseract  4.00.00dev
tessedit.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: tessedit.cpp (Formerly tessedit.c)
3  * Description: (Previously) Main program for merge of tess and editor.
4  * Now just code to load the language model and various
5  * engine-specific data files.
6  * Author: Ray Smith
7  * Created: Tue Jan 07 15:21:46 GMT 1992
8  *
9  * (C) Copyright 1992, Hewlett-Packard Ltd.
10  ** Licensed under the Apache License, Version 2.0 (the "License");
11  ** you may not use this file except in compliance with the License.
12  ** You may obtain a copy of the License at
13  ** http://www.apache.org/licenses/LICENSE-2.0
14  ** Unless required by applicable law or agreed to in writing, software
15  ** distributed under the License is distributed on an "AS IS" BASIS,
16  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
17  ** See the License for the specific language governing permissions and
18  ** limitations under the License.
19  *
20  **********************************************************************/
21 
22 // Include automatically generated configuration file if running autoconf.
23 #ifdef HAVE_CONFIG_H
24 #include "config_auto.h"
25 #endif
26 
27 #include "stderr.h"
28 #include "basedir.h"
29 #include "tessvars.h"
30 #include "control.h"
31 #include "reject.h"
32 #include "pageres.h"
33 #include "nwmain.h"
34 #include "pgedit.h"
35 #include "tprintf.h"
36 #include "tessedit.h"
37 #include "stopper.h"
38 #include "intmatcher.h"
39 #include "chop.h"
40 #include "efio.h"
41 #include "danerror.h"
42 #include "globals.h"
43 #ifndef ANDROID_BUILD
44 #include "lstmrecognizer.h"
45 #endif
46 #include "tesseractclass.h"
47 #include "params.h"
48 
49 #define VARDIR "configs/" /*variables files */
50  // config under api
51 #define API_CONFIG "configs/api_config"
52 
53 ETEXT_DESC *global_monitor = NULL; // progress monitor
54 
55 namespace tesseract {
56 
57 // Read a "config" file containing a set of variable, value pairs.
58 // Searches the standard places: tessdata/configs, tessdata/tessconfigs
59 // and also accepts a relative or absolute path name.
61  SetParamConstraint constraint) {
62  STRING path = datadir;
63  path += "configs/";
64  path += filename;
65  FILE* fp;
66  if ((fp = fopen(path.string(), "rb")) != NULL) {
67  fclose(fp);
68  } else {
69  path = datadir;
70  path += "tessconfigs/";
71  path += filename;
72  if ((fp = fopen(path.string(), "rb")) != NULL) {
73  fclose(fp);
74  } else {
75  path = filename;
76  }
77  }
78  ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
79 }
80 
81 // Returns false if a unicharset file for the specified language was not found
82 // or was invalid.
83 // This function initializes TessdataManager. After TessdataManager is
84 // no longer needed, TessdataManager::End() should be called.
85 //
86 // This function sets tessedit_oem_mode to the given OcrEngineMode oem, unless
87 // it is OEM_DEFAULT, in which case the value of the variable will be obtained
88 // from the language-specific config file (stored in [lang].traineddata), from
89 // the config files specified on the command line or left as the default
90 // OEM_TESSERACT_ONLY if none of the configs specify this variable.
92  const char *arg0, const char *textbase, const char *language,
93  OcrEngineMode oem, char **configs, int configs_size,
94  const GenericVector<STRING> *vars_vec,
95  const GenericVector<STRING> *vars_values, bool set_only_non_debug_params,
96  TessdataManager *mgr) {
97  // Set the basename, compute the data directory.
98  main_setup(arg0, textbase);
99 
100  // Set the language data path prefix
101  lang = language != NULL ? language : "eng";
105 
106  // Initialize TessdataManager.
107  STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
108  if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string())) {
109  // Try without tessdata.
110  m_data_sub_dir.set_value("");
111  main_setup(arg0, textbase);
115  tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
116  if (!mgr->Init(tessdata_path.string())) {
117  tprintf("Error opening data file %s\n", tessdata_path.string());
118  tprintf(
119  "Please make sure the TESSDATA_PREFIX environment variable is set"
120  " to your \"tessdata\" directory.\n");
121  return false;
122  }
123  }
124  if (oem == OEM_DEFAULT) {
125  // Set the engine mode from availability, which can then be overridden by
126  // the config file when we read it below.
127  if (!mgr->IsLSTMAvailable()) {
129  } else if (!mgr->IsBaseAvailable()) {
131  } else {
133  }
134  }
135 
136  // If a language specific config file (lang.config) exists, load it in.
137  TFile fp;
138  if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
140  this->params());
141  }
142 
143  SetParamConstraint set_params_constraint = set_only_non_debug_params ?
145  // Load tesseract variables from config files. This is done after loading
146  // language-specific variables from [lang].traineddata file, so that custom
147  // config files can override values in [lang].traineddata file.
148  for (int i = 0; i < configs_size; ++i) {
149  read_config_file(configs[i], set_params_constraint);
150  }
151 
152  // Set params specified in vars_vec (done after setting params from config
153  // files, so that params in vars_vec can override those from files).
154  if (vars_vec != NULL && vars_values != NULL) {
155  for (int i = 0; i < vars_vec->size(); ++i) {
156  if (!ParamUtils::SetParam((*vars_vec)[i].string(),
157  (*vars_values)[i].string(),
158  set_params_constraint, this->params())) {
159  tprintf("Error setting param %s\n", (*vars_vec)[i].string());
160  exit(1);
161  }
162  }
163  }
164 
165  if (((STRING &)tessedit_write_params_to_file).length() > 0) {
166  FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
167  if (params_file != NULL) {
168  ParamUtils::PrintParams(params_file, this->params());
169  fclose(params_file);
170  } else {
171  tprintf("Failed to open %s for writing params.\n",
172  tessedit_write_params_to_file.string());
173  }
174  }
175 
176  // Determine which ocr engine(s) should be loaded and used for recognition.
177  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
178 
179  // If we are only loading the config file (and so not planning on doing any
180  // recognition) then there's nothing else to do here.
182  return true;
183  }
184 
185 // The various OcrEngineMode settings (see publictypes.h) determine which
186 // engine-specific data files need to be loaded.
187 // If LSTM_ONLY is requested, the base Tesseract files are *Not* required.
188 #ifndef ANDROID_BUILD
191  if (mgr->GetComponent(TESSDATA_LSTM, &fp)) {
192  lstm_recognizer_ = new LSTMRecognizer;
193  ASSERT_HOST(lstm_recognizer_->DeSerialize(&fp));
194  if (lstm_use_matrix) lstm_recognizer_->LoadDictionary(language, mgr);
195  } else {
196  tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
198  }
199  }
200 #endif
201 
202  // Load the unicharset
204  // Avoid requiring a unicharset when we aren't running base tesseract.
205 #ifndef ANDROID_BUILD
206  unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
207 #endif
208  } else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) ||
209  !unicharset.load_from_file(&fp, false)) {
210  return false;
211  }
212  if (unicharset.size() > MAX_NUM_CLASSES) {
213  tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
214  return false;
215  }
216  right_to_left_ = unicharset.major_right_to_left();
217 
218  // Setup initial unichar ambigs table and read universal ambigs.
219  UNICHARSET encoder_unicharset;
220  encoder_unicharset.CopyFrom(unicharset);
222  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
223 
225  unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp,
228  }
229  // Init ParamsModel.
230  // Load pass1 and pass2 weights (for now these two sets are the same, but in
231  // the future separate sets of weights can be generated).
232  for (int p = ParamsModel::PTRAIN_PASS1;
235  static_cast<ParamsModel::PassEnum>(p));
236  if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
238  return false;
239  }
240  }
241  }
242 
243  return true;
244 }
245 
246 // Helper returns true if the given string is in the vector of strings.
247 static bool IsStrInList(const STRING& str,
248  const GenericVector<STRING>& str_list) {
249  for (int i = 0; i < str_list.size(); ++i) {
250  if (str_list[i] == str)
251  return true;
252  }
253  return false;
254 }
255 
256 // Parse a string of the form [~]<lang>[+[~]<lang>]*.
257 // Langs with no prefix get appended to to_load, provided they
258 // are not in there already.
259 // Langs with ~ prefix get appended to not_to_load, provided they are not in
260 // there already.
261 void Tesseract::ParseLanguageString(const char* lang_str,
262  GenericVector<STRING>* to_load,
263  GenericVector<STRING>* not_to_load) {
264  STRING remains(lang_str);
265  while (remains.length() > 0) {
266  // Find the start of the lang code and which vector to add to.
267  const char* start = remains.string();
268  while (*start == '+')
269  ++start;
270  GenericVector<STRING>* target = to_load;
271  if (*start == '~') {
272  target = not_to_load;
273  ++start;
274  }
275  // Find the index of the end of the lang code in string start.
276  int end = strlen(start);
277  const char* plus = strchr(start, '+');
278  if (plus != NULL && plus - start < end)
279  end = plus - start;
280  STRING lang_code(start);
281  lang_code.truncate_at(end);
282  STRING next(start + end);
283  remains = next;
284  // Check whether lang_code is already in the target vector and add.
285  if (!IsStrInList(lang_code, *target)) {
286  target->push_back(lang_code);
287  }
288  }
289 }
290 
291 // Initialize for potentially a set of languages defined by the language
292 // string and recursively any additional languages required by any language
293 // traineddata file (via tessedit_load_sublangs in its config) that is loaded.
294 // See init_tesseract_internal for args.
// The language string is split by ParseLanguageString; languages listed with
// a '~' prefix end up in langs_not_to_load and are skipped.
// Any Tesseract objects previously held in sub_langs_ are deleted first.
// Returns 0 if a primary language was loaded, -1 if no language could be
// loaded. A sub-language that fails to load is reported and dropped; it does
// not make this function fail.
295 int Tesseract::init_tesseract(const char *arg0, const char *textbase,
296  const char *language, OcrEngineMode oem,
297  char **configs, int configs_size,
298  const GenericVector<STRING> *vars_vec,
299  const GenericVector<STRING> *vars_values,
300  bool set_only_non_debug_params,
301  TessdataManager *mgr) {
302  GenericVector<STRING> langs_to_load;
303  GenericVector<STRING> langs_not_to_load;
304  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
305 
306  sub_langs_.delete_data_pointers();
307  sub_langs_.clear();
308  // The first language that loads successfully is loaded into this object
309  // and becomes the primary language.
310  bool loaded_primary = false;
311  // Every language after the primary is loaded into a new Tesseract that is
  // stored in sub_langs_. langs_to_load can grow while we iterate, because
  // ParseLanguageString appends the sub-languages requested by each loaded
  // language, so size() is re-read on every pass.
312  for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
313  if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
314  const char *lang_str = langs_to_load[lang_index].string();
315  Tesseract *tess_to_init;
316  if (!loaded_primary) {
317  tess_to_init = this;
318  } else {
319  tess_to_init = new Tesseract;
320  }
321 
322  int result = tess_to_init->init_tesseract_internal(
323  arg0, textbase, lang_str, oem, configs, configs_size, vars_vec,
324  vars_values, set_only_non_debug_params, mgr);
325  // Forget that language, but keep any reader we were given.
326  mgr->Clear();
327 
328  if (!loaded_primary) {
  // Still looking for a primary: on failure, the next candidate language
  // is again loaded into this object.
329  if (result < 0) {
330  tprintf("Failed loading language '%s'\n", lang_str);
331  } else {
332  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
333  &langs_to_load, &langs_not_to_load);
334  loaded_primary = true;
335  }
336  } else {
337  if (result < 0) {
338  tprintf("Failed loading language '%s'\n", lang_str);
  // The sub-language object was allocated above and is not kept.
339  delete tess_to_init;
340  } else {
341  sub_langs_.push_back(tess_to_init);
342  // Add any languages that this language requires
343  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
344  &langs_to_load, &langs_not_to_load);
345  }
346  }
347  }
348  }
349  if (!loaded_primary) {
350  tprintf("Tesseract couldn't load any languages!\n");
351  return -1; // Couldn't load any language!
352  }
  // NOTE(review): this listing omits original lines 359, 362, 366 and 373, so
  // the condition selecting between the two branches below, the argument
  // passed to Copy(), and the statements just before each loop and before the
  // final return are not visible here - confirm against the real file.
353  if (!sub_langs_.empty()) {
354  // In multilingual mode word ratings have to be directly comparable,
355  // so use the same language model weights for all languages:
356  // use the primary language's params model if
357  // tessedit_use_primary_params_model is set,
358  // otherwise use default language model weights.
360  for (int s = 0; s < sub_langs_.size(); ++s) {
361  sub_langs_[s]->language_model_->getParamsModel().Copy(
363  }
364  tprintf("Using params model of the primary language\n");
365  } else {
367  for (int s = 0; s < sub_langs_.size(); ++s) {
368  sub_langs_[s]->language_model_->getParamsModel().Clear();
369  }
370  }
371  }
372 
374  return 0;
375 }
376 
377 // Common initialization for a single language.
378 // arg0 is the datapath for the tessdata directory, which could be the
379 // path of the tessdata directory with no trailing /, or (if tessdata
380 // lives in the same directory as the executable) the path of the executable,
381 // hence the name arg0.
382 // textbase is an optional output file basename (used only for training)
383 // language is the language code to load.
384 // oem controls which engine(s) will operate on the image
385 // configs (argv) is an array of config filenames to load variables from.
386 // May be NULL.
387 // configs_size (argc) is the number of elements in configs.
388 // vars_vec is an optional vector of variables to set.
389 // vars_values is an optional corresponding vector of values for the variables
390 // in vars_vec.
391 // set_only_non_debug_params is forwarded to init_tesseract_lang_data, where
392 // it selects the SetParamConstraint used when setting params.
// mgr is the TessdataManager forwarded to init_tesseract_lang_data and,
// depending on the init_tesseract flag below, to program_editup.
// Returns -1 if init_tesseract_lang_data fails, otherwise 0.
393 int Tesseract::init_tesseract_internal(const char *arg0, const char *textbase,
394  const char *language, OcrEngineMode oem,
395  char **configs, int configs_size,
396  const GenericVector<STRING> *vars_vec,
397  const GenericVector<STRING> *vars_values,
398  bool set_only_non_debug_params,
399  TessdataManager *mgr) {
400  if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
401  configs_size, vars_vec, vars_values,
402  set_only_non_debug_params, mgr)) {
403  return -1;
404  }
  // NOTE(review): this listing omits original line 405 (the condition guarding
  // the early return below) and line 410 (presumably where the init_tesseract
  // flag used below is defined) - confirm against the real file.
406  return 0;
407  }
408  // If only LSTM will be used, skip loading Tesseract classifier's
409  // pre-trained templates and dictionary.
411  program_editup(textbase, init_tesseract ? mgr : nullptr,
412  init_tesseract ? mgr : nullptr);
413  return 0; // Normal exit
414 }
415 
416 // Helper builds the all_fonts table by adding new fonts from new_fonts.
417 static void CollectFonts(const UnicityTable<FontInfo>& new_fonts,
418  UnicityTable<FontInfo>* all_fonts) {
419  for (int i = 0; i < new_fonts.size(); ++i) {
420  // UnicityTable uniques as we go.
421  all_fonts->push_back(new_fonts.get(i));
422  }
423 }
424 
425 // Helper assigns an id to lang_fonts using the index in all_fonts table.
426 static void AssignIds(const UnicityTable<FontInfo>& all_fonts,
427  UnicityTable<FontInfo>* lang_fonts) {
428  for (int i = 0; i < lang_fonts->size(); ++i) {
429  int index = all_fonts.get_id(lang_fonts->get(i));
430  lang_fonts->get_mutable(i)->universal_id = index;
431  }
432 }
433 
434 // Set the universal_id member of each font to be unique among all
435 // instances of the same font loaded.
437  // Note that we can get away with bitwise copying FontInfo in
438  // all_fonts, as it is a temporary structure and we avoid setting the
439  // delete callback.
440  UnicityTable<FontInfo> all_fonts;
442 
443  // Create the universal ID table.
444  CollectFonts(get_fontinfo_table(), &all_fonts);
445  for (int i = 0; i < sub_langs_.size(); ++i) {
446  CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
447  }
448  // Assign ids from the table to each font table.
449  AssignIds(all_fonts, &get_fontinfo_table());
450  for (int i = 0; i < sub_langs_.size(); ++i) {
451  AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
452  }
453  font_table_size_ = all_fonts.size();
454 }
455 
456 // Initializes only the language-model (dictionary) component.
// Loads the language data through init_tesseract_lang_data using
// OEM_TESSERACT_ONLY, with no config files and no vars, then loads the
// dictionary for the member lang from mgr.
// arg0, textbase, language and mgr are forwarded unchanged to
// init_tesseract_lang_data.
// Returns -1 if init_tesseract_lang_data fails, otherwise 0.
457 int Tesseract::init_tesseract_lm(const char *arg0, const char *textbase,
458  const char *language, TessdataManager *mgr) {
459  if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
460  NULL, 0, NULL, NULL, false, mgr))
461  return -1;
  // NOTE(review): this listing omits original line 462, which sits between the
  // check above and the dictionary load below - confirm against the real file.
463  getDict().Load(lang, mgr);
464  getDict().FinishLoad();
465  return 0;
466 }
467 
469  end_recog();
470 }
471 
472 /* Define command type identifiers */
473 
475 {
480 };
481 } // namespace tesseract
int push_back(T object)
Add an element in the table.
void SetupUniversalFontIds()
Definition: tessedit.cpp:436
void truncate_at(inT32 index)
Definition: strngs.cpp:269
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params)
Definition: params.cpp:61
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:67
ParamsVectors * params()
Definition: ccutil.h:62
Dict & getDict()
Definition: classify.h:65
bool LoadFromFp(const char *lang, TFile *fp)
int size() const
Return the size used.
int push_back(T object)
const UNICHARSET & GetUnicharset() const
#define tprintf(...)
Definition: tprintf.h:31
int ambigs_debug_level
Definition: ccutil.h:85
const char * string() const
Definition: strngs.cpp:198
char * m_data_sub_dir
Definition: ccutil.h:80
STRING language_data_path_prefix
Definition: ccutil.h:67
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:120
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:344
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
Definition: tessedit.cpp:261
inT32 length() const
Definition: strngs.cpp:193
void main_setup(const char *argv0, const char *basename)
CCUtil::main_setup - set location of tessdata and name of image.
Definition: mainblk.cpp:53
int size() const
Definition: genericvector.h:72
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:91
bool LoadDictionary(const char *lang, TessdataManager *mgr)
#define ASSERT_HOST(x)
Definition: errcode.h:84
STRING lang
Definition: ccutil.h:66
T * get_mutable(int id)
void program_editup(const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
Definition: tface.cpp:46
bool use_ambigs_for_adaption
Definition: ccutil.h:89
int get_id(T object) const
int init_tesseract_internal(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:393
Definition: strngs.h:45
bool GetComponent(TessdataType type, TFile *fp)
ParamsModel & getParamsModel()
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:295
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:40
UNICHARSET unicharset
Definition: ccutil.h:68
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:91
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:348
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:60
SetParamConstraint
Definition: params.h:36
ETEXT_DESC * global_monitor
Definition: tessedit.cpp:53
const char * filename
Definition: ioapi.h:38
char * tessedit_write_params_to_file
void set_compare_callback(TessResultCallback2< bool, T const &, T const &> *cb)
int size() const
Definition: unicharset.h:299
STRING datadir
Definition: ccutil.h:64
const T & get(int id) const
Return the object from an id.
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:198
LanguageModel * language_model_
Definition: wordrec.h:410
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:69
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:206
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:224
bool Init(const char *data_file_name)
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
Definition: ambigs.cpp:53
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:173
bool FinishLoad()
Definition: dict.cpp:327
void SetPass(PassEnum pass)
Definition: params_model.h:72
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:423
int init_tesseract_lm(const char *arg0, const char *textbase, const char *language, TessdataManager *mgr)
Definition: tessedit.cpp:457
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:74
bool major_right_to_left() const
Definition: unicharset.cpp:933