tesseract  4.00.00dev
ambiguous_words.cpp File Reference
#include <stdio.h>
#include "baseapi.h"
#include "helpers.h"
#include "strngs.h"
#include "dict.h"
#include "tesseractclass.h"

Go to the source code of this file.

Functions

int main (int argc, char **argv)
 

Function Documentation

◆ main()

int main ( int  argc,
char **  argv 
)

This program reads in a text file consisting of feature samples from a training page in the following format:

   FontName UTF8-char-str xmin ymin xmax ymax page-number
    NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...

The result of this program is a binary inttemp file used by the OCR engine.

Parameters
argc  number of command line arguments
argv  array of command line arguments
Returns
none
Note
Exceptions: none
History: Fri Aug 18 08:56:17 1989, DSJ, Created.
History: Mon May 18 1998, Christy Russson, Revision started.

Definition at line 32 of file ambiguous_words.cpp.

32  {
33 
34  // Parse input arguments.
35  if (argc != 4 && (argc != 6 || strcmp(argv[1], "-l") != 0)) {
36  printf("Usage: %s [-l lang] tessdata_dir wordlist_file"
37  " output_ambiguious_wordlist_file\n", argv[0]);
38  return 1;
39  }
40  int argv_offset = 0;
41  STRING lang;
42  if (argc == 6) {
43  lang = argv[2];
44  argv_offset = 2;
45  } else {
46  lang = "eng";
47  }
48  const char *tessdata_dir = argv[++argv_offset];
49  const char *input_file_str = argv[++argv_offset];
50  const char *output_file_str = argv[++argv_offset];
51 
52  // Initialize Tesseract.
54  GenericVector<STRING> vars_vec;
55  GenericVector<STRING> vars_values;
56  vars_vec.push_back("output_ambig_words_file");
57  vars_values.push_back(output_file_str);
58  api.Init(tessdata_dir, lang.string(), tesseract::OEM_TESSERACT_ONLY, nullptr,
59  0, &vars_vec, &vars_values, false);
60  tesseract::Dict &dict = api.tesseract()->getDict();
61  FILE *input_file = fopen(input_file_str, "rb");
62  if (input_file == nullptr) {
63  tprintf("Failed to open input wordlist file %s\n", input_file_str);
64  exit(1);
65  }
66  char str[CHARS_PER_LINE];
67 
68  // Read word list and call Dict::NoDangerousAmbig() for each word
69  // to record ambiguities in the output file.
70  while (fgets(str, CHARS_PER_LINE, input_file) != nullptr) {
71  chomp_string(str); // remove newline
72  WERD_CHOICE word(str, dict.getUnicharset());
73  dict.NoDangerousAmbig(&word, nullptr, false, nullptr);
74  }
75  // Clean up.
76  fclose(input_file);
77 }
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
Dict & getDict()
Definition: classify.h:65
int Init(const char *datapath, const char *language, OcrEngineMode mode, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_non_debug_params)
Definition: baseapi.cpp:278
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
Definition: stopper.cpp:151
void chomp_string(char *str)
Definition: helpers.h:82
Definition: strngs.h:45
Tesseract * tesseract() const
Definition: baseapi.h:769
#define CHARS_PER_LINE
Definition: cutil.h:57