tesseract  4.00.00dev
combine_tessdata.cpp File Reference
#include "tessdatamanager.h"

Go to the source code of this file.

Functions

int main (int argc, char **argv)
 

Function Documentation

◆ main()

int main ( int  argc,
char **  argv 
)

This program reads in a text file consisting of feature samples from a training page in the following format:

   FontName UTF8-char-str xmin ymin xmax ymax page-number
    NumberOfFeatureTypes(N)
      FeatureTypeName1 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      FeatureTypeName2 NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
      ...
      FeatureTypeNameN NumberOfFeatures(M)
         Feature1
         ...
         FeatureM
   FontName CharName ...

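The result of this program is a binary inttemp file used by the OCR engine.

Note: the description above does not match the code shown in this listing. As the usage messages in the listing state, the program combines tessdata component files into a single traineddata file, extracts named components (-e), unpacks all components (-u), or overwrites components (-o) in an existing traineddata file.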

Parameters
argc: number of command line arguments
argv: array of command line arguments
Returns
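1 if the command line arguments are not recognized (the usage text is printed). In the -o mode the program calls exit(1) if the traineddata file cannot be renamed to a temporary file. Otherwise execution reaches the end of main without an explicit return statement, which in C++ is equivalent to returning 0.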
Note
Exceptions: none
History: Fri Aug 18 08:56:17 1989, DSJ, Created.
History: Mon May 18 1998, Christy Russson, Revision started.

Definition at line 66 of file combine_tessdata.cpp.

66  {
67  int i;
68  tesseract::TessdataManager tm;
69  if (argc == 2) {
70  printf("Combining tessdata files\n");
71  STRING lang = argv[1];
72  char* last = &argv[1][strlen(argv[1])-1];
73  if (*last != '.')
74  lang += '.';
75  STRING output_file = lang;
76  output_file += kTrainedDataSuffix;
77  if (!tm.CombineDataFiles(lang.string(), output_file.string())) {
78  printf("Error combining tessdata files into %s\n",
79  output_file.string());
80  } else {
81  printf("Output %s created successfully.\n", output_file.string());
82  }
83  } else if (argc >= 4 && (strcmp(argv[1], "-e") == 0 ||
84  strcmp(argv[1], "-u") == 0)) {
85  // Initialize TessdataManager with the data in the given traineddata file.
86  tm.Init(argv[2]);
87  printf("Extracting tessdata components from %s\n", argv[2]);
88  if (strcmp(argv[1], "-e") == 0) {
89  for (i = 3; i < argc; ++i) {
90  if (tm.ExtractToFile(argv[i])) {
91  printf("Wrote %s\n", argv[i]);
92  } else {
93  printf("Not extracting %s, since this component"
94  " is not present\n", argv[i]);
95  }
96  }
97  } else { // extract all the components
98  for (i = 0; i < tesseract::TESSDATA_NUM_ENTRIES; ++i) {
99  STRING filename = argv[3];
100  char* last = &argv[3][strlen(argv[3])-1];
101  if (*last != '.')
102  filename += '.';
103  filename += tesseract::kTessdataFileSuffixes[i];
104  if (tm.ExtractToFile(filename.string())) {
105  printf("Wrote %s\n", filename.string());
106  }
107  }
108  }
109  } else if (argc >= 4 && strcmp(argv[1], "-o") == 0) {
110  // Rename the current traineddata file to a temporary name.
111  const char *new_traineddata_filename = argv[2];
112  STRING traineddata_filename = new_traineddata_filename;
113  traineddata_filename += ".__tmp__";
114  if (rename(new_traineddata_filename, traineddata_filename.string()) != 0) {
115  tprintf("Failed to create a temporary file %s\n",
116  traineddata_filename.string());
117  exit(1);
118  }
119 
120  // Initialize TessdataManager with the data in the given traineddata file.
121  tm.Init(traineddata_filename.string());
122 
123  // Write the updated traineddata file.
124  tm.OverwriteComponents(new_traineddata_filename, argv+3, argc-3);
125  } else {
126  printf("Usage for combining tessdata components:\n"
127  " %s language_data_path_prefix\n"
128  " (e.g. %s tessdata/eng.)\n\n", argv[0], argv[0]);
129  printf("Usage for extracting tessdata components:\n"
130  " %s -e traineddata_file [output_component_file...]\n"
131  " (e.g. %s -e eng.traineddata eng.unicharset)\n\n",
132  argv[0], argv[0]);
133  printf("Usage for overwriting tessdata components:\n"
134  " %s -o traineddata_file [input_component_file...]\n"
135  " (e.g. %s -o eng.traineddata eng.unicharset)\n\n",
136  argv[0], argv[0]);
137  printf("Usage for unpacking all tessdata components:\n"
138  " %s -u traineddata_file output_path_prefix\n"
139  " (e.g. %s -u eng.traineddata tmp/eng.)\n", argv[0], argv[0]);
140  return 1;
141  }
142  tm.Directory();
143 }
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
bool ExtractToFile(const char *filename)
Definition: strngs.h:45
const char * filename
Definition: ioapi.h:38
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
LIST last(LIST var_list)
Definition: oldlist.cpp:271
bool Init(const char *data_file_name)