tesseract  4.00.00dev
classifier_tester.cpp
Go to the documentation of this file.
1 // Copyright 2011 Google Inc. All Rights Reserved.
2 // Author: rays@google.com (Ray Smith)
3 
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 // Filename: classifier_tester.cpp
15 // Purpose: Tests a character classifier on data as formatted for training,
16 // but doesn't have to be the same as the training data.
17 // Author: Ray Smith
18 
19 #include <stdio.h>
20 #ifndef USE_STD_NAMESPACE
21 #include "base/commandlineflags.h"
22 #endif // USE_STD_NAMESPACE
23 #include "baseapi.h"
24 #include "commontraining.h"
25 #include "mastertrainer.h"
26 #include "params.h"
27 #include "strngs.h"
28 #include "tessclassifier.h"
29 #include "tesseractclass.h"
30 
// Command-line flags for this tool. Each STRING_PARAM_FLAG takes
// (name, default value, help text); the values are read below as FLAGS_<name>.
31 STRING_PARAM_FLAG(classifier, "", "Classifier to test");
32 STRING_PARAM_FLAG(lang, "eng", "Language to test");
33 STRING_PARAM_FLAG(tessdata_dir, "", "Directory of traineddata files");
// debug_level is only declared here (read in main as FLAGS_debug_level);
// presumably it is defined in another translation unit -- TODO confirm.
34 DECLARE_INT_PARAM_FLAG(debug_level);
35 
40 };
41 
// Command-line spellings of the supported classifiers.
// InitializeClassifier compares its argument against names[c] for every
// c < CN_COUNT and casts the matching index to ClassifierName, so the order of
// the entries here has to match the order of the enum values.
// NOTE(review): the trailing nullptr is never passed to strcmp only if
// CN_COUNT equals the number of real entries (2) -- confirm against the
// ClassifierName definition, which is not fully visible in this listing.
42 const char* names[] = {"pruner", "full", nullptr};
43 
// Creates the ShapeClassifier selected by classifer_name.
//
// classifer_name is matched against the entries of names[]; on a match a new
// TessBaseAPI is stored through *api, initialized from FLAGS_tessdata_dir and
// FLAGS_lang, and a TessClassifier wrapping its Classify object is returned.
//
// Returns nullptr, after printing a message to stderr, when:
//   - classifer_name matches no entry of names[],
//   - TessBaseAPI::Init reports failure (< 0), or
//   - the initialized engine has no shape table.
// In the last two cases *api has already been assigned and is not deleted
// here, so the object is left for the caller to release.
//
// unicharset, argc and argv are not referenced anywhere in the visible body.
//
// NOTE(review): this listing appears to have lost lines (the embedded line
// numbers jump 46 -> 48 and 62 -> 65). The end of the parameter list and the
// declarations of `api`, `engine_mode` and `tesseract`, all used below, are
// not visible here -- restore them from the original file before building.
44 static tesseract::ShapeClassifier* InitializeClassifier(
45  const char* classifer_name, const UNICHARSET& unicharset,
46  int argc, char **argv,
48  // Map the name onto a ClassifierName; CN_COUNT doubles as "not found".
49  ClassifierName classifier = CN_COUNT;
50  for (int c = 0; c < CN_COUNT; ++c) {
51  if (strcmp(classifer_name, names[c]) == 0) {
52  classifier = static_cast<ClassifierName>(c);
53  break;
54  }
55  }
56  if (classifier == CN_COUNT) {
// The message prints the global flag rather than the classifer_name
// parameter; the only visible caller (main) passes FLAGS_classifier.c_str(),
// so both hold the same text there.
57  fprintf(stderr, "Invalid classifier name:%s\n", FLAGS_classifier.c_str());
58  return nullptr;
59  }
60 
61  // We need to initialize tesseract to test.
// *api is assigned before Init is attempted, so it is set on every path
// from here on, including the failure returns below.
62  *api = new tesseract::TessBaseAPI;
65  tesseract::Classify* classify = nullptr;
66  if (
67  classifier == CN_PRUNER || classifier == CN_FULL) {
68  if ((*api)->Init(FLAGS_tessdata_dir.c_str(), FLAGS_lang.c_str(),
69  engine_mode) < 0) {
70  fprintf(stderr, "Tesseract initialization failed!\n");
71  return nullptr;
72  }
// The const_cast removes constness from the pointer returned by
// (*api)->tesseract(); the result is then viewed as a Classify.
73  tesseract = const_cast<tesseract::Tesseract*>((*api)->tesseract());
74  classify = static_cast<tesseract::Classify*>(tesseract);
75  if (classify->shape_table() == nullptr) {
76  fprintf(stderr, "Tesseract must contain a ShapeTable!\n");
77  return nullptr;
78  }
79  }
80  tesseract::ShapeClassifier* shape_classifier = nullptr;
81 
// The first TessClassifier argument is true for "pruner" and false for
// "full"; presumably it selects pruner-only operation -- TODO confirm against
// the TessClassifier constructor.
82  if (classifier == CN_PRUNER) {
83  shape_classifier = new tesseract::TessClassifier(true, classify);
84  } else if (classifier == CN_FULL) {
85  shape_classifier = new tesseract::TessClassifier(false, classify);
86  } else {
// Fallback for any ClassifierName value other than the two handled above.
87  fprintf(stderr, "%s tester not yet implemented\n", classifer_name);
88  return nullptr;
89  }
90  tprintf("Testing classifier %s:\n", classifer_name);
91  return shape_classifier;
92 }
93 
94 // This program has complex setup requirements, so here is some help:
95 // Two different modes, tr files and serialized mastertrainer.
96 // From tr files:
97 // classifier_tester -U unicharset -F font_properties -X xheights
98 // -classifier x -lang lang [-output_trainer trainer] *.tr
99 // From a serialized trainer:
100 // classifier_tester -input_trainer trainer [-lang lang] -classifier x
101 //
102 // In the first case, the unicharset must be the unicharset from within
103 // the classifier under test, and the font_properties and xheights files must
104 // match the files used during training.
105 // In the second case, the trainer file must have been prepared from
106 // some previous run of shapeclustering, mftraining, or classifier_tester
107 // using the same conditions as above, ie matching unicharset/font_properties.
108 //
109 // Available values of classifier (x above) are:
110 // pruner : Tesseract class pruner only.
111 // full : Tesseract full classifier.
112 //
// Program entry point: parses the command line, loads the training data,
// builds the classifier named by FLAGS_classifier and runs it over the
// samples. Returns 1 if the classifier could not be created, otherwise 0.
//
// NOTE(review): this listing appears to have lost lines (the embedded line
// numbers jump 117 -> 119 and 129 -> 131 -> 133). The declaration of `api`
// and the start of the call whose arguments end at "shape_classifier,
// nullptr);" are not visible here -- restore them from the original file.
113 int main(int argc, char **argv) {
114  ParseArguments(&argc, &argv);
115  STRING file_prefix;
// Going by the LoadTrainingData signature, `false` is the replication
// argument and `nullptr` is the shape_table output.
116  tesseract::MasterTrainer* trainer =
117  tesseract::LoadTrainingData(argc, argv, false, nullptr, &file_prefix);
119  // Create the classifier selected by the -classifier flag.
// NOTE(review): trainer is dereferenced here without a null check -- confirm
// that LoadTrainingData cannot return nullptr.
120  tesseract::ShapeClassifier* shape_classifier = InitializeClassifier(
121  FLAGS_classifier.c_str(), trainer->unicharset(), argc, argv, &api);
122  if (shape_classifier == nullptr) {
123  fprintf(stderr, "Classifier init failed!:%s\n", FLAGS_classifier.c_str());
// This early exit skips the `delete` statements at the end of main, so
// trainer (and any api object created by InitializeClassifier) is not freed
// on this path.
124  return 1;
125  }
126 
127  // We want to test junk as well if it is available.
128  // trainer->IncludeJunk();
129  // We want to test with replicated samples too.
131 
// Tail of a call whose head is missing from this listing; presumably
// trainer->TestClassifierOnSamples(...), in which case MAX(3, FLAGS_debug_level)
// is the report level (never below 3) -- TODO confirm.
133  MAX(3, FLAGS_debug_level), false,
134  shape_classifier, nullptr);
// Release the objects created above before exiting.
135  delete shape_classifier;
136  delete api;
137  delete trainer;
138 
139  return 0;
140 } /* main */
141 
142 
143 
144 
145 
146 
const char * names[]
struct TessBaseAPI TessBaseAPI
Definition: capi.h:83
#define tprintf(...)
Definition: tprintf.h:31
const UNICHARSET & unicharset() const
void ParseArguments(int *argc, char ***argv)
STRING_PARAM_FLAG(classifier, "", "Classifier to test")
void ReplicateAndRandomizeSamplesIfRequired()
int main(int argc, char **argv)
const ShapeTable * shape_table() const
Definition: classify.h:69
Definition: strngs.h:45
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
ClassifierName
#define MAX(x, y)
Definition: ndminx.h:24
DECLARE_INT_PARAM_FLAG(debug_level)
void TestClassifierOnSamples(CountTypes error_mode, int report_level, bool replicate_samples, ShapeClassifier *test_classifier, STRING *report_string)