tesseract  4.00.00dev
cntraining.cpp
Go to the documentation of this file.
1 /******************************************************************************
2 ** Filename: cntraining.cpp
3 ** Purpose: Generates a normproto and pffmtable.
4 ** Author: Dan Johnson
5 ** Revision: Christy Russon
6 ** History: Fri Aug 18 08:53:50 1989, DSJ, Created.
7 ** 5/25/90, DSJ, Adapted to multiple feature types.
8 ** Tuesday, May 17, 1998 Changes made to make feature specific and
9 ** simplify structures. First step in simplifying training process.
10 **
11  ** (c) Copyright Hewlett-Packard Company, 1988.
12  ** Licensed under the Apache License, Version 2.0 (the "License");
13  ** you may not use this file except in compliance with the License.
14  ** You may obtain a copy of the License at
15  ** http://www.apache.org/licenses/LICENSE-2.0
16  ** Unless required by applicable law or agreed to in writing, software
17  ** distributed under the License is distributed on an "AS IS" BASIS,
18  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19  ** See the License for the specific language governing permissions and
20  ** limitations under the License.
21 ******************************************************************************/
22 
23 /*----------------------------------------------------------------------------
24  Include Files and Type Defines
25 ----------------------------------------------------------------------------*/
26 #include "oldlist.h"
27 #include "efio.h"
28 #include "emalloc.h"
29 #include "featdefs.h"
30 #include "tessopt.h"
31 #include "ocrfeatures.h"
32 #include "clusttool.h"
33 #include "cluster.h"
34 #include <string.h>
35 #include <stdio.h>
36 #include <math.h>
37 #include "unichar.h"
38 #include "commontraining.h"
39 
40 #define PROGRAM_FEATURE_TYPE "cn"
41 
43 
44 /*----------------------------------------------------------------------------
45  Public Function Prototypes
46 ----------------------------------------------------------------------------*/
// Program entry point; defined in the "Public Code" section of this file.
int main(int argc, char **argv);
50 
51 /*----------------------------------------------------------------------------
52  Private Function Prototypes
53 ----------------------------------------------------------------------------*/
54 
55 void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
56  const FEATURE_DESC_STRUCT *feature_desc);
57 
58 /*
59 PARAMDESC *ConvertToPARAMDESC(
60  PARAM_DESC* Param_Desc,
61  int N);
62 */
63 
64 void WriteProtos(
65  FILE *File,
66  uinT16 N,
67  LIST ProtoList,
68  BOOL8 WriteSigProtos,
69  BOOL8 WriteInsigProtos);
70 
71 /*----------------------------------------------------------------------------
72  Global Data Definitions and Declarations
73 ----------------------------------------------------------------------------*/
74 /* global variable to hold configuration parameters to control clustering */
75 //-M 0.025 -B 0.05 -I 0.8 -C 1e-3
77 {
78  elliptical, 0.025, 0.05, 0.8, 1e-3, 0
79 };
80 
81 /*----------------------------------------------------------------------------
82  Public Code
83 ----------------------------------------------------------------------------*/
84 /*---------------------------------------------------------------------------*/
133 int main(int argc, char *argv[]) {
134  // Set the global Config parameters before parsing the command line.
135  Config = CNConfig;
136 
137  const char *PageName;
138  FILE *TrainingPage;
139  LIST CharList = NIL_LIST;
140  CLUSTERER *Clusterer = nullptr;
141  LIST ProtoList = NIL_LIST;
142  LIST NormProtoList = NIL_LIST;
143  LIST pCharList;
144  LABELEDLIST CharSample;
145  FEATURE_DEFS_STRUCT FeatureDefs;
146  InitFeatureDefs(&FeatureDefs);
147 
148  ParseArguments(&argc, &argv);
149  int num_fonts = 0;
150  while ((PageName = GetNextFilename(argc, argv)) != nullptr) {
151  printf("Reading %s ...\n", PageName);
152  TrainingPage = Efopen(PageName, "rb");
153  ReadTrainingSamples(FeatureDefs, PROGRAM_FEATURE_TYPE, 100, nullptr,
154  TrainingPage, &CharList);
155  fclose(TrainingPage);
156  ++num_fonts;
157  }
158  printf("Clustering ...\n");
159  // To allow an individual font to form a separate cluster,
160  // reduce the min samples:
161  // Config.MinSamples = 0.5 / num_fonts;
162  pCharList = CharList;
163  // The norm protos will count the source protos, so we keep them here in
164  // freeable_protos, so they can be freed later.
165  GenericVector<LIST> freeable_protos;
166  iterate(pCharList) {
167  //Cluster
168  CharSample = (LABELEDLIST)first_node(pCharList);
169  Clusterer =
170  SetUpForClustering(FeatureDefs, CharSample, PROGRAM_FEATURE_TYPE);
171  if (Clusterer == nullptr) { // To avoid a SIGSEGV
172  fprintf(stderr, "Error: NULL clusterer!\n");
173  return 1;
174  }
175  float SavedMinSamples = Config.MinSamples;
176  // To disable the tendency to produce a single cluster for all fonts,
177  // make MagicSamples an impossible to achieve number:
178  // Config.MagicSamples = CharSample->SampleCount * 10;
179  Config.MagicSamples = CharSample->SampleCount;
180  while (Config.MinSamples > 0.001) {
181  ProtoList = ClusterSamples(Clusterer, &Config);
182  if (NumberOfProtos(ProtoList, 1, 0) > 0) {
183  break;
184  } else {
185  Config.MinSamples *= 0.95;
186  printf("0 significant protos for %s."
187  " Retrying clustering with MinSamples = %f%%\n",
188  CharSample->Label, Config.MinSamples);
189  }
190  }
191  Config.MinSamples = SavedMinSamples;
192  AddToNormProtosList(&NormProtoList, ProtoList, CharSample->Label);
193  freeable_protos.push_back(ProtoList);
194  FreeClusterer(Clusterer);
195  }
196  FreeTrainingSamples(CharList);
197  int desc_index = ShortNameToFeatureType(FeatureDefs, PROGRAM_FEATURE_TYPE);
198  WriteNormProtos(FLAGS_D.c_str(), NormProtoList,
199  FeatureDefs.FeatureDesc[desc_index]);
200  FreeNormProtoList(NormProtoList);
201  for (int i = 0; i < freeable_protos.size(); ++i) {
202  FreeProtoList(&freeable_protos[i]);
203  }
204  printf ("\n");
205  return 0;
206 } // main
207 
208 /*----------------------------------------------------------------------------
209  Private Code
210 ----------------------------------------------------------------------------*/
211 
212 /*----------------------------------------------------------------------------*/
224 void WriteNormProtos(const char *Directory, LIST LabeledProtoList,
225  const FEATURE_DESC_STRUCT *feature_desc) {
226  FILE *File;
227  STRING Filename;
228  LABELEDLIST LabeledProto;
229  int N;
230 
231  Filename = "";
232  if (Directory != nullptr && Directory[0] != '\0') {
233  Filename += Directory;
234  Filename += "/";
235  }
236  Filename += "normproto";
237  printf ("\nWriting %s ...", Filename.string());
238  File = Efopen (Filename.string(), "wb");
239  fprintf(File, "%0d\n", feature_desc->NumParams);
240  WriteParamDesc(File, feature_desc->NumParams, feature_desc->ParamDesc);
241  iterate(LabeledProtoList)
242  {
243  LabeledProto = (LABELEDLIST) first_node (LabeledProtoList);
244  N = NumberOfProtos(LabeledProto->List, true, false);
245  if (N < 1) {
246  printf ("\nError! Not enough protos for %s: %d protos"
247  " (%d significant protos"
248  ", %d insignificant protos)\n",
249  LabeledProto->Label, N,
250  NumberOfProtos(LabeledProto->List, 1, 0),
251  NumberOfProtos(LabeledProto->List, 0, 1));
252  exit(1);
253  }
254  fprintf(File, "\n%s %d\n", LabeledProto->Label, N);
255  WriteProtos(File, feature_desc->NumParams, LabeledProto->List, true, false);
256  }
257  fclose (File);
258 
259 } // WriteNormProtos
260 
261 /*-------------------------------------------------------------------------*/
263  FILE *File,
264  uinT16 N,
265  LIST ProtoList,
266  BOOL8 WriteSigProtos,
267  BOOL8 WriteInsigProtos)
268 {
269  PROTOTYPE *Proto;
270 
271  // write prototypes
272  iterate(ProtoList)
273  {
274  Proto = (PROTOTYPE *) first_node ( ProtoList );
275  if (( Proto->Significant && WriteSigProtos ) ||
276  ( ! Proto->Significant && WriteInsigProtos ) )
277  WritePrototype( File, N, Proto );
278  }
279 } // WriteProtos
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:121
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
void WriteParamDesc(FILE *File, uinT16 N, const PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:259
#define PROGRAM_FEATURE_TYPE
Definition: cntraining.cpp:40
void WritePrototype(FILE *File, uinT16 N, PROTOTYPE *Proto)
Definition: clusttool.cpp:288
void FreeNormProtoList(LIST CharList)
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:59
int push_back(T object)
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:512
FLOAT32 MinSamples
Definition: cluster.h:50
const char * string() const
Definition: strngs.cpp:198
struct LABELEDLISTNODE * LABELEDLIST
void ParseArguments(int *argc, char ***argv)
#define NIL_LIST
Definition: oldlist.h:126
CLUSTERCONFIG Config
int size() const
Definition: genericvector.h:72
int MagicSamples
Definition: cluster.h:55
int main(int argc, char **argv)
void WriteNormProtos(const char *Directory, LIST LabeledProtoList, const FEATURE_DESC_STRUCT *feature_desc)
Definition: cntraining.cpp:224
void WriteProtos(FILE *File, uinT16 N, LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
Definition: cntraining.cpp:262
unsigned char BOOL8
Definition: host.h:44
Definition: strngs.h:45
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:573
const char * GetNextFilename(int argc, const char *const *argv)
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:50
#define first_node(l)
Definition: oldlist.h:139
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:297
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:546
FILE * Efopen(const char *Name, const char *Mode)
Definition: efio.cpp:43
DECLARE_STRING_PARAM_FLAG(D)
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
unsigned Significant
Definition: cluster.h:68
#define iterate(l)
Definition: oldlist.h:159
uint16_t uinT16
Definition: host.h:37
CLUSTERCONFIG CNConfig
Definition: cntraining.cpp:76
void FreeTrainingSamples(LIST CharList)