tesseract  4.00.00dev
commontraining.cpp
Go to the documentation of this file.
1 // Copyright 2008 Google Inc. All Rights Reserved.
2 // Author: scharron@google.com (Samuel Charron)
3 //
4 // Licensed under the Apache License, Version 2.0 (the "License");
5 // you may not use this file except in compliance with the License.
6 // You may obtain a copy of the License at
7 // http://www.apache.org/licenses/LICENSE-2.0
8 // Unless required by applicable law or agreed to in writing, software
9 // distributed under the License is distributed on an "AS IS" BASIS,
10 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
11 // See the License for the specific language governing permissions and
12 // limitations under the License.
13 
14 #include "commontraining.h"
15 
16 #include "allheaders.h"
17 #include "ccutil.h"
18 #include "classify.h"
19 #include "cluster.h"
20 #include "clusttool.h"
21 #include "efio.h"
22 #include "emalloc.h"
23 #include "featdefs.h"
24 #include "fontinfo.h"
25 #include "globals.h"
26 #include "intfeaturespace.h"
27 #include "mastertrainer.h"
28 #include "mf.h"
29 #include "ndminx.h"
30 #include "oldlist.h"
31 #include "params.h"
32 #include "shapetable.h"
33 #include "tessdatamanager.h"
34 #include "tessopt.h"
35 #include "tprintf.h"
36 #include "unicity_table.h"
37 
38 #include <assert.h>
39 #include <math.h>
40 
41 using tesseract::CCUtil;
45 
46 // Global Variables.
47 
48 // global variable to hold configuration parameters to control clustering
49 // -M 0.625 -B 0.05 -I 1.0 -C 1e-6.
50 CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 };
53 
54 INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging");
55 INT_PARAM_FLAG(load_images, 0, "Load images with tr files");
56 STRING_PARAM_FLAG(configfile, "", "File to load more configs from");
57 STRING_PARAM_FLAG(D, "", "Directory to write output files to");
58 STRING_PARAM_FLAG(F, "font_properties", "File listing font properties");
59 STRING_PARAM_FLAG(X, "", "File listing font xheights");
60 STRING_PARAM_FLAG(U, "unicharset", "File to load unicharset from");
61 STRING_PARAM_FLAG(O, "", "File to write unicharset to");
62 STRING_PARAM_FLAG(output_trainer, "", "File to write trainer to");
63 STRING_PARAM_FLAG(test_ch, "", "UTF8 test character string");
64 DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples,
65  "Min number of samples per proto as % of total");
66 DOUBLE_PARAM_FLAG(clusterconfig_max_illegal, Config.MaxIllegal,
67  "Max percentage of samples in a cluster which have more"
68  " than 1 feature in that cluster");
69 DOUBLE_PARAM_FLAG(clusterconfig_independence, Config.Independence,
70  "Desired independence between dimensions");
71 DOUBLE_PARAM_FLAG(clusterconfig_confidence, Config.Confidence,
72  "Desired confidence in prototypes created");
73 
86 void ParseArguments(int* argc, char ***argv) {
87  STRING usage;
88  if (*argc) {
89  usage += (*argv)[0];
90  }
91  usage += " [.tr files ...]";
92  tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
93  // Record the index of the first non-flag argument to 1, since we set
94  // remove_flags to true when parsing the flags.
95  tessoptind = 1;
96  // Set some global values based on the flags.
97  Config.MinSamples =
98  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_min_samples_fraction)));
99  Config.MaxIllegal =
100  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_max_illegal)));
101  Config.Independence =
102  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_independence)));
103  Config.Confidence =
104  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_confidence)));
105  // Set additional parameters from config file if specified.
106  if (!FLAGS_configfile.empty()) {
108  FLAGS_configfile.c_str(),
110  ccutil.params());
111  }
112 }
113 
114 namespace tesseract {
115 // Helper loads shape table from the given file.
116 ShapeTable* LoadShapeTable(const STRING& file_prefix) {
117  ShapeTable* shape_table = nullptr;
118  STRING shape_table_file = file_prefix;
119  shape_table_file += kShapeTableFileSuffix;
120  TFile shape_fp;
121  if (shape_fp.Open(shape_table_file.string(), nullptr)) {
122  shape_table = new ShapeTable;
123  if (!shape_table->DeSerialize(&shape_fp)) {
124  delete shape_table;
125  shape_table = nullptr;
126  tprintf("Error: Failed to read shape table %s\n",
127  shape_table_file.string());
128  } else {
129  int num_shapes = shape_table->NumShapes();
130  tprintf("Read shape table %s of %d shapes\n",
131  shape_table_file.string(), num_shapes);
132  }
133  } else {
134  tprintf("Warning: No shape table file present: %s\n",
135  shape_table_file.string());
136  }
137  return shape_table;
138 }
139 
140 // Helper to write the shape_table.
141 void WriteShapeTable(const STRING& file_prefix, const ShapeTable& shape_table) {
142  STRING shape_table_file = file_prefix;
143  shape_table_file += kShapeTableFileSuffix;
144  FILE* fp = fopen(shape_table_file.string(), "wb");
145  if (fp != nullptr) {
146  if (!shape_table.Serialize(fp)) {
147  fprintf(stderr, "Error writing shape table: %s\n",
148  shape_table_file.string());
149  }
150  fclose(fp);
151  } else {
152  fprintf(stderr, "Error creating shape table: %s\n",
153  shape_table_file.string());
154  }
155 }
156 
172 MasterTrainer* LoadTrainingData(int argc, const char* const * argv,
173  bool replication,
174  ShapeTable** shape_table,
175  STRING* file_prefix) {
176  InitFeatureDefs(&feature_defs);
177  InitIntegerFX();
178  *file_prefix = "";
179  if (!FLAGS_D.empty()) {
180  *file_prefix += FLAGS_D.c_str();
181  *file_prefix += "/";
182  }
183  // If we are shape clustering (nullptr shape_table) or we successfully load
184  // a shape_table written by a previous shape clustering, then
185  // shape_analysis will be true, meaning that the MasterTrainer will replace
186  // some members of the unicharset with their fragments.
187  bool shape_analysis = false;
188  if (shape_table != nullptr) {
189  *shape_table = LoadShapeTable(*file_prefix);
190  if (*shape_table != nullptr) shape_analysis = true;
191  } else {
192  shape_analysis = true;
193  }
195  shape_analysis,
196  replication,
197  FLAGS_debug_level);
198  IntFeatureSpace fs;
200  trainer->LoadUnicharset(FLAGS_U.c_str());
201  // Get basic font information from font_properties.
202  if (!FLAGS_F.empty()) {
203  if (!trainer->LoadFontInfo(FLAGS_F.c_str())) {
204  delete trainer;
205  return nullptr;
206  }
207  }
208  if (!FLAGS_X.empty()) {
209  if (!trainer->LoadXHeights(FLAGS_X.c_str())) {
210  delete trainer;
211  return nullptr;
212  }
213  }
214  trainer->SetFeatureSpace(fs);
215  const char* page_name;
216  // Load training data from .tr files on the command line.
217  while ((page_name = GetNextFilename(argc, argv)) != nullptr) {
218  tprintf("Reading %s ...\n", page_name);
219  trainer->ReadTrainingSamples(page_name, feature_defs, false);
220 
221  // If there is a file with [lang].[fontname].exp[num].fontinfo present,
222  // read font spacing information in to fontinfo_table.
223  int pagename_len = strlen(page_name);
224  char* fontinfo_file_name = new char[pagename_len + 7];
225  strncpy(fontinfo_file_name, page_name, pagename_len - 2); // remove "tr"
226  strcpy(fontinfo_file_name + pagename_len - 2, "fontinfo"); // +"fontinfo"
227  trainer->AddSpacingInfo(fontinfo_file_name);
228  delete[] fontinfo_file_name;
229 
230  // Load the images into memory if required by the classifier.
231  if (FLAGS_load_images) {
232  STRING image_name = page_name;
233  // Chop off the tr and replace with tif. Extension must be tif!
234  image_name.truncate_at(image_name.length() - 2);
235  image_name += "tif";
236  trainer->LoadPageImages(image_name.string());
237  }
238  }
239  trainer->PostLoadCleanup();
240  // Write the master trainer if required.
241  if (!FLAGS_output_trainer.empty()) {
242  FILE* fp = fopen(FLAGS_output_trainer.c_str(), "wb");
243  if (fp == nullptr) {
244  tprintf("Can't create saved trainer data!\n");
245  } else {
246  trainer->Serialize(fp);
247  fclose(fp);
248  }
249  }
250  trainer->PreTrainingSetup();
251  if (!FLAGS_O.empty() &&
252  !trainer->unicharset().save_to_file(FLAGS_O.c_str())) {
253  fprintf(stderr, "Failed to save unicharset to file %s\n", FLAGS_O.c_str());
254  delete trainer;
255  return nullptr;
256  }
257  if (shape_table != nullptr) {
258  // If we previously failed to load a shapetable, then shape clustering
259  // wasn't run so make a flat one now.
260  if (*shape_table == nullptr) {
261  *shape_table = new ShapeTable;
262  trainer->SetupFlatShapeTable(*shape_table);
263  tprintf("Flat shape table summary: %s\n",
264  (*shape_table)->SummaryStr().string());
265  }
266  (*shape_table)->set_unicharset(trainer->unicharset());
267  }
268  return trainer;
269 }
270 
271 } // namespace tesseract.
272 
273 /*---------------------------------------------------------------------------*/
286 const char *GetNextFilename(int argc, const char* const * argv) {
287  if (tessoptind < argc)
288  return argv[tessoptind++];
289  else
290  return nullptr;
291 } /* GetNextFilename */
292 
293 /*---------------------------------------------------------------------------*/
305 LABELEDLIST FindList(LIST List, char* Label) {
306  LABELEDLIST LabeledList;
307 
308  iterate (List)
309  {
310  LabeledList = (LABELEDLIST) first_node (List);
311  if (strcmp (LabeledList->Label, Label) == 0)
312  return (LabeledList);
313  }
314  return (nullptr);
315 
316 } /* FindList */
317 
318 /*---------------------------------------------------------------------------*/
328 LABELEDLIST NewLabeledList(const char* Label) {
329  LABELEDLIST LabeledList;
330 
331  LabeledList = (LABELEDLIST) Emalloc (sizeof (LABELEDLISTNODE));
332  LabeledList->Label = (char*)Emalloc (strlen (Label)+1);
333  strcpy (LabeledList->Label, Label);
334  LabeledList->List = NIL_LIST;
335  LabeledList->SampleCount = 0;
336  LabeledList->font_sample_count = 0;
337  return (LabeledList);
338 
339 } /* NewLabeledList */
340 
341 /*---------------------------------------------------------------------------*/
342 // TODO(rays) This is now used only by cntraining. Convert cntraining to use
343 // the new method or get rid of it entirely.
363 void ReadTrainingSamples(const FEATURE_DEFS_STRUCT& feature_defs,
364  const char *feature_name, int max_samples,
365  UNICHARSET* unicharset,
366  FILE* file, LIST* training_samples) {
367  char buffer[2048];
368  char unichar[UNICHAR_LEN + 1];
369  LABELEDLIST char_sample;
370  FEATURE_SET feature_samples;
371  CHAR_DESC char_desc;
372  int ShortNameToFeatureType_res = ShortNameToFeatureType(feature_defs, feature_name);
373  assert(0 <= ShortNameToFeatureType_res);
374  unsigned int feature_type = static_cast<unsigned int>(ShortNameToFeatureType_res);
375  // Zero out the font_sample_count for all the classes.
376  LIST it = *training_samples;
377  iterate(it) {
378  char_sample = reinterpret_cast<LABELEDLIST>(first_node(it));
379  char_sample->font_sample_count = 0;
380  }
381 
382  while (fgets(buffer, 2048, file) != nullptr) {
383  if (buffer[0] == '\n')
384  continue;
385 
386  sscanf(buffer, "%*s %s", unichar);
387  if (unicharset != nullptr && !unicharset->contains_unichar(unichar)) {
388  unicharset->unichar_insert(unichar);
389  if (unicharset->size() > MAX_NUM_CLASSES) {
390  tprintf("Error: Size of unicharset in training is "
391  "greater than MAX_NUM_CLASSES\n");
392  exit(1);
393  }
394  }
395  char_sample = FindList(*training_samples, unichar);
396  if (char_sample == nullptr) {
397  char_sample = NewLabeledList(unichar);
398  *training_samples = push(*training_samples, char_sample);
399  }
400  char_desc = ReadCharDescription(feature_defs, file);
401  feature_samples = char_desc->FeatureSets[feature_type];
402  if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
403  char_sample->List = push(char_sample->List, feature_samples);
404  char_sample->SampleCount++;
405  char_sample->font_sample_count++;
406  } else {
407  FreeFeatureSet(feature_samples);
408  }
409  for (size_t i = 0; i < char_desc->NumFeatureSets; i++) {
410  if (feature_type != i)
411  FreeFeatureSet(char_desc->FeatureSets[i]);
412  }
413  free(char_desc);
414  }
415 } // ReadTrainingSamples
416 
417 
418 /*---------------------------------------------------------------------------*/
428 void FreeTrainingSamples(LIST CharList) {
429  LABELEDLIST char_sample;
430  FEATURE_SET FeatureSet;
431  LIST FeatureList;
432 
433  LIST nodes = CharList;
434  iterate(CharList) { /* iterate through all of the fonts */
435  char_sample = (LABELEDLIST) first_node(CharList);
436  FeatureList = char_sample->List;
437  iterate(FeatureList) { /* iterate through all of the classes */
438  FeatureSet = (FEATURE_SET) first_node(FeatureList);
439  FreeFeatureSet(FeatureSet);
440  }
441  FreeLabeledList(char_sample);
442  }
443  destroy(nodes);
444 } /* FreeTrainingSamples */
445 
446 /*---------------------------------------------------------------------------*/
457 void FreeLabeledList(LABELEDLIST LabeledList) {
458  destroy(LabeledList->List);
459  free(LabeledList->Label);
460  free(LabeledList);
461 } /* FreeLabeledList */
462 
463 /*---------------------------------------------------------------------------*/
478  LABELEDLIST char_sample,
479  const char* program_feature_type) {
480  uinT16 N;
481  int i, j;
482  FLOAT32* Sample = nullptr;
483  CLUSTERER *Clusterer;
484  inT32 CharID;
485  LIST FeatureList = nullptr;
486  FEATURE_SET FeatureSet = nullptr;
487 
488  int desc_index = ShortNameToFeatureType(FeatureDefs, program_feature_type);
489  N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
490  Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
491 
492  FeatureList = char_sample->List;
493  CharID = 0;
494  iterate(FeatureList) {
495  FeatureSet = (FEATURE_SET) first_node(FeatureList);
496  for (i = 0; i < FeatureSet->MaxNumFeatures; i++) {
497  if (Sample == nullptr) Sample = (FLOAT32*)Emalloc(N * sizeof(FLOAT32));
498  for (j = 0; j < N; j++)
499  Sample[j] = FeatureSet->Features[i]->Params[j];
500  MakeSample (Clusterer, Sample, CharID);
501  }
502  CharID++;
503  }
504  free(Sample);
505  return Clusterer;
506 
507 } /* SetUpForClustering */
508 
509 /*------------------------------------------------------------------------*/
510 void MergeInsignificantProtos(LIST ProtoList, const char* label,
511  CLUSTERER* Clusterer, CLUSTERCONFIG* Config) {
512  PROTOTYPE* Prototype;
513  bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
514 
515  LIST pProtoList = ProtoList;
516  iterate(pProtoList) {
517  Prototype = (PROTOTYPE *) first_node (pProtoList);
518  if (Prototype->Significant || Prototype->Merged)
519  continue;
520  FLOAT32 best_dist = 0.125;
521  PROTOTYPE* best_match = nullptr;
522  // Find the nearest alive prototype.
523  LIST list_it = ProtoList;
524  iterate(list_it) {
525  PROTOTYPE* test_p = (PROTOTYPE *) first_node (list_it);
526  if (test_p != Prototype && !test_p->Merged) {
527  FLOAT32 dist = ComputeDistance(Clusterer->SampleSize,
528  Clusterer->ParamDesc,
529  Prototype->Mean, test_p->Mean);
530  if (dist < best_dist) {
531  best_match = test_p;
532  best_dist = dist;
533  }
534  }
535  }
536  if (best_match != nullptr && !best_match->Significant) {
537  if (debug)
538  tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
539  best_match->NumSamples, Prototype->NumSamples,
540  best_match->Mean[0], best_match->Mean[1],
541  Prototype->Mean[0], Prototype->Mean[1]);
542  best_match->NumSamples = MergeClusters(Clusterer->SampleSize,
543  Clusterer->ParamDesc,
544  best_match->NumSamples,
545  Prototype->NumSamples,
546  best_match->Mean,
547  best_match->Mean, Prototype->Mean);
548  Prototype->NumSamples = 0;
549  Prototype->Merged = 1;
550  } else if (best_match != nullptr) {
551  if (debug)
552  tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
553  Prototype->Mean[0], Prototype->Mean[1],
554  best_match->Mean[0], best_match->Mean[1]);
555  Prototype->Merged = 1;
556  }
557  }
558  // Mark significant those that now have enough samples.
559  int min_samples = (inT32) (Config->MinSamples * Clusterer->NumChar);
560  pProtoList = ProtoList;
561  iterate(pProtoList) {
562  Prototype = (PROTOTYPE *) first_node (pProtoList);
563  // Process insignificant protos that do not match a green one
564  if (!Prototype->Significant && Prototype->NumSamples >= min_samples &&
565  !Prototype->Merged) {
566  if (debug)
567  tprintf("Red proto at %g,%g becoming green\n",
568  Prototype->Mean[0], Prototype->Mean[1]);
569  Prototype->Significant = true;
570  }
571  }
572 } /* MergeInsignificantProtos */
573 
574 /*-----------------------------------------------------------------------------*/
576  LIST ProtoList)
577 {
578  PROTOTYPE* Prototype;
579 
580  iterate(ProtoList)
581  {
582  Prototype = (PROTOTYPE *) first_node (ProtoList);
583  free(Prototype->Variance.Elliptical);
584  Prototype->Variance.Elliptical = nullptr;
585  free(Prototype->Magnitude.Elliptical);
586  Prototype->Magnitude.Elliptical = nullptr;
587  free(Prototype->Weight.Elliptical);
588  Prototype->Weight.Elliptical = nullptr;
589  }
590 }
591 
592 /*------------------------------------------------------------------------*/
594  LIST ProtoList,
595  BOOL8 KeepSigProtos,
596  BOOL8 KeepInsigProtos,
597  int N)
598 
599 {
600  LIST NewProtoList = NIL_LIST;
601  LIST pProtoList;
602  PROTOTYPE* Proto;
603  PROTOTYPE* NewProto;
604  int i;
605 
606  pProtoList = ProtoList;
607  iterate(pProtoList)
608  {
609  Proto = (PROTOTYPE *) first_node (pProtoList);
610  if ((Proto->Significant && KeepSigProtos) ||
611  (!Proto->Significant && KeepInsigProtos))
612  {
613  NewProto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE));
614 
615  NewProto->Mean = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
616  NewProto->Significant = Proto->Significant;
617  NewProto->Style = Proto->Style;
618  NewProto->NumSamples = Proto->NumSamples;
619  NewProto->Cluster = nullptr;
620  NewProto->Distrib = nullptr;
621 
622  for (i=0; i < N; i++)
623  NewProto->Mean[i] = Proto->Mean[i];
624  if (Proto->Variance.Elliptical != nullptr) {
625  NewProto->Variance.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
626  for (i=0; i < N; i++)
627  NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
628  }
629  else
630  NewProto->Variance.Elliptical = nullptr;
631  //---------------------------------------------
632  if (Proto->Magnitude.Elliptical != nullptr) {
633  NewProto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
634  for (i=0; i < N; i++)
635  NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
636  }
637  else
638  NewProto->Magnitude.Elliptical = nullptr;
639  //------------------------------------------------
640  if (Proto->Weight.Elliptical != nullptr) {
641  NewProto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
642  for (i=0; i < N; i++)
643  NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
644  }
645  else
646  NewProto->Weight.Elliptical = nullptr;
647 
648  NewProto->TotalMagnitude = Proto->TotalMagnitude;
649  NewProto->LogMagnitude = Proto->LogMagnitude;
650  NewProtoList = push_last(NewProtoList, NewProto);
651  }
652  }
653  FreeProtoList(&ProtoList);
654  return (NewProtoList);
655 } /* RemoveInsignificantProtos */
656 
657 /*----------------------------------------------------------------------------*/
658 MERGE_CLASS FindClass(LIST List, const char* Label) {
659  MERGE_CLASS MergeClass;
660 
661  iterate (List)
662  {
663  MergeClass = (MERGE_CLASS) first_node (List);
664  if (strcmp (MergeClass->Label, Label) == 0)
665  return (MergeClass);
666  }
667  return (nullptr);
668 
669 } /* FindClass */
670 
671 /*---------------------------------------------------------------------------*/
672 MERGE_CLASS NewLabeledClass(const char* Label) {
673  MERGE_CLASS MergeClass;
674 
675  MergeClass = new MERGE_CLASS_NODE;
676  MergeClass->Label = (char*)Emalloc (strlen (Label)+1);
677  strcpy (MergeClass->Label, Label);
678  MergeClass->Class = NewClass (MAX_NUM_PROTOS, MAX_NUM_CONFIGS);
679  return (MergeClass);
680 
681 } /* NewLabeledClass */
682 
683 /*-----------------------------------------------------------------------------*/
693 void FreeLabeledClassList(LIST ClassList) {
694  MERGE_CLASS MergeClass;
695 
696  LIST nodes = ClassList;
697  iterate(ClassList) /* iterate through all of the fonts */
698  {
699  MergeClass = (MERGE_CLASS) first_node (ClassList);
700  free (MergeClass->Label);
701  FreeClass(MergeClass->Class);
702  delete MergeClass;
703  }
704  destroy(nodes);
705 
706 } /* FreeLabeledClassList */
707 
708 /* SetUpForFloat2Int */
710  LIST LabeledClassList) {
711  MERGE_CLASS MergeClass;
712  CLASS_TYPE Class;
713  int NumProtos;
714  int NumConfigs;
715  int NumWords;
716  int i, j;
717  float Values[3];
718  PROTO NewProto;
719  PROTO OldProto;
720  BIT_VECTOR NewConfig;
721  BIT_VECTOR OldConfig;
722 
723  // printf("Float2Int ...\n");
724 
725  CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()];
726  iterate(LabeledClassList)
727  {
728  UnicityTableEqEq<int> font_set;
729  MergeClass = (MERGE_CLASS) first_node (LabeledClassList);
730  Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label)];
731  NumProtos = MergeClass->Class->NumProtos;
732  NumConfigs = MergeClass->Class->NumConfigs;
733  font_set.move(&MergeClass->Class->font_set);
734  Class->NumProtos = NumProtos;
735  Class->MaxNumProtos = NumProtos;
736  Class->Prototypes = (PROTO) Emalloc (sizeof(PROTO_STRUCT) * NumProtos);
737  for(i=0; i < NumProtos; i++)
738  {
739  NewProto = ProtoIn(Class, i);
740  OldProto = ProtoIn(MergeClass->Class, i);
741  Values[0] = OldProto->X;
742  Values[1] = OldProto->Y;
743  Values[2] = OldProto->Angle;
744  Normalize(Values);
745  NewProto->X = OldProto->X;
746  NewProto->Y = OldProto->Y;
747  NewProto->Length = OldProto->Length;
748  NewProto->Angle = OldProto->Angle;
749  NewProto->A = Values[0];
750  NewProto->B = Values[1];
751  NewProto->C = Values[2];
752  }
753 
754  Class->NumConfigs = NumConfigs;
755  Class->MaxNumConfigs = NumConfigs;
756  Class->font_set.move(&font_set);
757  Class->Configurations = (BIT_VECTOR*) Emalloc (sizeof(BIT_VECTOR) * NumConfigs);
758  NumWords = WordsInVectorOfSize(NumProtos);
759  for(i=0; i < NumConfigs; i++)
760  {
761  NewConfig = NewBitVector(NumProtos);
762  OldConfig = MergeClass->Class->Configurations[i];
763  for(j=0; j < NumWords; j++)
764  NewConfig[j] = OldConfig[j];
765  Class->Configurations[i] = NewConfig;
766  }
767  }
768  return float_classes;
769 } // SetUpForFloat2Int
770 
771 /*--------------------------------------------------------------------------*/
772 void Normalize (
773  float *Values)
774 {
775  float Slope;
776  float Intercept;
777  float Normalizer;
778 
779  Slope = tan (Values [2] * 2 * PI);
780  Intercept = Values [1] - Slope * Values [0];
781  Normalizer = 1 / sqrt (Slope * Slope + 1.0);
782 
783  Values [0] = Slope * Normalizer;
784  Values [1] = - Normalizer;
785  Values [2] = Intercept * Normalizer;
786 } // Normalize
787 
788 /*-------------------------------------------------------------------------*/
789 void FreeNormProtoList(LIST CharList)
790 
791 {
792  LABELEDLIST char_sample;
793 
794  LIST nodes = CharList;
795  iterate(CharList) /* iterate through all of the fonts */
796  {
797  char_sample = (LABELEDLIST) first_node (CharList);
798  FreeLabeledList (char_sample);
799  }
800  destroy(nodes);
801 
802 } // FreeNormProtoList
803 
804 /*---------------------------------------------------------------------------*/
806  LIST* NormProtoList,
807  LIST ProtoList,
808  char* CharName)
809 {
810  PROTOTYPE* Proto;
811  LABELEDLIST LabeledProtoList;
812 
813  LabeledProtoList = NewLabeledList(CharName);
814  iterate(ProtoList)
815  {
816  Proto = (PROTOTYPE *) first_node (ProtoList);
817  LabeledProtoList->List = push(LabeledProtoList->List, Proto);
818  }
819  *NormProtoList = push(*NormProtoList, LabeledProtoList);
820 }
821 
822 /*---------------------------------------------------------------------------*/
823 int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos,
824  BOOL8 CountInsigProtos) {
825  int N = 0;
826  PROTOTYPE* Proto;
827 
828  iterate(ProtoList)
829  {
830  Proto = (PROTOTYPE *) first_node ( ProtoList );
831  if ((Proto->Significant && CountSigProtos) ||
832  (!Proto->Significant && CountInsigProtos))
833  N++;
834  }
835  return(N);
836 }
DISTRIBUTION * Distrib
Definition: cluster.h:77
LABELEDLIST FindList(LIST List, char *Label)
FLOAT32 LogMagnitude
Definition: cluster.h:80
void CleanUpUnusedData(LIST ProtoList)
bool Serialize(FILE *fp) const
Definition: shapetable.cpp:246
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:121
int tessoptind
Definition: tessopt.cpp:24
bool LoadFontInfo(const char *filename)
void truncate_at(inT32 index)
Definition: strngs.cpp:269
FLOAT32 * Elliptical
Definition: cluster.h:64
void AddToNormProtosList(LIST *NormProtoList, LIST ProtoList, char *CharName)
FEATURE Features[1]
Definition: ocrfeatures.h:72
int32_t inT32
Definition: host.h:38
STRING_PARAM_FLAG(configfile, "", "File to load more configs from")
void SetFeatureSpace(const IntFeatureSpace &fs)
Definition: mastertrainer.h:82
MERGE_CLASS_NODE * MERGE_CLASS
LIST destroy(LIST list)
Definition: oldlist.cpp:182
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
ParamsVectors * params()
Definition: ccutil.h:62
void Normalize(float *Values)
DOUBLE_PARAM_FLAG(clusterconfig_min_samples_fraction, Config.MinSamples, "Min number of samples per proto as % of total")
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:71
void WriteShapeTable(const STRING &file_prefix, const ShapeTable &shape_table)
#define MAX_NUM_PROTOS
Definition: intproto.h:47
#define WordsInVectorOfSize(NumBits)
Definition: bitvec.h:63
FLOAT32 A
Definition: protos.h:44
void FreeNormProtoList(LIST CharList)
unsigned Merged
Definition: cluster.h:69
void * Emalloc(int Size)
Definition: emalloc.cpp:47
inT16 MaxNumConfigs
Definition: protos.h:63
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:59
inT16 NumConfigs
Definition: protos.h:62
#define tprintf(...)
Definition: tprintf.h:31
PARAM_DESC * ParamDesc
Definition: cluster.h:88
FLOAT32 MinSamples
Definition: cluster.h:50
const UNICHARSET & unicharset() const
inT16 NumProtos
Definition: protos.h:59
const char * string() const
Definition: strngs.cpp:198
bool DeSerialize(TFile *fp)
Definition: shapetable.cpp:252
uinT32 * BIT_VECTOR
Definition: bitvec.h:28
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:332
struct LABELEDLISTNODE * LABELEDLIST
inT32 length() const
Definition: strngs.cpp:193
void ParseArguments(int *argc, char ***argv)
#define NIL_LIST
Definition: oldlist.h:126
FLOAT32 MaxIllegal
Definition: cluster.h:51
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:258
CLUSTERCONFIG Config
int NumShapes() const
Definition: shapetable.h:275
FLOATUNION Weight
Definition: cluster.h:83
void Init(uinT8 xbuckets, uinT8 ybuckets, uinT8 thetabuckets)
MERGE_CLASS FindClass(LIST List, const char *Label)
BIT_VECTOR NewBitVector(int NumBits)
Definition: bitvec.cpp:89
CONFIGS Configurations
Definition: protos.h:64
#define UNICHAR_LEN
Definition: unichar.h:30
FLOAT64 Confidence
Definition: cluster.h:54
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
Definition: featdefs.h:44
FLOATUNION Variance
Definition: cluster.h:81
CLUSTERER * MakeClusterer(inT16 SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:399
inT16 SampleSize
Definition: cluster.h:87
unsigned Style
Definition: cluster.h:74
FLOAT32 Length
Definition: protos.h:50
PROTO_STRUCT * PROTO
Definition: protos.h:52
#define PI
Definition: const.h:19
uinT32 NumFeatureSets
Definition: featdefs.h:43
void LoadPageImages(const char *filename)
const int kBoostDirBuckets
unsigned char BOOL8
Definition: host.h:44
Definition: strngs.h:45
MasterTrainer * LoadTrainingData(int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
PROTO Prototypes
Definition: protos.h:61
void move(UnicityTable< T > *from)
CLUSTERER * SetUpForClustering(const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:573
void ReadTrainingSamples(const char *page_name, const FEATURE_DEFS_STRUCT &feature_defs, bool verification)
const char * GetNextFilename(int argc, const char *const *argv)
CLASS_STRUCT * SetUpForFloat2Int(const UNICHARSET &unicharset, LIST LabeledClassList)
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:40
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:50
FEATURE_SET_STRUCT * FEATURE_SET
Definition: ocrfeatures.h:74
FLOAT32 * Mean
Definition: cluster.h:78
void FreeClass(CLASS_TYPE Class)
Definition: protos.cpp:214
LIST RemoveInsignificantProtos(LIST ProtoList, BOOL8 KeepSigProtos, BOOL8 KeepInsigProtos, int N)
FLOAT32 Y
Definition: protos.h:48
#define first_node(l)
Definition: oldlist.h:139
FLOAT32 X
Definition: protos.h:47
CLASS_TYPE NewClass(int NumProtos, int NumConfigs)
Definition: protos.cpp:247
FLOAT32 Independence
Definition: cluster.h:53
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:297
inT32 NumChar
Definition: cluster.h:93
ShapeTable * LoadShapeTable(const STRING &file_prefix)
#define MAX(x, y)
Definition: ndminx.h:24
void MergeInsignificantProtos(LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
void FreeLabeledList(LABELEDLIST LabeledList)
void SetupFlatShapeTable(ShapeTable *shape_table)
float FLOAT32
Definition: host.h:42
FLOAT32 Angle
Definition: protos.h:49
UnicityTableEqEq< int > font_set
Definition: protos.h:65
#define MIN(x, y)
Definition: ndminx.h:28
const int kBoostXYBuckets
bool save_to_file(const char *const filename) const
Definition: unicharset.h:308
FLOAT32 ComputeDistance(int k, PARAM_DESC *dim, FLOAT32 p1[], FLOAT32 p2[])
Definition: kdtree.cpp:471
const char * c_str() const
Definition: strngs.cpp:209
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:38
void ReadTrainingSamples(const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
FEATURE_DEFS_STRUCT feature_defs
void FreeLabeledClassList(LIST ClassList)
unsigned NumSamples
Definition: cluster.h:75
INT_PARAM_FLAG(debug_level, 0, "Level of Trainer debugging")
int size() const
Definition: unicharset.h:299
int NumberOfProtos(LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
CCUtil ccutil
#define MAX_NUM_CONFIGS
Definition: intproto.h:46
CLASS_TYPE Class
FLOAT32 C
Definition: protos.h:46
FLOAT32 B
Definition: protos.h:45
unsigned Significant
Definition: cluster.h:68
inT32 MergeClusters(inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
CLUSTER * Cluster
Definition: cluster.h:76
FLOAT32 Params[1]
Definition: ocrfeatures.h:65
LABELEDLIST NewLabeledList(const char *Label)
bool Serialize(FILE *fp) const
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
inT16 MaxNumProtos
Definition: protos.h:60
bool AddSpacingInfo(const char *filename)
#define iterate(l)
Definition: oldlist.h:159
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
void LoadUnicharset(const char *filename)
bool LoadXHeights(const char *filename)
MERGE_CLASS NewLabeledClass(const char *Label)
FLOATUNION Magnitude
Definition: cluster.h:82
LIST push(LIST list, void *element)
Definition: oldlist.cpp:317
uint16_t uinT16
Definition: host.h:37
FLOAT32 TotalMagnitude
Definition: cluster.h:79
#define ProtoIn(Class, Pid)
Definition: protos.h:123
void InitIntegerFX()
Definition: intfx.cpp:55
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612
void FreeTrainingSamples(LIST CharList)
SAMPLE * MakeSample(CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
Definition: cluster.cpp:455