tesseract  4.00.00dev
commontraining.cpp File Reference
#include "commontraining.h"
#include "allheaders.h"
#include "ccutil.h"
#include "classify.h"
#include "cluster.h"
#include "clusttool.h"
#include "efio.h"
#include "emalloc.h"
#include "featdefs.h"
#include "fontinfo.h"
#include "globals.h"
#include "intfeaturespace.h"
#include "mastertrainer.h"
#include "mf.h"
#include "ndminx.h"
#include "oldlist.h"
#include "params.h"
#include "shapetable.h"
#include "tessdatamanager.h"
#include "tessopt.h"
#include "tprintf.h"
#include "unicity_table.h"
#include <assert.h>
#include <math.h>

Go to the source code of this file.

Namespaces

 tesseract
 

Functions

 INT_PARAM_FLAG (debug_level, 0, "Level of Trainer debugging")
 
 INT_PARAM_FLAG (load_images, 0, "Load images with tr files")
 
 STRING_PARAM_FLAG (configfile, "", "File to load more configs from")
 
 STRING_PARAM_FLAG (D, "", "Directory to write output files to")
 
 STRING_PARAM_FLAG (F, "font_properties", "File listing font properties")
 
 STRING_PARAM_FLAG (X, "", "File listing font xheights")
 
 STRING_PARAM_FLAG (U, "unicharset", "File to load unicharset from")
 
 STRING_PARAM_FLAG (O, "", "File to write unicharset to")
 
 STRING_PARAM_FLAG (output_trainer, "", "File to write trainer to")
 
 STRING_PARAM_FLAG (test_ch, "", "UTF8 test character string")
 
 DOUBLE_PARAM_FLAG (clusterconfig_min_samples_fraction, Config.MinSamples, "Min number of samples per proto as % of total")
 
 DOUBLE_PARAM_FLAG (clusterconfig_max_illegal, Config.MaxIllegal, "Max percentage of samples in a cluster which have more" " than 1 feature in that cluster")
 
 DOUBLE_PARAM_FLAG (clusterconfig_independence, Config.Independence, "Desired independence between dimensions")
 
 DOUBLE_PARAM_FLAG (clusterconfig_confidence, Config.Confidence, "Desired confidence in prototypes created")
 
void ParseArguments (int *argc, char ***argv)
 
ShapeTable * tesseract::LoadShapeTable (const STRING &file_prefix)
 
void tesseract::WriteShapeTable (const STRING &file_prefix, const ShapeTable &shape_table)
 
MasterTrainer * tesseract::LoadTrainingData (int argc, const char *const *argv, bool replication, ShapeTable **shape_table, STRING *file_prefix)
 
const char * GetNextFilename (int argc, const char *const *argv)
 
LABELEDLIST FindList (LIST List, char *Label)
 
LABELEDLIST NewLabeledList (const char *Label)
 
void ReadTrainingSamples (const FEATURE_DEFS_STRUCT &feature_defs, const char *feature_name, int max_samples, UNICHARSET *unicharset, FILE *file, LIST *training_samples)
 
void FreeTrainingSamples (LIST CharList)
 
void FreeLabeledList (LABELEDLIST LabeledList)
 
CLUSTERER * SetUpForClustering (const FEATURE_DEFS_STRUCT &FeatureDefs, LABELEDLIST char_sample, const char *program_feature_type)
 
void MergeInsignificantProtos (LIST ProtoList, const char *label, CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
void CleanUpUnusedData (LIST ProtoList)
 
LIST RemoveInsignificantProtos (LIST ProtoList, BOOL8 KeepSigProtos, BOOL8 KeepInsigProtos, int N)
 
MERGE_CLASS FindClass (LIST List, const char *Label)
 
MERGE_CLASS NewLabeledClass (const char *Label)
 
void FreeLabeledClassList (LIST ClassList)
 
CLASS_STRUCT * SetUpForFloat2Int (const UNICHARSET &unicharset, LIST LabeledClassList)
 
void Normalize (float *Values)
 
void FreeNormProtoList (LIST CharList)
 
void AddToNormProtosList (LIST *NormProtoList, LIST ProtoList, char *CharName)
 
int NumberOfProtos (LIST ProtoList, BOOL8 CountSigProtos, BOOL8 CountInsigProtos)
 

Variables

CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 }
 
FEATURE_DEFS_STRUCT feature_defs
 
CCUtil ccutil
 

Function Documentation

◆ AddToNormProtosList()

void AddToNormProtosList ( LIST *  NormProtoList,
LIST  ProtoList,
char *  CharName 
)

Definition at line 805 of file commontraining.cpp.

809 {
810  PROTOTYPE* Proto;
811  LABELEDLIST LabeledProtoList;
812 
813  LabeledProtoList = NewLabeledList(CharName);
814  iterate(ProtoList)
815  {
816  Proto = (PROTOTYPE *) first_node (ProtoList);
817  LabeledProtoList->List = push(LabeledProtoList->List, Proto);
818  }
819  *NormProtoList = push(*NormProtoList, LabeledProtoList);
820 }
#define first_node(l)
Definition: oldlist.h:139
LABELEDLIST NewLabeledList(const char *Label)
#define iterate(l)
Definition: oldlist.h:159
LIST push(LIST list, void *element)
Definition: oldlist.cpp:317

◆ CleanUpUnusedData()

void CleanUpUnusedData ( LIST  ProtoList)

Definition at line 575 of file commontraining.cpp.

577 {
578  PROTOTYPE* Prototype;
579 
580  iterate(ProtoList)
581  {
582  Prototype = (PROTOTYPE *) first_node (ProtoList);
583  free(Prototype->Variance.Elliptical);
584  Prototype->Variance.Elliptical = nullptr;
585  free(Prototype->Magnitude.Elliptical);
586  Prototype->Magnitude.Elliptical = nullptr;
587  free(Prototype->Weight.Elliptical);
588  Prototype->Weight.Elliptical = nullptr;
589  }
590 }
FLOAT32 * Elliptical
Definition: cluster.h:64
FLOATUNION Weight
Definition: cluster.h:83
FLOATUNION Variance
Definition: cluster.h:81
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159
FLOATUNION Magnitude
Definition: cluster.h:82

◆ DOUBLE_PARAM_FLAG() [1/4]

DOUBLE_PARAM_FLAG ( clusterconfig_min_samples_fraction  ,
Config.  MinSamples,
"Min number of samples per proto as % of total"   
)

◆ DOUBLE_PARAM_FLAG() [2/4]

DOUBLE_PARAM_FLAG ( clusterconfig_max_illegal  ,
Config.  MaxIllegal,
"Max percentage of samples in a cluster which have more" " than 1 feature in that cluster"   
)

◆ DOUBLE_PARAM_FLAG() [3/4]

DOUBLE_PARAM_FLAG ( clusterconfig_independence  ,
Config.  Independence,
"Desired independence between dimensions"   
)

◆ DOUBLE_PARAM_FLAG() [4/4]

DOUBLE_PARAM_FLAG ( clusterconfig_confidence  ,
Config.  Confidence,
"Desired confidence in prototypes created"   
)

◆ FindClass()

MERGE_CLASS FindClass ( LIST  List,
const char *  Label 
)

Definition at line 658 of file commontraining.cpp.

658  {
659  MERGE_CLASS MergeClass;
660 
661  iterate (List)
662  {
663  MergeClass = (MERGE_CLASS) first_node (List);
664  if (strcmp (MergeClass->Label, Label) == 0)
665  return (MergeClass);
666  }
667  return (nullptr);
668 
669 } /* FindClass */
MERGE_CLASS_NODE * MERGE_CLASS
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159

◆ FindList()

LABELEDLIST FindList ( LIST  List,
char *  Label 
)

This routine searches through a list of labeled lists to find a list with the specified label. If a matching labeled list cannot be found, nullptr is returned.

Parameters
List: list to search
Label: label to search for
Returns
Labeled list with the specified label or nullptr.
Note
Globals: none
Exceptions: none
History: Fri Aug 18 15:57:41 1989, DSJ, Created.

Definition at line 305 of file commontraining.cpp.

305  {
306  LABELEDLIST LabeledList;
307 
308  iterate (List)
309  {
310  LabeledList = (LABELEDLIST) first_node (List);
311  if (strcmp (LabeledList->Label, Label) == 0)
312  return (LabeledList);
313  }
314  return (nullptr);
315 
316 } /* FindList */
struct LABELEDLISTNODE * LABELEDLIST
#define first_node(l)
Definition: oldlist.h:139
#define iterate(l)
Definition: oldlist.h:159

◆ FreeLabeledClassList()

void FreeLabeledClassList ( LIST  ClassList)

This routine deallocates all of the space allocated to the specified list of labeled classes.

Parameters
ClassList: list of all fonts in document
Returns
none
Note
Globals: none
Exceptions: none
History: Fri Aug 18 17:44:27 1989, DSJ, Created.

Definition at line 693 of file commontraining.cpp.

693  {
694  MERGE_CLASS MergeClass;
695 
696  LIST nodes = ClassList;
697  iterate(ClassList) /* iterate through all of the fonts */
698  {
699  MergeClass = (MERGE_CLASS) first_node (ClassList);
700  free (MergeClass->Label);
701  FreeClass(MergeClass->Class);
702  delete MergeClass;
703  }
704  destroy(nodes);
705 
706 } /* FreeLabeledClassList */
MERGE_CLASS_NODE * MERGE_CLASS
LIST destroy(LIST list)
Definition: oldlist.cpp:182
void FreeClass(CLASS_TYPE Class)
Definition: protos.cpp:214
#define first_node(l)
Definition: oldlist.h:139
CLASS_TYPE Class
#define iterate(l)
Definition: oldlist.h:159

◆ FreeLabeledList()

void FreeLabeledList ( LABELEDLIST  LabeledList)

This routine deallocates all of the memory consumed by a labeled list. It does not free any memory which may be consumed by the items in the list.

Parameters
LabeledList: labeled list to be freed
Note
Globals: none
Returns
none
Note
Exceptions: none
History: Fri Aug 18 17:52:45 1989, DSJ, Created.

Definition at line 457 of file commontraining.cpp.

457  {
458  destroy(LabeledList->List);
459  free(LabeledList->Label);
460  free(LabeledList);
461 } /* FreeLabeledList */
LIST destroy(LIST list)
Definition: oldlist.cpp:182

◆ FreeNormProtoList()

void FreeNormProtoList ( LIST  CharList)

Definition at line 789 of file commontraining.cpp.

791 {
792  LABELEDLIST char_sample;
793 
794  LIST nodes = CharList;
795  iterate(CharList) /* iterate through all of the fonts */
796  {
797  char_sample = (LABELEDLIST) first_node (CharList);
798  FreeLabeledList (char_sample);
799  }
800  destroy(nodes);
801 
802 } // FreeNormProtoList
LIST destroy(LIST list)
Definition: oldlist.cpp:182
struct LABELEDLISTNODE * LABELEDLIST
#define first_node(l)
Definition: oldlist.h:139
void FreeLabeledList(LABELEDLIST LabeledList)
#define iterate(l)
Definition: oldlist.h:159

◆ FreeTrainingSamples()

void FreeTrainingSamples ( LIST  CharList)

This routine deallocates all of the space allocated to the specified list of training samples.

Parameters
CharList: list of all fonts in document
Returns
none
Note
Globals: none
Exceptions: none
History: Fri Aug 18 17:44:27 1989, DSJ, Created.

Definition at line 428 of file commontraining.cpp.

428  {
429  LABELEDLIST char_sample;
430  FEATURE_SET FeatureSet;
431  LIST FeatureList;
432 
433  LIST nodes = CharList;
434  iterate(CharList) { /* iterate through all of the fonts */
435  char_sample = (LABELEDLIST) first_node(CharList);
436  FeatureList = char_sample->List;
437  iterate(FeatureList) { /* iterate through all of the classes */
438  FeatureSet = (FEATURE_SET) first_node(FeatureList);
439  FreeFeatureSet(FeatureSet);
440  }
441  FreeLabeledList(char_sample);
442  }
443  destroy(nodes);
444 } /* FreeTrainingSamples */
LIST destroy(LIST list)
Definition: oldlist.cpp:182
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:71
struct LABELEDLISTNODE * LABELEDLIST
FEATURE_SET_STRUCT * FEATURE_SET
Definition: ocrfeatures.h:74
#define first_node(l)
Definition: oldlist.h:139
void FreeLabeledList(LABELEDLIST LabeledList)
#define iterate(l)
Definition: oldlist.h:159

◆ GetNextFilename()

const char* GetNextFilename ( int  argc,
const char *const *  argv 
)

This routine returns the next command line argument. If there are no remaining command line arguments, it returns nullptr. This routine should only be called after all option arguments have been parsed and removed with ParseArguments.

Globals:

  • tessoptind defined by tessopt sys call
    Returns
    Next command line argument or nullptr.
    Note
    Exceptions: none
    History: Fri Aug 18 09:34:12 1989, DSJ, Created.

Definition at line 286 of file commontraining.cpp.

286  {
287  if (tessoptind < argc)
288  return argv[tessoptind++];
289  else
290  return nullptr;
291 } /* GetNextFilename */
int tessoptind
Definition: tessopt.cpp:24

◆ INT_PARAM_FLAG() [1/2]

INT_PARAM_FLAG ( debug_level  ,
0  ,
"Level of Trainer debugging"   
)

◆ INT_PARAM_FLAG() [2/2]

INT_PARAM_FLAG ( load_images  ,
0  ,
"Load images with tr files"   
)

◆ MergeInsignificantProtos()

void MergeInsignificantProtos ( LIST  ProtoList,
const char *  label,
CLUSTERER *  Clusterer,
CLUSTERCONFIG *  Config 
)

Definition at line 510 of file commontraining.cpp.

511  {
512  PROTOTYPE* Prototype;
513  bool debug = strcmp(FLAGS_test_ch.c_str(), label) == 0;
514 
515  LIST pProtoList = ProtoList;
516  iterate(pProtoList) {
517  Prototype = (PROTOTYPE *) first_node (pProtoList);
518  if (Prototype->Significant || Prototype->Merged)
519  continue;
520  FLOAT32 best_dist = 0.125;
521  PROTOTYPE* best_match = nullptr;
522  // Find the nearest alive prototype.
523  LIST list_it = ProtoList;
524  iterate(list_it) {
525  PROTOTYPE* test_p = (PROTOTYPE *) first_node (list_it);
526  if (test_p != Prototype && !test_p->Merged) {
527  FLOAT32 dist = ComputeDistance(Clusterer->SampleSize,
528  Clusterer->ParamDesc,
529  Prototype->Mean, test_p->Mean);
530  if (dist < best_dist) {
531  best_match = test_p;
532  best_dist = dist;
533  }
534  }
535  }
536  if (best_match != nullptr && !best_match->Significant) {
537  if (debug)
538  tprintf("Merging red clusters (%d+%d) at %g,%g and %g,%g\n",
539  best_match->NumSamples, Prototype->NumSamples,
540  best_match->Mean[0], best_match->Mean[1],
541  Prototype->Mean[0], Prototype->Mean[1]);
542  best_match->NumSamples = MergeClusters(Clusterer->SampleSize,
543  Clusterer->ParamDesc,
544  best_match->NumSamples,
545  Prototype->NumSamples,
546  best_match->Mean,
547  best_match->Mean, Prototype->Mean);
548  Prototype->NumSamples = 0;
549  Prototype->Merged = 1;
550  } else if (best_match != nullptr) {
551  if (debug)
552  tprintf("Red proto at %g,%g matched a green one at %g,%g\n",
553  Prototype->Mean[0], Prototype->Mean[1],
554  best_match->Mean[0], best_match->Mean[1]);
555  Prototype->Merged = 1;
556  }
557  }
558  // Mark significant those that now have enough samples.
559  int min_samples = (inT32) (Config->MinSamples * Clusterer->NumChar);
560  pProtoList = ProtoList;
561  iterate(pProtoList) {
562  Prototype = (PROTOTYPE *) first_node (pProtoList);
563  // Process insignificant protos that do not match a green one
564  if (!Prototype->Significant && Prototype->NumSamples >= min_samples &&
565  !Prototype->Merged) {
566  if (debug)
567  tprintf("Red proto at %g,%g becoming green\n",
568  Prototype->Mean[0], Prototype->Mean[1]);
569  Prototype->Significant = true;
570  }
571  }
572 } /* MergeInsignificantProtos */
int32_t inT32
Definition: host.h:38
unsigned Merged
Definition: cluster.h:69
#define tprintf(...)
Definition: tprintf.h:31
PARAM_DESC * ParamDesc
Definition: cluster.h:88
FLOAT32 MinSamples
Definition: cluster.h:50
inT16 SampleSize
Definition: cluster.h:87
FLOAT32 * Mean
Definition: cluster.h:78
#define first_node(l)
Definition: oldlist.h:139
inT32 NumChar
Definition: cluster.h:93
float FLOAT32
Definition: host.h:42
FLOAT32 ComputeDistance(int k, PARAM_DESC *dim, FLOAT32 p1[], FLOAT32 p2[])
Definition: kdtree.cpp:471
unsigned NumSamples
Definition: cluster.h:75
unsigned Significant
Definition: cluster.h:68
inT32 MergeClusters(inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
#define iterate(l)
Definition: oldlist.h:159

◆ NewLabeledClass()

MERGE_CLASS NewLabeledClass ( const char *  Label)

Definition at line 672 of file commontraining.cpp.

672  {
673  MERGE_CLASS MergeClass;
674 
675  MergeClass = new MERGE_CLASS_NODE;
676  MergeClass->Label = (char*)Emalloc (strlen (Label)+1);
677  strcpy (MergeClass->Label, Label);
678  MergeClass->Class = NewClass (MAX_NUM_PROTOS, MAX_NUM_CONFIGS);
679  return (MergeClass);
680 
681 } /* NewLabeledClass */
#define MAX_NUM_PROTOS
Definition: intproto.h:47
void * Emalloc(int Size)
Definition: emalloc.cpp:47
CLASS_TYPE NewClass(int NumProtos, int NumConfigs)
Definition: protos.cpp:247
#define MAX_NUM_CONFIGS
Definition: intproto.h:46
CLASS_TYPE Class

◆ NewLabeledList()

LABELEDLIST NewLabeledList ( const char *  Label)

This routine allocates a new, empty labeled list and gives it the specified label.

Parameters
Label: label for new list
Returns
New, empty labeled list.
Note
Globals: none
Exceptions: none
History: Fri Aug 18 16:08:46 1989, DSJ, Created.

Definition at line 328 of file commontraining.cpp.

328  {
329  LABELEDLIST LabeledList;
330 
331  LabeledList = (LABELEDLIST) Emalloc (sizeof (LABELEDLISTNODE));
332  LabeledList->Label = (char*)Emalloc (strlen (Label)+1);
333  strcpy (LabeledList->Label, Label);
334  LabeledList->List = NIL_LIST;
335  LabeledList->SampleCount = 0;
336  LabeledList->font_sample_count = 0;
337  return (LabeledList);
338 
339 } /* NewLabeledList */
void * Emalloc(int Size)
Definition: emalloc.cpp:47
struct LABELEDLISTNODE * LABELEDLIST
#define NIL_LIST
Definition: oldlist.h:126

◆ Normalize()

void Normalize ( float *  Values)

Definition at line 772 of file commontraining.cpp.

774 {
775  float Slope;
776  float Intercept;
777  float Normalizer;
778 
779  Slope = tan (Values [2] * 2 * PI);
780  Intercept = Values [1] - Slope * Values [0];
781  Normalizer = 1 / sqrt (Slope * Slope + 1.0);
782 
783  Values [0] = Slope * Normalizer;
784  Values [1] = - Normalizer;
785  Values [2] = Intercept * Normalizer;
786 } // Normalize
#define PI
Definition: const.h:19

◆ NumberOfProtos()

int NumberOfProtos ( LIST  ProtoList,
BOOL8  CountSigProtos,
BOOL8  CountInsigProtos 
)

Definition at line 823 of file commontraining.cpp.

824  {
825  int N = 0;
826  PROTOTYPE* Proto;
827 
828  iterate(ProtoList)
829  {
830  Proto = (PROTOTYPE *) first_node ( ProtoList );
831  if ((Proto->Significant && CountSigProtos) ||
832  (!Proto->Significant && CountInsigProtos))
833  N++;
834  }
835  return(N);
836 }
#define first_node(l)
Definition: oldlist.h:139
unsigned Significant
Definition: cluster.h:68
#define iterate(l)
Definition: oldlist.h:159

◆ ParseArguments()

void ParseArguments ( int *  argc,
char ***  argv 
)

This routine parses the command line arguments that were passed to the program and uses them to set relevant training-related global parameters.

Globals:

  • Config current clustering parameters
    Parameters
    argc: number of command line arguments to parse
    argv: command line arguments
    Returns
    none
    Note
    Exceptions: Illegal options terminate the program.

Definition at line 86 of file commontraining.cpp.

86  {
87  STRING usage;
88  if (*argc) {
89  usage += (*argv)[0];
90  }
91  usage += " [.tr files ...]";
92  tesseract::ParseCommandLineFlags(usage.c_str(), argc, argv, true);
93  // Record the index of the first non-flag argument to 1, since we set
94  // remove_flags to true when parsing the flags.
95  tessoptind = 1;
96  // Set some global values based on the flags.
97  Config.MinSamples =
98  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_min_samples_fraction)));
99  Config.MaxIllegal =
100  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_max_illegal)));
101  Config.Independence =
102  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_independence)));
103  Config.Confidence =
104  MAX(0.0, MIN(1.0, double(FLAGS_clusterconfig_confidence)));
105  // Set additional parameters from config file if specified.
106  if (!FLAGS_configfile.empty()) {
107  tesseract::ParamUtils::ReadParamsFile(
108  FLAGS_configfile.c_str(),
109  tesseract::SET_PARAM_CONSTRAINT_NON_INIT_ONLY,
110  ccutil.params());
111  }
112 }
int tessoptind
Definition: tessopt.cpp:24
ParamsVectors * params()
Definition: ccutil.h:62
FLOAT32 MinSamples
Definition: cluster.h:50
FLOAT32 MaxIllegal
Definition: cluster.h:51
CLUSTERCONFIG Config
FLOAT64 Confidence
Definition: cluster.h:54
Definition: strngs.h:45
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:40
FLOAT32 Independence
Definition: cluster.h:53
#define MAX(x, y)
Definition: ndminx.h:24
#define MIN(x, y)
Definition: ndminx.h:28
const char * c_str() const
Definition: strngs.cpp:209
CCUtil ccutil
void ParseCommandLineFlags(const char *usage, int *argc, char ***argv, const bool remove_flags)

◆ ReadTrainingSamples()

void ReadTrainingSamples ( const FEATURE_DEFS_STRUCT &  feature_defs,
const char *  feature_name,
int  max_samples,
UNICHARSET *  unicharset,
FILE *  file,
LIST *  training_samples 
)

This routine reads training samples from a file and places them into a data structure which organizes the samples by FontName and CharName. It then returns this data structure.

Parameters
file: open text file to read samples from
feature_defs
feature_name
max_samples
unicharset
training_samples
Returns
none
Note
Globals: none
Exceptions: none
History:
  • Fri Aug 18 13:11:39 1989, DSJ, Created.
  • Tue May 17 1998 simplifications to structure, eliminated font, and feature specification levels of structure.

Definition at line 363 of file commontraining.cpp.

366  {
367  char buffer[2048];
368  char unichar[UNICHAR_LEN + 1];
369  LABELEDLIST char_sample;
370  FEATURE_SET feature_samples;
371  CHAR_DESC char_desc;
372  int ShortNameToFeatureType_res = ShortNameToFeatureType(feature_defs, feature_name);
373  assert(0 <= ShortNameToFeatureType_res);
374  unsigned int feature_type = static_cast<unsigned int>(ShortNameToFeatureType_res);
375  // Zero out the font_sample_count for all the classes.
376  LIST it = *training_samples;
377  iterate(it) {
378  char_sample = reinterpret_cast<LABELEDLIST>(first_node(it));
379  char_sample->font_sample_count = 0;
380  }
381 
382  while (fgets(buffer, 2048, file) != nullptr) {
383  if (buffer[0] == '\n')
384  continue;
385 
386  sscanf(buffer, "%*s %s", unichar);
387  if (unicharset != nullptr && !unicharset->contains_unichar(unichar)) {
388  unicharset->unichar_insert(unichar);
389  if (unicharset->size() > MAX_NUM_CLASSES) {
390  tprintf("Error: Size of unicharset in training is "
391  "greater than MAX_NUM_CLASSES\n");
392  exit(1);
393  }
394  }
395  char_sample = FindList(*training_samples, unichar);
396  if (char_sample == nullptr) {
397  char_sample = NewLabeledList(unichar);
398  *training_samples = push(*training_samples, char_sample);
399  }
400  char_desc = ReadCharDescription(feature_defs, file);
401  feature_samples = char_desc->FeatureSets[feature_type];
402  if (char_sample->font_sample_count < max_samples || max_samples <= 0) {
403  char_sample->List = push(char_sample->List, feature_samples);
404  char_sample->SampleCount++;
405  char_sample->font_sample_count++;
406  } else {
407  FreeFeatureSet(feature_samples);
408  }
409  for (size_t i = 0; i < char_desc->NumFeatureSets; i++) {
410  if (feature_type != i)
411  FreeFeatureSet(char_desc->FeatureSets[i]);
412  }
413  free(char_desc);
414  }
415 } // ReadTrainingSamples
LABELEDLIST FindList(LIST List, char *Label)
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
void FreeFeatureSet(FEATURE_SET FeatureSet)
Definition: ocrfeatures.cpp:71
#define tprintf(...)
Definition: tprintf.h:31
CHAR_DESC ReadCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, FILE *File)
Definition: featdefs.cpp:258
#define UNICHAR_LEN
Definition: unichar.h:30
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
Definition: featdefs.h:44
uinT32 NumFeatureSets
Definition: featdefs.h:43
#define first_node(l)
Definition: oldlist.h:139
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:297
int size() const
Definition: unicharset.h:299
LABELEDLIST NewLabeledList(const char *Label)
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
#define iterate(l)
Definition: oldlist.h:159
LIST push(LIST list, void *element)
Definition: oldlist.cpp:317
void unichar_insert(const char *const unichar_repr)
Definition: unicharset.cpp:612

◆ RemoveInsignificantProtos()

LIST RemoveInsignificantProtos ( LIST  ProtoList,
BOOL8  KeepSigProtos,
BOOL8  KeepInsigProtos,
int  N 
)

Definition at line 593 of file commontraining.cpp.

599 {
600  LIST NewProtoList = NIL_LIST;
601  LIST pProtoList;
602  PROTOTYPE* Proto;
603  PROTOTYPE* NewProto;
604  int i;
605 
606  pProtoList = ProtoList;
607  iterate(pProtoList)
608  {
609  Proto = (PROTOTYPE *) first_node (pProtoList);
610  if ((Proto->Significant && KeepSigProtos) ||
611  (!Proto->Significant && KeepInsigProtos))
612  {
613  NewProto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE));
614 
615  NewProto->Mean = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
616  NewProto->Significant = Proto->Significant;
617  NewProto->Style = Proto->Style;
618  NewProto->NumSamples = Proto->NumSamples;
619  NewProto->Cluster = nullptr;
620  NewProto->Distrib = nullptr;
621 
622  for (i=0; i < N; i++)
623  NewProto->Mean[i] = Proto->Mean[i];
624  if (Proto->Variance.Elliptical != nullptr) {
625  NewProto->Variance.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
626  for (i=0; i < N; i++)
627  NewProto->Variance.Elliptical[i] = Proto->Variance.Elliptical[i];
628  }
629  else
630  NewProto->Variance.Elliptical = nullptr;
631  //---------------------------------------------
632  if (Proto->Magnitude.Elliptical != nullptr) {
633  NewProto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
634  for (i=0; i < N; i++)
635  NewProto->Magnitude.Elliptical[i] = Proto->Magnitude.Elliptical[i];
636  }
637  else
638  NewProto->Magnitude.Elliptical = nullptr;
639  //------------------------------------------------
640  if (Proto->Weight.Elliptical != nullptr) {
641  NewProto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
642  for (i=0; i < N; i++)
643  NewProto->Weight.Elliptical[i] = Proto->Weight.Elliptical[i];
644  }
645  else
646  NewProto->Weight.Elliptical = nullptr;
647 
648  NewProto->TotalMagnitude = Proto->TotalMagnitude;
649  NewProto->LogMagnitude = Proto->LogMagnitude;
650  NewProtoList = push_last(NewProtoList, NewProto);
651  }
652  }
653  FreeProtoList(&ProtoList);
654  return (NewProtoList);
655 } /* RemoveInsignificantProtos */
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOAT32 * Elliptical
Definition: cluster.h:64
void * Emalloc(int Size)
Definition: emalloc.cpp:47
LIST push_last(LIST list, void *item)
Definition: oldlist.cpp:332
#define NIL_LIST
Definition: oldlist.h:126
FLOATUNION Weight
Definition: cluster.h:83
FLOATUNION Variance
Definition: cluster.h:81
unsigned Style
Definition: cluster.h:74
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:573
FLOAT32 * Mean
Definition: cluster.h:78
#define first_node(l)
Definition: oldlist.h:139
float FLOAT32
Definition: host.h:42
unsigned NumSamples
Definition: cluster.h:75
unsigned Significant
Definition: cluster.h:68
CLUSTER * Cluster
Definition: cluster.h:76
#define iterate(l)
Definition: oldlist.h:159
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 TotalMagnitude
Definition: cluster.h:79

◆ SetUpForClustering()

CLUSTERER* SetUpForClustering ( const FEATURE_DEFS_STRUCT &  FeatureDefs,
LABELEDLIST  char_sample,
const char *  program_feature_type 
)

This routine reads samples from a LABELEDLIST and enters those samples into a clusterer data structure. This data structure is then returned to the caller.

Parameters
char_sample: LABELEDLIST that holds all the feature information for a given character.
FeatureDefs
program_feature_type
Returns
Pointer to new clusterer data structure.
Note
Globals: None
Exceptions: None
History: 8/16/89, DSJ, Created.

Definition at line 477 of file commontraining.cpp.

479  {
480  uinT16 N;
481  int i, j;
482  FLOAT32* Sample = nullptr;
483  CLUSTERER *Clusterer;
484  inT32 CharID;
485  LIST FeatureList = nullptr;
486  FEATURE_SET FeatureSet = nullptr;
487 
488  int desc_index = ShortNameToFeatureType(FeatureDefs, program_feature_type);
489  N = FeatureDefs.FeatureDesc[desc_index]->NumParams;
490  Clusterer = MakeClusterer(N, FeatureDefs.FeatureDesc[desc_index]->ParamDesc);
491 
492  FeatureList = char_sample->List;
493  CharID = 0;
494  iterate(FeatureList) {
495  FeatureSet = (FEATURE_SET) first_node(FeatureList);
496  for (i = 0; i < FeatureSet->MaxNumFeatures; i++) {
497  if (Sample == nullptr) Sample = (FLOAT32*)Emalloc(N * sizeof(FLOAT32));
498  for (j = 0; j < N; j++)
499  Sample[j] = FeatureSet->Features[i]->Params[j];
500  MakeSample (Clusterer, Sample, CharID);
501  }
502  CharID++;
503  }
504  free(Sample);
505  return Clusterer;
506 
507 } /* SetUpForClustering */
FEATURE Features[1]
Definition: ocrfeatures.h:72
int32_t inT32
Definition: host.h:38
void * Emalloc(int Size)
Definition: emalloc.cpp:47
const PARAM_DESC * ParamDesc
Definition: ocrfeatures.h:59
CLUSTERER * MakeClusterer(inT16 SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:399
const FEATURE_DESC_STRUCT * FeatureDesc[NUM_FEATURE_TYPES]
Definition: featdefs.h:50
FEATURE_SET_STRUCT * FEATURE_SET
Definition: ocrfeatures.h:74
#define first_node(l)
Definition: oldlist.h:139
int ShortNameToFeatureType(const FEATURE_DEFS_STRUCT &FeatureDefs, const char *ShortName)
Definition: featdefs.cpp:297
float FLOAT32
Definition: host.h:42
FLOAT32 Params[1]
Definition: ocrfeatures.h:65
#define iterate(l)
Definition: oldlist.h:159
uint16_t uinT16
Definition: host.h:37
SAMPLE * MakeSample(CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
Definition: cluster.cpp:455

◆ SetUpForFloat2Int()

CLASS_STRUCT* SetUpForFloat2Int ( const UNICHARSET &  unicharset,
LIST  LabeledClassList 
)

Definition at line 709 of file commontraining.cpp.

710  {
711  MERGE_CLASS MergeClass;
712  CLASS_TYPE Class;
713  int NumProtos;
714  int NumConfigs;
715  int NumWords;
716  int i, j;
717  float Values[3];
718  PROTO NewProto;
719  PROTO OldProto;
720  BIT_VECTOR NewConfig;
721  BIT_VECTOR OldConfig;
722 
723  // printf("Float2Int ...\n");
724 
725  CLASS_STRUCT* float_classes = new CLASS_STRUCT[unicharset.size()];
726  iterate(LabeledClassList)
727  {
728  UnicityTableEqEq<int> font_set;
729  MergeClass = (MERGE_CLASS) first_node (LabeledClassList);
730  Class = &float_classes[unicharset.unichar_to_id(MergeClass->Label)];
731  NumProtos = MergeClass->Class->NumProtos;
732  NumConfigs = MergeClass->Class->NumConfigs;
733  font_set.move(&MergeClass->Class->font_set);
734  Class->NumProtos = NumProtos;
735  Class->MaxNumProtos = NumProtos;
736  Class->Prototypes = (PROTO) Emalloc (sizeof(PROTO_STRUCT) * NumProtos);
737  for(i=0; i < NumProtos; i++)
738  {
739  NewProto = ProtoIn(Class, i);
740  OldProto = ProtoIn(MergeClass->Class, i);
741  Values[0] = OldProto->X;
742  Values[1] = OldProto->Y;
743  Values[2] = OldProto->Angle;
744  Normalize(Values);
745  NewProto->X = OldProto->X;
746  NewProto->Y = OldProto->Y;
747  NewProto->Length = OldProto->Length;
748  NewProto->Angle = OldProto->Angle;
749  NewProto->A = Values[0];
750  NewProto->B = Values[1];
751  NewProto->C = Values[2];
752  }
753 
754  Class->NumConfigs = NumConfigs;
755  Class->MaxNumConfigs = NumConfigs;
756  Class->font_set.move(&font_set);
757  Class->Configurations = (BIT_VECTOR*) Emalloc (sizeof(BIT_VECTOR) * NumConfigs);
758  NumWords = WordsInVectorOfSize(NumProtos);
759  for(i=0; i < NumConfigs; i++)
760  {
761  NewConfig = NewBitVector(NumProtos);
762  OldConfig = MergeClass->Class->Configurations[i];
763  for(j=0; j < NumWords; j++)
764  NewConfig[j] = OldConfig[j];
765  Class->Configurations[i] = NewConfig;
766  }
767  }
768  return float_classes;
769 } // SetUpForFloat2Int
MERGE_CLASS_NODE * MERGE_CLASS
void Normalize(float *Values)
#define WordsInVectorOfSize(NumBits)
Definition: bitvec.h:63
FLOAT32 A
Definition: protos.h:44
void * Emalloc(int Size)
Definition: emalloc.cpp:47
inT16 MaxNumConfigs
Definition: protos.h:63
inT16 NumConfigs
Definition: protos.h:62
inT16 NumProtos
Definition: protos.h:59
uinT32 * BIT_VECTOR
Definition: bitvec.h:28
BIT_VECTOR NewBitVector(int NumBits)
Definition: bitvec.cpp:89
CONFIGS Configurations
Definition: protos.h:64
FLOAT32 Length
Definition: protos.h:50
PROTO_STRUCT * PROTO
Definition: protos.h:52
PROTO Prototypes
Definition: protos.h:61
void move(UnicityTable< T > *from)
FLOAT32 Y
Definition: protos.h:48
#define first_node(l)
Definition: oldlist.h:139
FLOAT32 X
Definition: protos.h:47
FLOAT32 Angle
Definition: protos.h:49
UnicityTableEqEq< int > font_set
Definition: protos.h:65
int size() const
Definition: unicharset.h:299
CLASS_TYPE Class
FLOAT32 C
Definition: protos.h:46
FLOAT32 B
Definition: protos.h:45
inT16 MaxNumProtos
Definition: protos.h:60
#define iterate(l)
Definition: oldlist.h:159
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
#define ProtoIn(Class, Pid)
Definition: protos.h:123

◆ STRING_PARAM_FLAG() [1/8]

STRING_PARAM_FLAG ( configfile  ,
""  ,
"File to load more configs from"   
)

◆ STRING_PARAM_FLAG() [2/8]

STRING_PARAM_FLAG ( D  ,
""  ,
"Directory to write output files to"   
)

◆ STRING_PARAM_FLAG() [3/8]

STRING_PARAM_FLAG ( F  ,
"font_properties"  ,
"File listing font properties"   
)

◆ STRING_PARAM_FLAG() [4/8]

STRING_PARAM_FLAG ( X  ,
""  ,
"File listing font xheights"   
)

◆ STRING_PARAM_FLAG() [5/8]

STRING_PARAM_FLAG ( U  ,
"unicharset"  ,
"File to load unicharset from"   
)

◆ STRING_PARAM_FLAG() [6/8]

STRING_PARAM_FLAG ( O  ,
""  ,
"File to write unicharset to"   
)

◆ STRING_PARAM_FLAG() [7/8]

STRING_PARAM_FLAG ( output_trainer  ,
""  ,
"File to write trainer to"   
)

◆ STRING_PARAM_FLAG() [8/8]

STRING_PARAM_FLAG ( test_ch  ,
""  ,
"UTF8 test character string"   
)

Variable Documentation

◆ ccutil

CCUtil ccutil

Definition at line 52 of file commontraining.cpp.

◆ Config

CLUSTERCONFIG Config = { elliptical, 0.625, 0.05, 1.0, 1e-6, 0 }

Definition at line 50 of file commontraining.cpp.

◆ feature_defs

FEATURE_DEFS_STRUCT feature_defs

Definition at line 51 of file commontraining.cpp.