tesseract  4.00.00dev
cluster.cpp File Reference
#include "const.h"
#include "cluster.h"
#include "emalloc.h"
#include "genericheap.h"
#include "helpers.h"
#include "kdpair.h"
#include "matrix.h"
#include "tprintf.h"
#include "danerror.h"
#include <math.h>

Go to the source code of this file.

Classes

struct  TEMPCLUSTER
 
struct  STATISTICS
 
struct  BUCKETS
 
struct  CHISTRUCT
 
struct  ClusteringContext
 

Macros

#define HOTELLING   1
 
#define FTABLE_X   10
 
#define FTABLE_Y   100
 
#define MINVARIANCE   0.0004
 
#define MINSAMPLESPERBUCKET   5
 
#define MINSAMPLES   (MINBUCKETS * MINSAMPLESPERBUCKET)
 
#define MINSAMPLESNEEDED   1
 
#define BUCKETTABLESIZE   1024
 
#define NORMALEXTENT   3.0
 
#define Odd(N)   ((N)%2)
 
#define Mirror(N, R)   ((R) - (N) - 1)
 
#define Abs(N)   ( ( (N) < 0 ) ? ( -(N) ) : (N) )
 
#define SqrtOf2Pi   2.506628275
 
#define LOOKUPTABLESIZE   8
 
#define MAXDEGREESOFFREEDOM   MAXBUCKETS
 
#define MAXNEIGHBORS   2
 
#define MAXDISTANCE   MAX_FLOAT32
 
#define CHIACCURACY   0.01
 
#define MINALPHA   (1e-200)
 
#define INITIALDELTA   0.1
 
#define DELTARATIO   0.1
 
#define ILLEGAL_CHAR   2
 

Typedefs

typedef tesseract::KDPairInc< float, TEMPCLUSTER * > ClusterPair
 
typedef tesseract::GenericHeap< ClusterPairClusterHeap
 
typedef FLOAT64(* DENSITYFUNC) (inT32)
 
typedef FLOAT64(* SOLVEFUNC) (CHISTRUCT *, double)
 

Functions

void CreateClusterTree (CLUSTERER *Clusterer)
 
void MakePotentialClusters (ClusteringContext *context, CLUSTER *Cluster, inT32 Level)
 
CLUSTERFindNearestNeighbor (KDTREE *Tree, CLUSTER *Cluster, FLOAT32 *Distance)
 
CLUSTERMakeNewCluster (CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster)
 
inT32 MergeClusters (inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
 
void ComputePrototypes (CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
PROTOTYPEMakePrototype (CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster)
 
PROTOTYPEMakeDegenerateProto (uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, inT32 MinSamples)
 
PROTOTYPETestEllipticalProto (CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPEMakeSphericalProto (CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
 
PROTOTYPEMakeEllipticalProto (CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
 
PROTOTYPEMakeMixedProto (CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *NormalBuckets, FLOAT64 Confidence)
 
void MakeDimRandom (uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc)
 
void MakeDimUniform (uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics)
 
STATISTICSComputeStatistics (inT16 N, PARAM_DESC ParamDesc[], CLUSTER *Cluster)
 
PROTOTYPENewSphericalProto (uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPENewEllipticalProto (inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPENewMixedProto (inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
 
PROTOTYPENewSimpleProto (inT16 N, CLUSTER *Cluster)
 
BOOL8 Independent (PARAM_DESC ParamDesc[], inT16 N, FLOAT32 *CoVariance, FLOAT32 Independence)
 
BUCKETSGetBuckets (CLUSTERER *clusterer, DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
 
BUCKETSMakeBuckets (DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
 
uinT16 OptimumNumberOfBuckets (uinT32 SampleCount)
 
FLOAT64 ComputeChiSquared (uinT16 DegreesOfFreedom, FLOAT64 Alpha)
 
FLOAT64 NormalDensity (inT32 x)
 
FLOAT64 UniformDensity (inT32 x)
 
FLOAT64 Integral (FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx)
 
void FillBuckets (BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
 
uinT16 NormalBucket (PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
 
uinT16 UniformBucket (PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
 
BOOL8 DistributionOK (BUCKETS *Buckets)
 
void FreeStatistics (STATISTICS *Statistics)
 
void FreeBuckets (BUCKETS *Buckets)
 
void FreeCluster (CLUSTER *Cluster)
 
uinT16 DegreesOfFreedom (DISTRIBUTION Distribution, uinT16 HistogramBuckets)
 
int NumBucketsMatch (void *arg1, void *arg2)
 
int ListEntryMatch (void *arg1, void *arg2)
 
void AdjustBuckets (BUCKETS *Buckets, uinT32 NewSampleCount)
 
void InitBuckets (BUCKETS *Buckets)
 
int AlphaMatch (void *arg1, void *arg2)
 
CHISTRUCTNewChiStruct (uinT16 DegreesOfFreedom, FLOAT64 Alpha)
 
FLOAT64 Solve (SOLVEFUNC Function, void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
 
FLOAT64 ChiArea (CHISTRUCT *ChiParams, FLOAT64 x)
 
BOOL8 MultipleCharSamples (CLUSTERER *Clusterer, CLUSTER *Cluster, FLOAT32 MaxIllegal)
 
double InvertMatrix (const float *input, int size, float *inv)
 
CLUSTERERMakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[])
 
SAMPLEMakeSample (CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
 
LIST ClusterSamples (CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
 
void FreeClusterer (CLUSTERER *Clusterer)
 
void FreeProtoList (LIST *ProtoList)
 
void FreePrototype (void *arg)
 
CLUSTERNextSample (LIST *SearchState)
 
FLOAT32 Mean (PROTOTYPE *Proto, uinT16 Dimension)
 
FLOAT32 StandardDeviation (PROTOTYPE *Proto, uinT16 Dimension)
 
inT32 MergeClusters (inT16 N, PARAM_DESC ParamDesc[], inT32 n1, inT32 n2, FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[])
 

Variables

const double FTable [FTABLE_Y][FTABLE_X]
 

Macro Definition Documentation

◆ Abs

#define Abs (   N)    ( ( (N) < 0 ) ? ( -(N) ) : (N) )

Definition at line 207 of file cluster.cpp.

◆ BUCKETTABLESIZE

#define BUCKETTABLESIZE   1024

define the size of the table which maps normalized samples to histogram buckets. Also define the number of standard deviations in a normal distribution which are considered to be significant. The mapping table will be defined in such a way that it covers the specified number of standard deviations on either side of the mean. BUCKETTABLESIZE should always be even.

Definition at line 159 of file cluster.cpp.

◆ CHIACCURACY

#define CHIACCURACY   0.01

◆ DELTARATIO

#define DELTARATIO   0.1

◆ FTABLE_X

#define FTABLE_X   10

Definition at line 30 of file cluster.cpp.

◆ FTABLE_Y

#define FTABLE_Y   100

Definition at line 31 of file cluster.cpp.

◆ HOTELLING

#define HOTELLING   1

Definition at line 29 of file cluster.cpp.

◆ ILLEGAL_CHAR

#define ILLEGAL_CHAR   2

◆ INITIALDELTA

#define INITIALDELTA   0.1

◆ LOOKUPTABLESIZE

#define LOOKUPTABLESIZE   8

define lookup tables used to compute the number of histogram buckets that should be used for a given number of samples.

Definition at line 227 of file cluster.cpp.

◆ MAXDEGREESOFFREEDOM

#define MAXDEGREESOFFREEDOM   MAXBUCKETS

Definition at line 228 of file cluster.cpp.

◆ MAXDISTANCE

#define MAXDISTANCE   MAX_FLOAT32

◆ MAXNEIGHBORS

#define MAXNEIGHBORS   2

◆ MINALPHA

#define MINALPHA   (1e-200)

◆ MINSAMPLES

#define MINSAMPLES   (MINBUCKETS * MINSAMPLESPERBUCKET)

Definition at line 150 of file cluster.cpp.

◆ MINSAMPLESNEEDED

#define MINSAMPLESNEEDED   1

Definition at line 151 of file cluster.cpp.

◆ MINSAMPLESPERBUCKET

#define MINSAMPLESPERBUCKET   5

define the absolute minimum number of samples which must be present in order to accurately test hypotheses about underlying probability distributions. Define separately the minimum samples that are needed before a statistical analysis is attempted; this number should be equal to MINSAMPLES but can be set to a lower number for early testing when very few samples are available.

Definition at line 149 of file cluster.cpp.

◆ MINVARIANCE

#define MINVARIANCE   0.0004

define the variance which will be used as a minimum variance for any dimension of any feature. Since most features are calculated from numbers with a precision no better than 1 in 128, the variance should never be less than the square of this number for parameters whose range is 1.

Definition at line 141 of file cluster.cpp.

◆ Mirror

#define Mirror (   N,
 
)    ((R) - (N) - 1)

Definition at line 206 of file cluster.cpp.

◆ NORMALEXTENT

#define NORMALEXTENT   3.0

Definition at line 160 of file cluster.cpp.

◆ Odd

#define Odd (   N)    ((N)%2)

Definition at line 205 of file cluster.cpp.

◆ SqrtOf2Pi

#define SqrtOf2Pi   2.506628275

the following variables describe a discrete normal distribution which is used by NormalDensity() and NormalBucket(). The constant NORMALEXTENT determines how many standard deviations of the distribution are mapped onto the fixed discrete range of x. x=0 is mapped to -NORMALEXTENT standard deviations and x=BUCKETTABLESIZE is mapped to +NORMALEXTENT standard deviations.

Definition at line 217 of file cluster.cpp.

Typedef Documentation

◆ ClusterHeap

Definition at line 168 of file cluster.cpp.

◆ ClusterPair

Definition at line 167 of file cluster.cpp.

◆ DENSITYFUNC

typedef FLOAT64(* DENSITYFUNC) (inT32)

Definition at line 202 of file cluster.cpp.

◆ SOLVEFUNC

typedef FLOAT64(* SOLVEFUNC) (CHISTRUCT *, double)

Definition at line 203 of file cluster.cpp.

Function Documentation

◆ AdjustBuckets()

void AdjustBuckets ( BUCKETS Buckets,
uinT32  NewSampleCount 
)

This routine multiplies each ExpectedCount histogram entry by NewSampleCount/OldSampleCount so that the histogram is now adjusted to the new sample count.

Parameters
Bucketshistogram data structure to adjust
NewSampleCountnew sample count to adjust to
Returns
none
Note
Exceptions: none
History: Thu Aug 3 14:31:14 1989, DSJ, Created.

Definition at line 2260 of file cluster.cpp.

2260  {
2261  int i;
2262  FLOAT64 AdjustFactor;
2263 
2264  AdjustFactor = (((FLOAT64) NewSampleCount) /
2265  ((FLOAT64) Buckets->SampleCount));
2266 
2267  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2268  Buckets->ExpectedCount[i] *= AdjustFactor;
2269  }
2270 
2271  Buckets->SampleCount = NewSampleCount;
2272 
2273 } // AdjustBuckets
uinT32 SampleCount
Definition: cluster.cpp:179
FLOAT32 * ExpectedCount
Definition: cluster.cpp:185
uinT16 NumberOfBuckets
Definition: cluster.cpp:182
double FLOAT64
Definition: host.h:43

◆ AlphaMatch()

int AlphaMatch ( void *  arg1,
void *  arg2 
)

This routine is used to search a list of structures which hold pre-computed chi-squared values for a chi-squared value whose corresponding alpha field matches the alpha field of SearchKey.

It is called by the list search routines.

Parameters
arg1chi-squared struct being tested for a match
arg2chi-squared struct that is the search key
Returns
TRUE if ChiStruct's Alpha matches SearchKey's Alpha
Note
Exceptions: none
History: Thu Aug 3 14:17:33 1989, DSJ, Created.

Definition at line 2306 of file cluster.cpp.

2307  { //CHISTRUCT *SearchKey)
2308  CHISTRUCT *ChiStruct = (CHISTRUCT *) arg1;
2309  CHISTRUCT *SearchKey = (CHISTRUCT *) arg2;
2310 
2311  return (ChiStruct->Alpha == SearchKey->Alpha);
2312 
2313 } // AlphaMatch
FLOAT64 Alpha
Definition: cluster.cpp:190

◆ ChiArea()

FLOAT64 ChiArea ( CHISTRUCT ChiParams,
FLOAT64  x 
)

This routine computes the area under a chi density curve from 0 to x, minus the desired area under the curve. The number of degrees of freedom of the chi curve is specified in the ChiParams structure. The desired area is also specified in the ChiParams structure as Alpha ( or 1 minus the desired area ). This routine is intended to be passed to the Solve() function to find the value of chi-squared which will yield a desired area under the right tail of the chi density curve. The function will only work for even degrees of freedom. The equations are based on integrating the chi density curve in parts to obtain a series that can be used to compute the area under the curve.

Parameters
ChiParamscontains degrees of freedom and alpha
xvalue of chi-squared to evaluate
Returns
Error between actual and desired area under the chi curve.
Note
Exceptions: none
History: Fri Aug 4 12:48:41 1989, DSJ, Created.

Definition at line 2418 of file cluster.cpp.

2418  {
2419  int i, N;
2420  FLOAT64 SeriesTotal;
2421  FLOAT64 Denominator;
2422  FLOAT64 PowerOfx;
2423 
2424  N = ChiParams->DegreesOfFreedom / 2 - 1;
2425  SeriesTotal = 1;
2426  Denominator = 1;
2427  PowerOfx = 1;
2428  for (i = 1; i <= N; i++) {
2429  Denominator *= 2 * i;
2430  PowerOfx *= x;
2431  SeriesTotal += PowerOfx / Denominator;
2432  }
2433  return ((SeriesTotal * exp (-0.5 * x)) - ChiParams->Alpha);
2434 
2435 } // ChiArea
uinT16 DegreesOfFreedom
Definition: cluster.cpp:189
FLOAT64 Alpha
Definition: cluster.cpp:190
double FLOAT64
Definition: host.h:43

◆ ClusterSamples()

LIST ClusterSamples ( CLUSTERER Clusterer,
CLUSTERCONFIG Config 
)

This routine first checks to see if the samples in this clusterer have already been clustered before; if so, it does not bother to recreate the cluster tree. It simply recomputes the prototypes based on the new Config info.

If the samples have not been clustered before, the samples in the KD tree are formed into a cluster tree and then the prototypes are computed from the cluster tree.

In either case this routine returns a pointer to a list of prototypes that best represent the samples given the constraints specified in Config.

Parameters
Clustererdata struct containing samples to be clustered
Configparameters which control clustering process
Returns
Pointer to a list of prototypes
Note
Exceptions: None
History: 5/29/89, DSJ, Created.

Definition at line 512 of file cluster.cpp.

512  {
513  //only create cluster tree if samples have never been clustered before
514  if (Clusterer->Root == NULL)
515  CreateClusterTree(Clusterer);
516 
517  //deallocate the old prototype list if one exists
518  FreeProtoList (&Clusterer->ProtoList);
519  Clusterer->ProtoList = NIL_LIST;
520 
521  //compute prototypes starting at the root node in the tree
522  ComputePrototypes(Clusterer, Config);
523  // We don't need the cluster pointers in the protos any more, so null them
524  // out, which makes it safe to delete the clusterer.
525  LIST proto_list = Clusterer->ProtoList;
526  iterate(proto_list) {
527  PROTOTYPE *proto = reinterpret_cast<PROTOTYPE *>(first_node(proto_list));
528  proto->Cluster = NULL;
529  }
530  return Clusterer->ProtoList;
531 } // ClusterSamples
LIST ProtoList
Definition: cluster.h:92
void CreateClusterTree(CLUSTERER *Clusterer)
Definition: cluster.cpp:698
#define NIL_LIST
Definition: oldlist.h:126
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:573
#define first_node(l)
Definition: oldlist.h:139
CLUSTER * Cluster
Definition: cluster.h:76
CLUSTER * Root
Definition: cluster.h:91
#define iterate(l)
Definition: oldlist.h:159
void ComputePrototypes(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:924

◆ ComputeChiSquared()

FLOAT64 ComputeChiSquared ( uinT16  DegreesOfFreedom,
FLOAT64  Alpha 
)

This routine computes the chi-squared value which will leave a cumulative probability of Alpha in the right tail of a chi-squared distribution with the specified number of degrees of freedom. Alpha must be between 0 and 1. DegreesOfFreedom must be even. The routine maintains an array of lists. Each list corresponds to a different number of degrees of freedom. Each entry in the list corresponds to a different alpha value and its corresponding chi-squared value. Therefore, once a particular chi-squared value is computed, it is stored in the list and never needs to be computed again.

Parameters
DegreesOfFreedomdetermines shape of distribution
Alphaprobability of right tail
Returns
Desired chi-squared value
Note
Exceptions: none
History: 6/5/89, DSJ, Created.

Definition at line 1869 of file cluster.cpp.

1872 {
1873  static LIST ChiWith[MAXDEGREESOFFREEDOM + 1];
1874 
1875  CHISTRUCT *OldChiSquared;
1876  CHISTRUCT SearchKey;
1877 
1878  // limit the minimum alpha that can be used - if alpha is too small
1879  // it may not be possible to compute chi-squared.
1880  Alpha = ClipToRange(Alpha, MINALPHA, 1.0);
1881  if (Odd (DegreesOfFreedom))
1882  DegreesOfFreedom++;
1883 
1884  /* find the list of chi-squared values which have already been computed
1885  for the specified number of degrees of freedom. Search the list for
1886  the desired chi-squared. */
1887  SearchKey.Alpha = Alpha;
1888  OldChiSquared = (CHISTRUCT *) first_node (search (ChiWith[DegreesOfFreedom],
1889  &SearchKey, AlphaMatch));
1890 
1891  if (OldChiSquared == NULL) {
1892  OldChiSquared = NewChiStruct (DegreesOfFreedom, Alpha);
1893  OldChiSquared->ChiSquared = Solve (ChiArea, OldChiSquared,
1894  (FLOAT64) DegreesOfFreedom,
1895  (FLOAT64) CHIACCURACY);
1896  ChiWith[DegreesOfFreedom] = push (ChiWith[DegreesOfFreedom],
1897  OldChiSquared);
1898  }
1899  else {
1900  // further optimization might move OldChiSquared to front of list
1901  }
1902 
1903  return (OldChiSquared->ChiSquared);
1904 
1905 } // ComputeChiSquared
#define Odd(N)
Definition: cluster.cpp:205
#define MINALPHA
FLOAT64 ChiSquared
Definition: cluster.cpp:191
FLOAT64 Solve(SOLVEFUNC Function, void *FunctionParams, FLOAT64 InitialGuess, FLOAT64 Accuracy)
Definition: cluster.cpp:2352
LIST search(LIST list, void *key, int_compare is_equal)
Definition: oldlist.cpp:406
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:122
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2205
#define first_node(l)
Definition: oldlist.h:139
FLOAT64 ChiArea(CHISTRUCT *ChiParams, FLOAT64 x)
Definition: cluster.cpp:2418
#define MAXDEGREESOFFREEDOM
Definition: cluster.cpp:228
#define CHIACCURACY
FLOAT64 Alpha
Definition: cluster.cpp:190
int AlphaMatch(void *arg1, void *arg2)
Definition: cluster.cpp:2306
LIST push(LIST list, void *element)
Definition: oldlist.cpp:317
double FLOAT64
Definition: host.h:43
CHISTRUCT * NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:2326

◆ ComputePrototypes()

void ComputePrototypes ( CLUSTERER Clusterer,
CLUSTERCONFIG Config 
)

This routine decides which clusters in the cluster tree should be represented by prototypes, forms a list of these prototypes, and places the list in the Clusterer data structure.

Parameters
Clustererdata structure holding cluster tree
Configparameters used to control prototype generation
Returns
None
Note
Exceptions: None
History: 5/30/89, DSJ, Created.

Definition at line 924 of file cluster.cpp.

924  {
925  LIST ClusterStack = NIL_LIST;
926  CLUSTER *Cluster;
927  PROTOTYPE *Prototype;
928 
929  // use a stack to keep track of clusters waiting to be processed
930  // initially the only cluster on the stack is the root cluster
931  if (Clusterer->Root != NULL)
932  ClusterStack = push (NIL_LIST, Clusterer->Root);
933 
934  // loop until we have analyzed all clusters which are potential prototypes
935  while (ClusterStack != NIL_LIST) {
936  // remove the next cluster to be analyzed from the stack
937  // try to make a prototype from the cluster
938  // if successful, put it on the proto list, else split the cluster
939  Cluster = (CLUSTER *) first_node (ClusterStack);
940  ClusterStack = pop (ClusterStack);
941  Prototype = MakePrototype(Clusterer, Config, Cluster);
942  if (Prototype != NULL) {
943  Clusterer->ProtoList = push (Clusterer->ProtoList, Prototype);
944  }
945  else {
946  ClusterStack = push (ClusterStack, Cluster->Right);
947  ClusterStack = push (ClusterStack, Cluster->Left);
948  }
949  }
950 } // ComputePrototypes
LIST ProtoList
Definition: cluster.h:92
struct sample * Left
Definition: cluster.h:36
#define NIL_LIST
Definition: oldlist.h:126
LIST pop(LIST list)
Definition: oldlist.cpp:299
PROTOTYPE * MakePrototype(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster)
Definition: cluster.cpp:969
struct sample * Right
Definition: cluster.h:37
#define first_node(l)
Definition: oldlist.h:139
CLUSTER * Root
Definition: cluster.h:91
Definition: cluster.h:32
LIST push(LIST list, void *element)
Definition: oldlist.cpp:317

◆ ComputeStatistics()

STATISTICS * ComputeStatistics ( inT16  N,
PARAM_DESC  ParamDesc[],
CLUSTER Cluster 
)

This routine searches the cluster tree for all leaf nodes which are samples in the specified cluster. It computes a full covariance matrix for these samples as well as keeping track of the ranges (min and max) for each dimension. A special data structure is allocated to return this information to the caller. An incremental algorithm for computing statistics is not used because it will not work with circular dimensions.

Parameters
Nnumber of dimensions
ParamDescarray of dimension descriptions
Clustercluster whose stats are to be computed
Returns
Pointer to new data structure containing statistics
Note
Exceptions: None
History: 6/2/89, DSJ, Created.

Definition at line 1412 of file cluster.cpp.

1412  {
1413  STATISTICS *Statistics;
1414  int i, j;
1415  FLOAT32 *CoVariance;
1416  FLOAT32 *Distance;
1417  LIST SearchState;
1418  SAMPLE *Sample;
1419  uinT32 SampleCountAdjustedForBias;
1420 
1421  // allocate memory to hold the statistics results
1422  Statistics = (STATISTICS *) Emalloc (sizeof (STATISTICS));
1423  Statistics->CoVariance = (FLOAT32 *) Emalloc (N * N * sizeof (FLOAT32));
1424  Statistics->Min = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1425  Statistics->Max = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1426 
1427  // allocate temporary memory to hold the sample to mean distances
1428  Distance = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1429 
1430  // initialize the statistics
1431  Statistics->AvgVariance = 1.0;
1432  CoVariance = Statistics->CoVariance;
1433  for (i = 0; i < N; i++) {
1434  Statistics->Min[i] = 0.0;
1435  Statistics->Max[i] = 0.0;
1436  for (j = 0; j < N; j++, CoVariance++)
1437  *CoVariance = 0;
1438  }
1439  // find each sample in the cluster and merge it into the statistics
1440  InitSampleSearch(SearchState, Cluster);
1441  while ((Sample = NextSample (&SearchState)) != NULL) {
1442  for (i = 0; i < N; i++) {
1443  Distance[i] = Sample->Mean[i] - Cluster->Mean[i];
1444  if (ParamDesc[i].Circular) {
1445  if (Distance[i] > ParamDesc[i].HalfRange)
1446  Distance[i] -= ParamDesc[i].Range;
1447  if (Distance[i] < -ParamDesc[i].HalfRange)
1448  Distance[i] += ParamDesc[i].Range;
1449  }
1450  if (Distance[i] < Statistics->Min[i])
1451  Statistics->Min[i] = Distance[i];
1452  if (Distance[i] > Statistics->Max[i])
1453  Statistics->Max[i] = Distance[i];
1454  }
1455  CoVariance = Statistics->CoVariance;
1456  for (i = 0; i < N; i++)
1457  for (j = 0; j < N; j++, CoVariance++)
1458  *CoVariance += Distance[i] * Distance[j];
1459  }
1460  // normalize the variances by the total number of samples
1461  // use SampleCount-1 instead of SampleCount to get an unbiased estimate
1462  // also compute the geometic mean of the diagonal variances
1463  // ensure that clusters with only 1 sample are handled correctly
1464  if (Cluster->SampleCount > 1)
1465  SampleCountAdjustedForBias = Cluster->SampleCount - 1;
1466  else
1467  SampleCountAdjustedForBias = 1;
1468  CoVariance = Statistics->CoVariance;
1469  for (i = 0; i < N; i++)
1470  for (j = 0; j < N; j++, CoVariance++) {
1471  *CoVariance /= SampleCountAdjustedForBias;
1472  if (j == i) {
1473  if (*CoVariance < MINVARIANCE)
1474  *CoVariance = MINVARIANCE;
1475  Statistics->AvgVariance *= *CoVariance;
1476  }
1477  }
1478  Statistics->AvgVariance = (float)pow((double)Statistics->AvgVariance,
1479  1.0 / N);
1480 
1481  // release temporary memory and return
1482  free(Distance);
1483  return (Statistics);
1484 } // ComputeStatistics
FLOAT32 * CoVariance
Definition: cluster.cpp:172
FLOAT32 Mean[1]
Definition: cluster.h:39
void * Emalloc(int Size)
Definition: emalloc.cpp:47
FLOAT32 Range
Definition: ocrfeatures.h:51
#define InitSampleSearch(S, C)
Definition: cluster.h:105
#define MINVARIANCE
Definition: cluster.cpp:141
uint32_t uinT32
Definition: host.h:39
FLOAT32 * Min
Definition: cluster.cpp:173
FLOAT32 AvgVariance
Definition: cluster.cpp:171
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:620
float FLOAT32
Definition: host.h:42
FLOAT32 * Max
Definition: cluster.cpp:174
unsigned SampleCount
Definition: cluster.h:35
Definition: cluster.h:32

◆ CreateClusterTree()

void CreateClusterTree ( CLUSTERER Clusterer)

This routine performs a bottoms-up clustering on the samples held in the kd-tree of the Clusterer data structure. The result is a cluster tree. Each node in the tree represents a cluster which conceptually contains a subset of the samples. More precisely, the cluster contains all of the samples which are contained in its two sub-clusters. The leaves of the tree are the individual samples themselves; they have no sub-clusters. The root node of the tree conceptually contains all of the samples.

Parameters
Clustererdata structure holdings samples to be clustered
Returns
None (the Clusterer data structure is changed)
Note
Exceptions: None
History: 5/29/89, DSJ, Created.

Definition at line 698 of file cluster.cpp.

698  {
699  ClusteringContext context;
700  ClusterPair HeapEntry;
701  TEMPCLUSTER *PotentialCluster;
702 
703  // each sample and its nearest neighbor form a "potential" cluster
704  // save these in a heap with the "best" potential clusters on top
705  context.tree = Clusterer->KDTree;
706  context.candidates = (TEMPCLUSTER *)
707  Emalloc(Clusterer->NumberOfSamples * sizeof(TEMPCLUSTER));
708  context.next = 0;
709  context.heap = new ClusterHeap(Clusterer->NumberOfSamples);
710  KDWalk(context.tree, (void_proc)MakePotentialClusters, &context);
711 
712  // form potential clusters into actual clusters - always do "best" first
713  while (context.heap->Pop(&HeapEntry)) {
714  PotentialCluster = HeapEntry.data;
715 
716  // if main cluster of potential cluster is already in another cluster
717  // then we don't need to worry about it
718  if (PotentialCluster->Cluster->Clustered) {
719  continue;
720  }
721 
722  // if main cluster is not yet clustered, but its nearest neighbor is
723  // then we must find a new nearest neighbor
724  else if (PotentialCluster->Neighbor->Clustered) {
725  PotentialCluster->Neighbor =
726  FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
727  &HeapEntry.key);
728  if (PotentialCluster->Neighbor != NULL) {
729  context.heap->Push(&HeapEntry);
730  }
731  }
732 
733  // if neither cluster is already clustered, form permanent cluster
734  else {
735  PotentialCluster->Cluster =
736  MakeNewCluster(Clusterer, PotentialCluster);
737  PotentialCluster->Neighbor =
738  FindNearestNeighbor(context.tree, PotentialCluster->Cluster,
739  &HeapEntry.key);
740  if (PotentialCluster->Neighbor != NULL) {
741  context.heap->Push(&HeapEntry);
742  }
743  }
744  }
745 
746  // the root node in the cluster tree is now the only node in the kd-tree
747  Clusterer->Root = (CLUSTER *) RootOf(Clusterer->KDTree);
748 
749  // free up the memory used by the K-D tree, heap, and temp clusters
750  FreeKDTree(context.tree);
751  Clusterer->KDTree = NULL;
752  delete context.heap;
753  free(context.candidates);
754 } // CreateClusterTree
ClusterHeap * heap
Definition: cluster.cpp:196
unsigned Clustered
Definition: cluster.h:33
void KDWalk(KDTREE *Tree, void_proc action, void *context)
Definition: kdtree.cpp:330
void Push(Pair *entry)
Definition: genericheap.h:95
void MakePotentialClusters(ClusteringContext *context, CLUSTER *Cluster, inT32 Level)
Definition: cluster.cpp:765
void * Emalloc(int Size)
Definition: emalloc.cpp:47
CLUSTER * MakeNewCluster(CLUSTERER *Clusterer, TEMPCLUSTER *TempCluster)
Definition: cluster.cpp:836
inT32 NumberOfSamples
Definition: cluster.h:89
CLUSTER * Neighbor
Definition: cluster.cpp:164
CLUSTER * Cluster
Definition: cluster.cpp:163
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:349
TEMPCLUSTER * candidates
Definition: cluster.cpp:197
#define RootOf(T)
Definition: kdtree.h:58
CLUSTER * Root
Definition: cluster.h:91
KDTREE * KDTree
Definition: cluster.h:90
Definition: cluster.h:32
tesseract::GenericHeap< ClusterPair > ClusterHeap
Definition: cluster.cpp:168
bool Pop(Pair *entry)
Definition: genericheap.h:118
void(* void_proc)(...)
Definition: cutil.h:66
CLUSTER * FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, FLOAT32 *Distance)
Definition: cluster.cpp:798

◆ DegreesOfFreedom()

uinT16 DegreesOfFreedom ( DISTRIBUTION  Distribution,
uinT16  HistogramBuckets 
)

This routine computes the degrees of freedom that should be used in a chi-squared test with the specified number of histogram buckets. The result is always rounded up to the next even number so that the value of chi-squared can be computed more easily. This will cause the value of chi-squared to be higher than the optimum value, resulting in the chi-square test being more lenient than optimum.

Parameters
Distributiondistribution being tested for
HistogramBucketsnumber of buckets in chi-square test
Returns
The number of degrees of freedom for a chi-square test
Note
Exceptions: none
History: Thu Aug 3 14:04:18 1989, DSJ, Created.

Definition at line 2205 of file cluster.cpp.

2205  {
2206  static uinT8 DegreeOffsets[] = { 3, 3, 1 };
2207 
2208  uinT16 AdjustedNumBuckets;
2209 
2210  AdjustedNumBuckets = HistogramBuckets - DegreeOffsets[(int) Distribution];
2211  if (Odd (AdjustedNumBuckets))
2212  AdjustedNumBuckets++;
2213  return (AdjustedNumBuckets);
2214 
2215 } // DegreesOfFreedom
#define Odd(N)
Definition: cluster.cpp:205
typedef int(ZCALLBACK *close_file_func) OF((voidpf opaque
uint8_t uinT8
Definition: host.h:35
uint16_t uinT16
Definition: host.h:37

◆ DistributionOK()

BOOL8 DistributionOK ( BUCKETS Buckets)

This routine performs a chi-square goodness of fit test on the histogram data in the Buckets data structure. TRUE is returned if the histogram matches the probability distribution which was specified when the Buckets structure was originally created. Otherwise FALSE is returned.

Parameters
Bucketshistogram data to perform chi-square test on
Returns
TRUE if samples match distribution, FALSE otherwise
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 2125 of file cluster.cpp.

2125  {
2126  FLOAT32 FrequencyDifference;
2127  FLOAT32 TotalDifference;
2128  int i;
2129 
2130  // compute how well the histogram matches the expected histogram
2131  TotalDifference = 0.0;
2132  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2133  FrequencyDifference = Buckets->Count[i] - Buckets->ExpectedCount[i];
2134  TotalDifference += (FrequencyDifference * FrequencyDifference) /
2135  Buckets->ExpectedCount[i];
2136  }
2137 
2138  // test to see if the difference is more than expected
2139  if (TotalDifference > Buckets->ChiSquared)
2140  return FALSE;
2141  else
2142  return TRUE;
2143 } // DistributionOK
#define TRUE
Definition: capi.h:45
uinT32 * Count
Definition: cluster.cpp:184
#define FALSE
Definition: capi.h:46
FLOAT32 * ExpectedCount
Definition: cluster.cpp:185
uinT16 NumberOfBuckets
Definition: cluster.cpp:182
float FLOAT32
Definition: host.h:42
FLOAT64 ChiSquared
Definition: cluster.cpp:181

◆ FillBuckets()

void FillBuckets ( BUCKETS Buckets,
CLUSTER Cluster,
uinT16  Dim,
PARAM_DESC ParamDesc,
FLOAT32  Mean,
FLOAT32  StdDev 
)

This routine counts the number of cluster samples which fall within the various histogram buckets in Buckets. Only one dimension of each sample is examined. The exact meaning of the Mean and StdDev parameters depends on the distribution which is being analyzed (this info is in the Buckets data structure). For normal distributions, Mean and StdDev have the expected meanings. For uniform and random distributions the Mean is the center point of the range and the StdDev is 1/2 the range. A dimension with zero standard deviation cannot be statistically analyzed. In this case, a pseudo-analysis is used.

Parameters
Bucketshistogram buckets to count samples
Clustercluster whose samples are being analyzed
Dimdimension of samples which is being analyzed
ParamDescdescription of the dimension
Mean"mean" of the distribution
StdDev"standard deviation" of the distribution
Returns
None (the Buckets data structure is filled in)
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 1984 of file cluster.cpp.

1989  {
1990  uinT16 BucketID;
1991  int i;
1992  LIST SearchState;
1993  SAMPLE *Sample;
1994 
1995  // initialize the histogram bucket counts to 0
1996  for (i = 0; i < Buckets->NumberOfBuckets; i++)
1997  Buckets->Count[i] = 0;
1998 
1999  if (StdDev == 0.0) {
2000  /* if the standard deviation is zero, then we can't statistically
2001  analyze the cluster. Use a pseudo-analysis: samples exactly on
2002  the mean are distributed evenly across all buckets. Samples greater
2003  than the mean are placed in the last bucket; samples less than the
2004  mean are placed in the first bucket. */
2005 
2006  InitSampleSearch(SearchState, Cluster);
2007  i = 0;
2008  while ((Sample = NextSample (&SearchState)) != NULL) {
2009  if (Sample->Mean[Dim] > Mean)
2010  BucketID = Buckets->NumberOfBuckets - 1;
2011  else if (Sample->Mean[Dim] < Mean)
2012  BucketID = 0;
2013  else
2014  BucketID = i;
2015  Buckets->Count[BucketID] += 1;
2016  i++;
2017  if (i >= Buckets->NumberOfBuckets)
2018  i = 0;
2019  }
2020  }
2021  else {
2022  // search for all samples in the cluster and add to histogram buckets
2023  InitSampleSearch(SearchState, Cluster);
2024  while ((Sample = NextSample (&SearchState)) != NULL) {
2025  switch (Buckets->Distribution) {
2026  case normal:
2027  BucketID = NormalBucket (ParamDesc, Sample->Mean[Dim],
2028  Mean, StdDev);
2029  break;
2030  case D_random:
2031  case uniform:
2032  BucketID = UniformBucket (ParamDesc, Sample->Mean[Dim],
2033  Mean, StdDev);
2034  break;
2035  default:
2036  BucketID = 0;
2037  }
2038  Buckets->Count[Buckets->Bucket[BucketID]] += 1;
2039  }
2040  }
2041 } // FillBuckets
FLOAT32 Mean[1]
Definition: cluster.h:39
uinT32 * Count
Definition: cluster.cpp:184
DISTRIBUTION Distribution
Definition: cluster.cpp:178
#define InitSampleSearch(S, C)
Definition: cluster.h:105
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:644
uinT16 NumberOfBuckets
Definition: cluster.cpp:182
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:620
uinT16 NormalBucket(PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2056
Definition: cluster.h:59
uinT16 UniformBucket(PARAM_DESC *ParamDesc, FLOAT32 x, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:2091
Definition: cluster.h:32
uint16_t uinT16
Definition: host.h:37
uinT16 Bucket[BUCKETTABLESIZE]
Definition: cluster.cpp:183

◆ FindNearestNeighbor()

CLUSTER * FindNearestNeighbor ( KDTREE Tree,
CLUSTER Cluster,
FLOAT32 Distance 
)

This routine searches the specified kd-tree for the nearest neighbor of the specified cluster. It actually uses the kd routines to find the 2 nearest neighbors since one of them will be the original cluster. A pointer to the nearest neighbor is returned, if it can be found, otherwise NULL is returned. The distance between the 2 nodes is placed in the specified variable.

Parameters
Treekd-tree to search in for nearest neighbor
Clustercluster whose nearest neighbor is to be found
Distanceptr to variable to report distance found
Returns
Pointer to the nearest neighbor of Cluster, or NULL
Note
Exceptions: none
History: 5/29/89, DSJ, Created. 7/13/89, DSJ, Removed visibility of kd-tree node data struct

Definition at line 798 of file cluster.cpp.

801 {
802  CLUSTER *Neighbor[MAXNEIGHBORS];
803  FLOAT32 Dist[MAXNEIGHBORS];
804  int NumberOfNeighbors;
805  inT32 i;
806  CLUSTER *BestNeighbor;
807 
808  // find the 2 nearest neighbors of the cluster
810  &NumberOfNeighbors, (void **)Neighbor, Dist);
811 
812  // search for the nearest neighbor that is not the cluster itself
813  *Distance = MAXDISTANCE;
814  BestNeighbor = NULL;
815  for (i = 0; i < NumberOfNeighbors; i++) {
816  if ((Dist[i] < *Distance) && (Neighbor[i] != Cluster)) {
817  *Distance = Dist[i];
818  BestNeighbor = Neighbor[i];
819  }
820  }
821  return BestNeighbor;
822 } // FindNearestNeighbor
int32_t inT32
Definition: host.h:38
FLOAT32 Mean[1]
Definition: cluster.h:39
#define MAXDISTANCE
void KDNearestNeighborSearch(KDTREE *Tree, FLOAT32 Query[], int QuerySize, FLOAT32 MaxDistance, int *NumberOfResults, void **NBuffer, FLOAT32 DBuffer[])
Definition: kdtree.cpp:320
float FLOAT32
Definition: host.h:42
#define MAXNEIGHBORS
Definition: cluster.h:32

◆ FreeBuckets()

void FreeBuckets ( BUCKETS buckets)

This routine properly frees the memory used by a BUCKETS.

Parameters
bucketspointer to data structure to be freed

Definition at line 2165 of file cluster.cpp.

2165  {
2166  Efree(buckets->Count);
2167  Efree(buckets->ExpectedCount);
2168  Efree(buckets);
2169 } // FreeBuckets
uinT32 * Count
Definition: cluster.cpp:184
FLOAT32 * ExpectedCount
Definition: cluster.cpp:185
void Efree(void *ptr)
Definition: emalloc.cpp:79

◆ FreeCluster()

void FreeCluster ( CLUSTER Cluster)

This routine frees the memory consumed by the specified cluster and all of its subclusters. This is done by recursive calls to FreeCluster().

Parameters
Clusterpointer to cluster to be freed
Returns
None
Note
Exceptions: None
History: 6/6/89, DSJ, Created.

Definition at line 2183 of file cluster.cpp.

2183  {
2184  if (Cluster != NULL) {
2185  FreeCluster (Cluster->Left);
2186  FreeCluster (Cluster->Right);
2187  free(Cluster);
2188  }
2189 } // FreeCluster
struct sample * Left
Definition: cluster.h:36
struct sample * Right
Definition: cluster.h:37
void FreeCluster(CLUSTER *Cluster)
Definition: cluster.cpp:2183

◆ FreeClusterer()

void FreeClusterer ( CLUSTERER Clusterer)

This routine frees all of the memory allocated to the specified data structure. It will not, however, free the memory used by the prototype list. The pointers to the clusters for each prototype in the list will be set to NULL to indicate that the cluster data structures no longer exist. Any sample lists that have been obtained via calls to GetSamples are no longer valid.

Parameters
Clustererpointer to data structure to be freed
Returns
None
Note
Exceptions: None
History: 6/6/89, DSJ, Created.

Definition at line 546 of file cluster.cpp.

546  {
547  if (Clusterer != NULL) {
548  free(Clusterer->ParamDesc);
549  if (Clusterer->KDTree != NULL)
550  FreeKDTree (Clusterer->KDTree);
551  if (Clusterer->Root != NULL)
552  FreeCluster (Clusterer->Root);
553  // Free up all used buckets structures.
554  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
555  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
556  if (Clusterer->bucket_cache[d][c] != NULL)
557  FreeBuckets(Clusterer->bucket_cache[d][c]);
558  }
559 
560  free(Clusterer);
561  }
562 } // FreeClusterer
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]
Definition: cluster.h:95
#define MAXBUCKETS
Definition: cluster.h:27
#define MINBUCKETS
Definition: cluster.h:26
PARAM_DESC * ParamDesc
Definition: cluster.h:88
void FreeCluster(CLUSTER *Cluster)
Definition: cluster.cpp:2183
void FreeKDTree(KDTREE *Tree)
Definition: kdtree.cpp:349
CLUSTER * Root
Definition: cluster.h:91
KDTREE * KDTree
Definition: cluster.h:90
void FreeBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2165

◆ FreeProtoList()

void FreeProtoList ( LIST ProtoList)

This routine frees all of the memory allocated to the specified list of prototypes. The clusters which are pointed to by the prototypes are not freed.

Parameters
ProtoListpointer to list of prototypes to be freed
Returns
None
Note
Exceptions: None
History: 6/6/89, DSJ, Created.

Definition at line 573 of file cluster.cpp.

573  {
574  destroy_nodes(*ProtoList, FreePrototype);
575 } // FreeProtoList
void destroy_nodes(LIST list, void_dest destructor)
Definition: oldlist.cpp:199
void FreePrototype(void *arg)
Definition: cluster.cpp:587

◆ FreePrototype()

void FreePrototype ( void *  arg)

This routine deallocates the memory consumed by the specified prototype and modifies the corresponding cluster so that it is no longer marked as a prototype. The cluster is NOT deallocated by this routine.

Parameters
argprototype data structure to be deallocated
Returns
None
Note
Exceptions: None
History: 5/30/89, DSJ, Created.

Definition at line 587 of file cluster.cpp.

587  { //PROTOTYPE *Prototype)
588  PROTOTYPE *Prototype = (PROTOTYPE *) arg;
589 
590  // unmark the corresponding cluster (if there is one
591  if (Prototype->Cluster != NULL)
592  Prototype->Cluster->Prototype = FALSE;
593 
594  // deallocate the prototype statistics and then the prototype itself
595  free (Prototype->Distrib);
596  free (Prototype->Mean);
597  if (Prototype->Style != spherical) {
598  free (Prototype->Variance.Elliptical);
599  free (Prototype->Magnitude.Elliptical);
600  free (Prototype->Weight.Elliptical);
601  }
602  free(Prototype);
603 } // FreePrototype
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 * Elliptical
Definition: cluster.h:64
FLOATUNION Weight
Definition: cluster.h:83
FLOATUNION Variance
Definition: cluster.h:81
unsigned Style
Definition: cluster.h:74
#define FALSE
Definition: capi.h:46
FLOAT32 * Mean
Definition: cluster.h:78
unsigned Prototype
Definition: cluster.h:34
CLUSTER * Cluster
Definition: cluster.h:76
FLOATUNION Magnitude
Definition: cluster.h:82

◆ FreeStatistics()

void FreeStatistics ( STATISTICS Statistics)

This routine frees the memory used by the statistics data structure.

Parameters
Statisticspointer to data structure to be freed
Returns
None
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 2153 of file cluster.cpp.

2153  {
2154  free(Statistics->CoVariance);
2155  free(Statistics->Min);
2156  free(Statistics->Max);
2157  free(Statistics);
2158 } // FreeStatistics
FLOAT32 * CoVariance
Definition: cluster.cpp:172
FLOAT32 * Min
Definition: cluster.cpp:173
FLOAT32 * Max
Definition: cluster.cpp:174

◆ GetBuckets()

BUCKETS * GetBuckets ( CLUSTERER clusterer,
DISTRIBUTION  Distribution,
uinT32  SampleCount,
FLOAT64  Confidence 
)

This routine returns a histogram data structure which can be used by other routines to place samples into histogram buckets, and then apply a goodness of fit test to the histogram data to determine if the samples belong to the specified probability distribution. The routine keeps a list of bucket data structures which have already been created so that it minimizes the computation time needed to create a new bucket.

Parameters
clustererwhich keeps a bucket_cache for us.
Distributiontype of probability distribution to test for
SampleCountnumber of samples that are available
Confidenceprobability of a Type I error
Returns
Bucket data structure
Note
Exceptions: none
History: Thu Aug 3 12:58:10 1989, DSJ, Created.

Definition at line 1688 of file cluster.cpp.

1691  {
1692  // Get an old bucket structure with the same number of buckets.
1693  uinT16 NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
1694  BUCKETS *Buckets =
1695  clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS];
1696 
1697  // If a matching bucket structure is not found, make one and save it.
1698  if (Buckets == NULL) {
1699  Buckets = MakeBuckets(Distribution, SampleCount, Confidence);
1700  clusterer->bucket_cache[Distribution][NumberOfBuckets - MINBUCKETS] =
1701  Buckets;
1702  } else {
1703  // Just adjust the existing buckets.
1704  if (SampleCount != Buckets->SampleCount)
1705  AdjustBuckets(Buckets, SampleCount);
1706  if (Confidence != Buckets->Confidence) {
1707  Buckets->Confidence = Confidence;
1708  Buckets->ChiSquared = ComputeChiSquared(
1709  DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets),
1710  Confidence);
1711  }
1712  InitBuckets(Buckets);
1713  }
1714  return Buckets;
1715 } // GetBuckets
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]
Definition: cluster.h:95
void InitBuckets(BUCKETS *Buckets)
Definition: cluster.cpp:2283
void AdjustBuckets(BUCKETS *Buckets, uinT32 NewSampleCount)
Definition: cluster.cpp:2260
BUCKETS * MakeBuckets(DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1735
uinT16 OptimumNumberOfBuckets(uinT32 SampleCount)
Definition: cluster.cpp:1832
#define MINBUCKETS
Definition: cluster.h:26
FLOAT64 Confidence
Definition: cluster.cpp:180
uinT32 SampleCount
Definition: cluster.cpp:179
uinT16 NumberOfBuckets
Definition: cluster.cpp:182
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2205
uint16_t uinT16
Definition: host.h:37
FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:1869
FLOAT64 ChiSquared
Definition: cluster.cpp:181

◆ Independent()

BOOL8 Independent ( PARAM_DESC  ParamDesc[],
inT16  N,
FLOAT32 CoVariance,
FLOAT32  Independence 
)

This routine returns TRUE if the specified covariance matrix indicates that all N dimensions are independent of one another. One dimension is judged to be independent of another when the magnitude of the corresponding correlation coefficient is less than the specified Independence factor. The correlation coefficient is calculated as: (see Duda and Hart, pg. 247) coeff[ij] = stddev[ij] / sqrt (stddev[ii] * stddev[jj]) The covariance matrix is assumed to be symmetric (which should always be true).

Parameters
ParamDescdescriptions of each feature space dimension
Nnumber of dimensions
CoVarianceptr to a covariance matrix
Independencemax off-diagonal correlation coefficient
Returns
TRUE if dimensions are independent, FALSE otherwise
Note
Exceptions: None
History: 6/4/89, DSJ, Created.

Definition at line 1641 of file cluster.cpp.

1642  {
1643  int i, j;
1644  FLOAT32 *VARii; // points to ith on-diagonal element
1645  FLOAT32 *VARjj; // points to jth on-diagonal element
1646  FLOAT32 CorrelationCoeff;
1647 
1648  VARii = CoVariance;
1649  for (i = 0; i < N; i++, VARii += N + 1) {
1650  if (ParamDesc[i].NonEssential)
1651  continue;
1652 
1653  VARjj = VARii + N + 1;
1654  CoVariance = VARii + 1;
1655  for (j = i + 1; j < N; j++, CoVariance++, VARjj += N + 1) {
1656  if (ParamDesc[j].NonEssential)
1657  continue;
1658 
1659  if ((*VARii == 0.0) || (*VARjj == 0.0))
1660  CorrelationCoeff = 0.0;
1661  else
1662  CorrelationCoeff =
1663  sqrt (sqrt (*CoVariance * *CoVariance / (*VARii * *VARjj)));
1664  if (CorrelationCoeff > Independence)
1665  return (FALSE);
1666  }
1667  }
1668  return (TRUE);
1669 } // Independent
#define TRUE
Definition: capi.h:45
#define FALSE
Definition: capi.h:46
float FLOAT32
Definition: host.h:42

◆ InitBuckets()

void InitBuckets ( BUCKETS Buckets)

This routine sets the bucket counts in the specified histogram to zero.

Parameters
Bucketshistogram data structure to init
Returns
none
Note
Exceptions: none
History: Thu Aug 3 14:31:14 1989, DSJ, Created.

Definition at line 2283 of file cluster.cpp.

2283  {
2284  int i;
2285 
2286  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
2287  Buckets->Count[i] = 0;
2288  }
2289 
2290 } // InitBuckets
uinT32 * Count
Definition: cluster.cpp:184
uinT16 NumberOfBuckets
Definition: cluster.cpp:182

◆ Integral()

FLOAT64 Integral ( FLOAT64  f1,
FLOAT64  f2,
FLOAT64  Dx 
)

This routine computes a trapezoidal approximation to the integral of a function over a small delta in x.

Parameters
f1value of function at x1
f2value of function at x2
Dxx2 - x1 (should always be positive)
Returns
Approximation of the integral of the function from x1 to x2.
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 1958 of file cluster.cpp.

1958  {
1959  return (f1 + f2) * Dx / 2.0;
1960 } // Integral

◆ InvertMatrix()

double InvertMatrix ( const float *  input,
int  size,
float *  inv 
)

Compute the inverse of a matrix using LU decomposition with partial pivoting. The return value is the sum of norms of the off-diagonal terms of the product of a and inv. (A measure of the error.)

Definition at line 2521 of file cluster.cpp.

2521  {
2522  // Allocate memory for the 2D arrays.
2524  GENERIC_2D_ARRAY<double> U_inv(size, size, 0.0);
2526 
2527  // Initialize the working matrices. U starts as input, L as I and U_inv as O.
2528  int row;
2529  int col;
2530  for (row = 0; row < size; row++) {
2531  for (col = 0; col < size; col++) {
2532  U[row][col] = input[row*size + col];
2533  L[row][col] = row == col ? 1.0 : 0.0;
2534  U_inv[row][col] = 0.0;
2535  }
2536  }
2537 
2538  // Compute forward matrix by inversion by LU decomposition of input.
2539  for (col = 0; col < size; ++col) {
2540  // Find best pivot
2541  int best_row = 0;
2542  double best_pivot = -1.0;
2543  for (row = col; row < size; ++row) {
2544  if (Abs(U[row][col]) > best_pivot) {
2545  best_pivot = Abs(U[row][col]);
2546  best_row = row;
2547  }
2548  }
2549  // Exchange pivot rows.
2550  if (best_row != col) {
2551  for (int k = 0; k < size; ++k) {
2552  double tmp = U[best_row][k];
2553  U[best_row][k] = U[col][k];
2554  U[col][k] = tmp;
2555  tmp = L[best_row][k];
2556  L[best_row][k] = L[col][k];
2557  L[col][k] = tmp;
2558  }
2559  }
2560  // Now do the pivot itself.
2561  for (row = col + 1; row < size; ++row) {
2562  double ratio = -U[row][col] / U[col][col];
2563  for (int j = col; j < size; ++j) {
2564  U[row][j] += U[col][j] * ratio;
2565  }
2566  for (int k = 0; k < size; ++k) {
2567  L[row][k] += L[col][k] * ratio;
2568  }
2569  }
2570  }
2571  // Next invert U.
2572  for (col = 0; col < size; ++col) {
2573  U_inv[col][col] = 1.0 / U[col][col];
2574  for (row = col - 1; row >= 0; --row) {
2575  double total = 0.0;
2576  for (int k = col; k > row; --k) {
2577  total += U[row][k] * U_inv[k][col];
2578  }
2579  U_inv[row][col] = -total / U[row][row];
2580  }
2581  }
2582  // Now the answer is U_inv.L.
2583  for (row = 0; row < size; row++) {
2584  for (col = 0; col < size; col++) {
2585  double sum = 0.0;
2586  for (int k = row; k < size; ++k) {
2587  sum += U_inv[row][k] * L[k][col];
2588  }
2589  inv[row*size + col] = sum;
2590  }
2591  }
2592  // Check matrix product.
2593  double error_sum = 0.0;
2594  for (row = 0; row < size; row++) {
2595  for (col = 0; col < size; col++) {
2596  double sum = 0.0;
2597  for (int k = 0; k < size; ++k) {
2598  sum += input[row*size + k] * inv[k *size + col];
2599  }
2600  if (row != col) {
2601  error_sum += Abs(sum);
2602  }
2603  }
2604  }
2605  return error_sum;
2606 }
#define Abs(N)
Definition: cluster.cpp:207
voidpf void uLong size
Definition: ioapi.h:39

◆ ListEntryMatch()

int ListEntryMatch ( void *  arg1,
void *  arg2 
)

This routine is used to search a list for a list node whose contents match Key. It is called by the list delete_d routine.

Returns
TRUE if ListNode matches Key
Note
Exceptions: none
History: Thu Aug 3 14:23:58 1989, DSJ, Created.

Definition at line 2244 of file cluster.cpp.

2245  { //Key
2246  return (arg1 == arg2);
2247 
2248 } // ListEntryMatch

◆ MakeBuckets()

BUCKETS * MakeBuckets ( DISTRIBUTION  Distribution,
uinT32  SampleCount,
FLOAT64  Confidence 
)

This routine creates a histogram data structure which can be used by other routines to place samples into histogram buckets, and then apply a goodness of fit test to the histogram data to determine if the samples belong to the specified probability distribution. The buckets are allocated in such a way that the expected frequency of samples in each bucket is approximately the same. In order to make this possible, a mapping table is computed which maps "normalized" samples into the appropriate bucket.

Parameters
Distributiontype of probability distribution to test for
SampleCountnumber of samples that are available
Confidenceprobability of a Type I error
Returns
Pointer to new histogram data structure
Note
Exceptions: None
History: 6/4/89, DSJ, Created.

Definition at line 1735 of file cluster.cpp.

1737  {
1738  const DENSITYFUNC DensityFunction[] =
1739  { NormalDensity, UniformDensity, UniformDensity };
1740  int i, j;
1741  BUCKETS *Buckets;
1742  FLOAT64 BucketProbability;
1743  FLOAT64 NextBucketBoundary;
1744  FLOAT64 Probability;
1745  FLOAT64 ProbabilityDelta;
1746  FLOAT64 LastProbDensity;
1747  FLOAT64 ProbDensity;
1748  uinT16 CurrentBucket;
1749  BOOL8 Symmetrical;
1750 
1751  // allocate memory needed for data structure
1752  Buckets = static_cast<BUCKETS*>(Emalloc(sizeof(BUCKETS)));
1753  Buckets->NumberOfBuckets = OptimumNumberOfBuckets(SampleCount);
1754  Buckets->SampleCount = SampleCount;
1755  Buckets->Confidence = Confidence;
1756  Buckets->Count = static_cast<uinT32*>(
1757  Emalloc(Buckets->NumberOfBuckets * sizeof(uinT32)));
1758  Buckets->ExpectedCount = static_cast<FLOAT32*>(
1759  Emalloc(Buckets->NumberOfBuckets * sizeof(FLOAT32)));
1760 
1761  // initialize simple fields
1762  Buckets->Distribution = Distribution;
1763  for (i = 0; i < Buckets->NumberOfBuckets; i++) {
1764  Buckets->Count[i] = 0;
1765  Buckets->ExpectedCount[i] = 0.0;
1766  }
1767 
1768  // all currently defined distributions are symmetrical
1769  Symmetrical = TRUE;
1770  Buckets->ChiSquared = ComputeChiSquared(
1771  DegreesOfFreedom(Distribution, Buckets->NumberOfBuckets), Confidence);
1772 
1773  if (Symmetrical) {
1774  // allocate buckets so that all have approx. equal probability
1775  BucketProbability = 1.0 / (FLOAT64) (Buckets->NumberOfBuckets);
1776 
1777  // distribution is symmetric so fill in upper half then copy
1778  CurrentBucket = Buckets->NumberOfBuckets / 2;
1779  if (Odd (Buckets->NumberOfBuckets))
1780  NextBucketBoundary = BucketProbability / 2;
1781  else
1782  NextBucketBoundary = BucketProbability;
1783 
1784  Probability = 0.0;
1785  LastProbDensity =
1786  (*DensityFunction[(int) Distribution]) (BUCKETTABLESIZE / 2);
1787  for (i = BUCKETTABLESIZE / 2; i < BUCKETTABLESIZE; i++) {
1788  ProbDensity = (*DensityFunction[(int) Distribution]) (i + 1);
1789  ProbabilityDelta = Integral (LastProbDensity, ProbDensity, 1.0);
1790  Probability += ProbabilityDelta;
1791  if (Probability > NextBucketBoundary) {
1792  if (CurrentBucket < Buckets->NumberOfBuckets - 1)
1793  CurrentBucket++;
1794  NextBucketBoundary += BucketProbability;
1795  }
1796  Buckets->Bucket[i] = CurrentBucket;
1797  Buckets->ExpectedCount[CurrentBucket] +=
1798  (FLOAT32) (ProbabilityDelta * SampleCount);
1799  LastProbDensity = ProbDensity;
1800  }
1801  // place any leftover probability into the last bucket
1802  Buckets->ExpectedCount[CurrentBucket] +=
1803  (FLOAT32) ((0.5 - Probability) * SampleCount);
1804 
1805  // copy upper half of distribution to lower half
1806  for (i = 0, j = BUCKETTABLESIZE - 1; i < j; i++, j--)
1807  Buckets->Bucket[i] =
1808  Mirror(Buckets->Bucket[j], Buckets->NumberOfBuckets);
1809 
1810  // copy upper half of expected counts to lower half
1811  for (i = 0, j = Buckets->NumberOfBuckets - 1; i <= j; i++, j--)
1812  Buckets->ExpectedCount[i] += Buckets->ExpectedCount[j];
1813  }
1814  return Buckets;
1815 } // MakeBuckets
#define Mirror(N, R)
Definition: cluster.cpp:206
#define Odd(N)
Definition: cluster.cpp:205
#define TRUE
Definition: capi.h:45
FLOAT64(* DENSITYFUNC)(inT32)
Definition: cluster.cpp:202
FLOAT64 UniformDensity(inT32 x)
Definition: cluster.cpp:1939
FLOAT64 Integral(FLOAT64 f1, FLOAT64 f2, FLOAT64 Dx)
Definition: cluster.cpp:1958
void * Emalloc(int Size)
Definition: emalloc.cpp:47
uinT16 OptimumNumberOfBuckets(uinT32 SampleCount)
Definition: cluster.cpp:1832
FLOAT64 Confidence
Definition: cluster.cpp:180
uinT32 * Count
Definition: cluster.cpp:184
DISTRIBUTION Distribution
Definition: cluster.cpp:178
#define BUCKETTABLESIZE
Definition: cluster.cpp:159
uint32_t uinT32
Definition: host.h:39
unsigned char BOOL8
Definition: host.h:44
uinT32 SampleCount
Definition: cluster.cpp:179
FLOAT32 * ExpectedCount
Definition: cluster.cpp:185
uinT16 NumberOfBuckets
Definition: cluster.cpp:182
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2205
float FLOAT32
Definition: host.h:42
typedef int(ZCALLBACK *close_file_func) OF((voidpf opaque
FLOAT64 NormalDensity(inT32 x)
Definition: cluster.cpp:1923
uint16_t uinT16
Definition: host.h:37
FLOAT64 ComputeChiSquared(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:1869
uinT16 Bucket[BUCKETTABLESIZE]
Definition: cluster.cpp:183
double FLOAT64
Definition: host.h:43
FLOAT64 ChiSquared
Definition: cluster.cpp:181

◆ MakeClusterer()

CLUSTERER* MakeClusterer ( inT16  SampleSize,
const PARAM_DESC  ParamDesc[] 
)

This routine creates a new clusterer data structure, initializes it, and returns a pointer to it.

Parameters
SampleSizenumber of dimensions in feature space
ParamDescdescription of each dimension
Returns
pointer to the new clusterer data structure
Note
Exceptions: None
History: 5/29/89, DSJ, Created.

Definition at line 399 of file cluster.cpp.

399  {
400  CLUSTERER *Clusterer;
401  int i;
402 
403  // allocate main clusterer data structure and init simple fields
404  Clusterer = (CLUSTERER *) Emalloc (sizeof (CLUSTERER));
405  Clusterer->SampleSize = SampleSize;
406  Clusterer->NumberOfSamples = 0;
407  Clusterer->NumChar = 0;
408 
409  // init fields which will not be used initially
410  Clusterer->Root = NULL;
411  Clusterer->ProtoList = NIL_LIST;
412 
413  // maintain a copy of param descriptors in the clusterer data structure
414  Clusterer->ParamDesc =
415  (PARAM_DESC *) Emalloc (SampleSize * sizeof (PARAM_DESC));
416  for (i = 0; i < SampleSize; i++) {
417  Clusterer->ParamDesc[i].Circular = ParamDesc[i].Circular;
418  Clusterer->ParamDesc[i].NonEssential = ParamDesc[i].NonEssential;
419  Clusterer->ParamDesc[i].Min = ParamDesc[i].Min;
420  Clusterer->ParamDesc[i].Max = ParamDesc[i].Max;
421  Clusterer->ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
422  Clusterer->ParamDesc[i].HalfRange = Clusterer->ParamDesc[i].Range / 2;
423  Clusterer->ParamDesc[i].MidRange =
424  (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
425  }
426 
427  // allocate a kd tree to hold the samples
428  Clusterer->KDTree = MakeKDTree (SampleSize, ParamDesc);
429 
430  // Initialize cache of histogram buckets to minimize recomputing them.
431  for (int d = 0; d < DISTRIBUTION_COUNT; ++d) {
432  for (int c = 0; c < MAXBUCKETS + 1 - MINBUCKETS; ++c)
433  Clusterer->bucket_cache[d][c] = NULL;
434  }
435 
436  return Clusterer;
437 } // MakeClusterer
LIST ProtoList
Definition: cluster.h:92
BUCKETS * bucket_cache[DISTRIBUTION_COUNT][MAXBUCKETS+1 - MINBUCKETS]
Definition: cluster.h:95
#define MAXBUCKETS
Definition: cluster.h:27
KDTREE * MakeKDTree(inT16 KeySize, const PARAM_DESC KeyDesc[])
Definition: kdtree.cpp:182
FLOAT32 MidRange
Definition: ocrfeatures.h:53
void * Emalloc(int Size)
Definition: emalloc.cpp:47
FLOAT32 Range
Definition: ocrfeatures.h:51
#define MINBUCKETS
Definition: cluster.h:26
PARAM_DESC * ParamDesc
Definition: cluster.h:88
#define NIL_LIST
Definition: oldlist.h:126
inT8 Circular
Definition: ocrfeatures.h:47
FLOAT32 Min
Definition: ocrfeatures.h:49
inT32 NumberOfSamples
Definition: cluster.h:89
inT16 SampleSize
Definition: cluster.h:87
inT8 NonEssential
Definition: ocrfeatures.h:48
inT32 NumChar
Definition: cluster.h:93
FLOAT32 Max
Definition: ocrfeatures.h:50
CLUSTER * Root
Definition: cluster.h:91
KDTREE * KDTree
Definition: cluster.h:90
FLOAT32 HalfRange
Definition: ocrfeatures.h:52

◆ MakeDegenerateProto()

PROTOTYPE * MakeDegenerateProto ( uinT16  N,
CLUSTER Cluster,
STATISTICS Statistics,
PROTOSTYLE  Style,
inT32  MinSamples 
)

This routine checks for clusters which are degenerate and therefore cannot be analyzed in a statistically valid way. A cluster is defined as degenerate if it does not have at least MINSAMPLESNEEDED samples in it. If the cluster is found to be degenerate, a prototype of the specified style is generated and marked as insignificant. A cluster is also degenerate if it does not have at least MinSamples samples in it.

If the cluster is not degenerate, NULL is returned.

Parameters
Nnumber of dimensions
Clustercluster being analyzed
Statisticsstatistical info about cluster
Styletype of prototype to be generated
MinSamplesminimum number of samples in a cluster
Returns
Pointer to degenerate prototype or NULL.
Note
Exceptions: None
History: 6/20/89, DSJ, Created. 7/12/89, DSJ, Changed name and added check for 0 stddev. 8/8/89, DSJ, Removed check for 0 stddev (handled elsewhere).

Definition at line 1063 of file cluster.cpp.

1068  {
1069  PROTOTYPE *Proto = NULL;
1070 
1071  if (MinSamples < MINSAMPLESNEEDED)
1072  MinSamples = MINSAMPLESNEEDED;
1073 
1074  if (Cluster->SampleCount < MinSamples) {
1075  switch (Style) {
1076  case spherical:
1077  Proto = NewSphericalProto (N, Cluster, Statistics);
1078  break;
1079  case elliptical:
1080  case automatic:
1081  Proto = NewEllipticalProto (N, Cluster, Statistics);
1082  break;
1083  case mixed:
1084  Proto = NewMixedProto (N, Cluster, Statistics);
1085  break;
1086  }
1087  Proto->Significant = FALSE;
1088  }
1089  return (Proto);
1090 } // MakeDegenerateProto
#define MINSAMPLESNEEDED
Definition: cluster.cpp:151
PROTOTYPE * NewSphericalProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1499
#define FALSE
Definition: capi.h:46
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1532
Definition: cluster.h:45
PROTOTYPE * NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1576
unsigned Significant
Definition: cluster.h:68
unsigned SampleCount
Definition: cluster.h:35

◆ MakeDimRandom()

void MakeDimRandom ( uinT16  i,
PROTOTYPE Proto,
PARAM_DESC ParamDesc 
)

This routine alters the ith dimension of the specified mixed prototype to be D_random.

Parameters
iindex of dimension to be changed
Protoprototype whose dimension is to be altered
ParamDescdescription of specified dimension
Returns
None
Note
Exceptions: None
History: 6/20/89, DSJ, Created.

Definition at line 1352 of file cluster.cpp.

1352  {
1353  Proto->Distrib[i] = D_random;
1354  Proto->Mean[i] = ParamDesc->MidRange;
1355  Proto->Variance.Elliptical[i] = ParamDesc->HalfRange;
1356 
1357  // subtract out the previous magnitude of this dimension from the total
1358  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1359  Proto->Magnitude.Elliptical[i] = 1.0 / ParamDesc->Range;
1360  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1361  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1362 
1363  // note that the proto Weight is irrelevant for D_random protos
1364 } // MakeDimRandom
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOAT32 * Elliptical
Definition: cluster.h:64
FLOAT32 MidRange
Definition: ocrfeatures.h:53
FLOAT32 Range
Definition: ocrfeatures.h:51
FLOATUNION Variance
Definition: cluster.h:81
FLOAT32 * Mean
Definition: cluster.h:78
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 TotalMagnitude
Definition: cluster.h:79
FLOAT32 HalfRange
Definition: ocrfeatures.h:52

◆ MakeDimUniform()

void MakeDimUniform ( uinT16  i,
PROTOTYPE Proto,
STATISTICS Statistics 
)

This routine alters the ith dimension of the specified mixed prototype to be uniform.

Parameters
iindex of dimension to be changed
Protoprototype whose dimension is to be altered
Statisticsstatistical info about prototype
Returns
None
Note
Exceptions: None
History: 6/20/89, DSJ, Created.

Definition at line 1376 of file cluster.cpp.

1376  {
1377  Proto->Distrib[i] = uniform;
1378  Proto->Mean[i] = Proto->Cluster->Mean[i] +
1379  (Statistics->Min[i] + Statistics->Max[i]) / 2;
1380  Proto->Variance.Elliptical[i] =
1381  (Statistics->Max[i] - Statistics->Min[i]) / 2;
1382  if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1383  Proto->Variance.Elliptical[i] = MINVARIANCE;
1384 
1385  // subtract out the previous magnitude of this dimension from the total
1386  Proto->TotalMagnitude /= Proto->Magnitude.Elliptical[i];
1387  Proto->Magnitude.Elliptical[i] =
1388  1.0 / (2.0 * Proto->Variance.Elliptical[i]);
1389  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1390  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1391 
1392  // note that the proto Weight is irrelevant for uniform protos
1393 } // MakeDimUniform
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOAT32 * Elliptical
Definition: cluster.h:64
FLOAT32 Mean[1]
Definition: cluster.h:39
FLOATUNION Variance
Definition: cluster.h:81
#define MINVARIANCE
Definition: cluster.cpp:141
FLOAT32 * Min
Definition: cluster.cpp:173
FLOAT32 * Mean
Definition: cluster.h:78
FLOAT32 * Max
Definition: cluster.cpp:174
CLUSTER * Cluster
Definition: cluster.h:76
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 TotalMagnitude
Definition: cluster.h:79

◆ MakeEllipticalProto()

PROTOTYPE * MakeEllipticalProto ( CLUSTERER Clusterer,
CLUSTER Cluster,
STATISTICS Statistics,
BUCKETS Buckets 
)

This routine tests the specified cluster to see if it can be approximated by an elliptical normal distribution. If it can be, then a new prototype is formed and returned to the caller. If it can't be, then NULL is returned to the caller.

Parameters
Clustererdata struct containing samples being clustered
Clustercluster to be made into an elliptical prototype
Statisticsstatistical info about cluster
Bucketshistogram struct used to analyze distribution
Returns
Pointer to new elliptical prototype or NULL.
Note
Exceptions: None
History: 6/12/89, DSJ, Created.

Definition at line 1249 of file cluster.cpp.

1252  {
1253  PROTOTYPE *Proto = NULL;
1254  int i;
1255 
1256  // check that each dimension is a normal distribution
1257  for (i = 0; i < Clusterer->SampleSize; i++) {
1258  if (Clusterer->ParamDesc[i].NonEssential)
1259  continue;
1260 
1261  FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1262  Cluster->Mean[i],
1263  sqrt ((FLOAT64) Statistics->
1264  CoVariance[i * (Clusterer->SampleSize + 1)]));
1265  if (!DistributionOK (Buckets))
1266  break;
1267  }
1268  // if all dimensions matched a normal distribution, make a proto
1269  if (i >= Clusterer->SampleSize)
1270  Proto = NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1271  return (Proto);
1272 } // MakeEllipticalProto
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:1984
FLOAT32 Mean[1]
Definition: cluster.h:39
BOOL8 DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2125
PARAM_DESC * ParamDesc
Definition: cluster.h:88
inT16 SampleSize
Definition: cluster.h:87
inT8 NonEssential
Definition: ocrfeatures.h:48
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1532
double FLOAT64
Definition: host.h:43

◆ MakeMixedProto()

PROTOTYPE * MakeMixedProto ( CLUSTERER Clusterer,
CLUSTER Cluster,
STATISTICS Statistics,
BUCKETS NormalBuckets,
FLOAT64  Confidence 
)

This routine tests each dimension of the specified cluster to see what distribution would best approximate that dimension. Each dimension is compared to the following distributions in order: normal, random, uniform. If each dimension can be represented by one of these distributions, then a new prototype is formed and returned to the caller. If it can't be, then NULL is returned to the caller.

Parameters
Clustererdata struct containing samples being clustered
Clustercluster to be made into a prototype
Statisticsstatistical info about cluster
NormalBucketshistogram struct used to analyze distribution
Confidenceconfidence level for alternate distributions
Returns
Pointer to new mixed prototype or NULL.
Note
Exceptions: None
History: 6/12/89, DSJ, Created.

Definition at line 1291 of file cluster.cpp.

1295  {
1296  PROTOTYPE *Proto;
1297  int i;
1298  BUCKETS *UniformBuckets = NULL;
1299  BUCKETS *RandomBuckets = NULL;
1300 
1301  // create a mixed proto to work on - initially assume all dimensions normal*/
1302  Proto = NewMixedProto (Clusterer->SampleSize, Cluster, Statistics);
1303 
1304  // find the proper distribution for each dimension
1305  for (i = 0; i < Clusterer->SampleSize; i++) {
1306  if (Clusterer->ParamDesc[i].NonEssential)
1307  continue;
1308 
1309  FillBuckets (NormalBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1310  Proto->Mean[i],
1311  sqrt ((FLOAT64) Proto->Variance.Elliptical[i]));
1312  if (DistributionOK (NormalBuckets))
1313  continue;
1314 
1315  if (RandomBuckets == NULL)
1316  RandomBuckets =
1317  GetBuckets(Clusterer, D_random, Cluster->SampleCount, Confidence);
1318  MakeDimRandom (i, Proto, &(Clusterer->ParamDesc[i]));
1319  FillBuckets (RandomBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1320  Proto->Mean[i], Proto->Variance.Elliptical[i]);
1321  if (DistributionOK (RandomBuckets))
1322  continue;
1323 
1324  if (UniformBuckets == NULL)
1325  UniformBuckets =
1326  GetBuckets(Clusterer, uniform, Cluster->SampleCount, Confidence);
1327  MakeDimUniform(i, Proto, Statistics);
1328  FillBuckets (UniformBuckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1329  Proto->Mean[i], Proto->Variance.Elliptical[i]);
1330  if (DistributionOK (UniformBuckets))
1331  continue;
1332  break;
1333  }
1334  // if any dimension failed to match a distribution, discard the proto
1335  if (i < Clusterer->SampleSize) {
1336  FreePrototype(Proto);
1337  Proto = NULL;
1338  }
1339  return (Proto);
1340 } // MakeMixedProto
void MakeDimUniform(uinT16 i, PROTOTYPE *Proto, STATISTICS *Statistics)
Definition: cluster.cpp:1376
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:1984
FLOAT32 * Elliptical
Definition: cluster.h:64
void MakeDimRandom(uinT16 i, PROTOTYPE *Proto, PARAM_DESC *ParamDesc)
Definition: cluster.cpp:1352
BOOL8 DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2125
PARAM_DESC * ParamDesc
Definition: cluster.h:88
FLOATUNION Variance
Definition: cluster.h:81
inT16 SampleSize
Definition: cluster.h:87
inT8 NonEssential
Definition: ocrfeatures.h:48
BUCKETS * GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1688
FLOAT32 * Mean
Definition: cluster.h:78
void FreePrototype(void *arg)
Definition: cluster.cpp:587
PROTOTYPE * NewMixedProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1576
unsigned SampleCount
Definition: cluster.h:35
double FLOAT64
Definition: host.h:43

◆ MakeNewCluster()

CLUSTER * MakeNewCluster ( CLUSTERER Clusterer,
TEMPCLUSTER TempCluster 
)

This routine creates a new permanent cluster from the clusters specified in TempCluster. The 2 clusters in TempCluster are marked as "clustered" and deleted from the kd-tree. The new cluster is then added to the kd-tree.

Parameters
Clusterercurrent clustering environment
TempClusterpotential cluster to make permanent
Returns
Pointer to the new permanent cluster
Note
Exceptions: none
History: 5/29/89, DSJ, Created. 7/13/89, DSJ, Removed visibility of kd-tree node data struct

Definition at line 836 of file cluster.cpp.

836  {
837  CLUSTER *Cluster;
838 
839  // allocate the new cluster and initialize it
840  Cluster = (CLUSTER *) Emalloc(
841  sizeof(CLUSTER) + (Clusterer->SampleSize - 1) * sizeof(FLOAT32));
842  Cluster->Clustered = FALSE;
843  Cluster->Prototype = FALSE;
844  Cluster->Left = TempCluster->Cluster;
845  Cluster->Right = TempCluster->Neighbor;
846  Cluster->CharID = -1;
847 
848  // mark the old clusters as "clustered" and delete them from the kd-tree
849  Cluster->Left->Clustered = TRUE;
850  Cluster->Right->Clustered = TRUE;
851  KDDelete(Clusterer->KDTree, Cluster->Left->Mean, Cluster->Left);
852  KDDelete(Clusterer->KDTree, Cluster->Right->Mean, Cluster->Right);
853 
854  // compute the mean and sample count for the new cluster
855  Cluster->SampleCount =
856  MergeClusters(Clusterer->SampleSize, Clusterer->ParamDesc,
857  Cluster->Left->SampleCount, Cluster->Right->SampleCount,
858  Cluster->Mean, Cluster->Left->Mean, Cluster->Right->Mean);
859 
860  // add the new cluster to the KD tree
861  KDStore(Clusterer->KDTree, Cluster->Mean, Cluster);
862  return Cluster;
863 } // MakeNewCluster
unsigned Clustered
Definition: cluster.h:33
void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data)
Definition: kdtree.cpp:218
#define TRUE
Definition: capi.h:45
struct sample * Left
Definition: cluster.h:36
FLOAT32 Mean[1]
Definition: cluster.h:39
void * Emalloc(int Size)
Definition: emalloc.cpp:47
PARAM_DESC * ParamDesc
Definition: cluster.h:88
CLUSTER * Neighbor
Definition: cluster.cpp:164
inT16 SampleSize
Definition: cluster.h:87
CLUSTER * Cluster
Definition: cluster.cpp:163
inT32 CharID
Definition: cluster.h:38
#define FALSE
Definition: capi.h:46
struct sample * Right
Definition: cluster.h:37
unsigned Prototype
Definition: cluster.h:34
float FLOAT32
Definition: host.h:42
void KDDelete(KDTREE *Tree, FLOAT32 Key[], void *Data)
Definition: kdtree.cpp:264
inT32 MergeClusters(inT16 N, register PARAM_DESC ParamDesc[], register inT32 n1, register inT32 n2, register FLOAT32 m[], register FLOAT32 m1[], register FLOAT32 m2[])
unsigned SampleCount
Definition: cluster.h:35
KDTREE * KDTree
Definition: cluster.h:90
Definition: cluster.h:32

◆ MakePotentialClusters()

void MakePotentialClusters ( ClusteringContext context,
CLUSTER Cluster,
inT32  Level 
)

This routine is designed to be used in concert with the KDWalk routine. It will create a potential cluster for each sample in the kd-tree that is being walked. This potential cluster will then be pushed on the heap.

Parameters
contextClusteringContext (see definition above)
Clustercurrent cluster being visited in kd-tree walk
Levellevel of this cluster in the kd-tree

Definition at line 765 of file cluster.cpp.

766  {
767  ClusterPair HeapEntry;
768  int next = context->next;
769  context->candidates[next].Cluster = Cluster;
770  HeapEntry.data = &(context->candidates[next]);
771  context->candidates[next].Neighbor =
772  FindNearestNeighbor(context->tree,
773  context->candidates[next].Cluster,
774  &HeapEntry.key);
775  if (context->candidates[next].Neighbor != NULL) {
776  context->heap->Push(&HeapEntry);
777  context->next++;
778  }
779 } // MakePotentialClusters
ClusterHeap * heap
Definition: cluster.cpp:196
void Push(Pair *entry)
Definition: genericheap.h:95
CLUSTER * Neighbor
Definition: cluster.cpp:164
CLUSTER * Cluster
Definition: cluster.cpp:163
TEMPCLUSTER * candidates
Definition: cluster.cpp:197
CLUSTER * FindNearestNeighbor(KDTREE *Tree, CLUSTER *Cluster, FLOAT32 *Distance)
Definition: cluster.cpp:798

◆ MakePrototype()

PROTOTYPE * MakePrototype ( CLUSTERER Clusterer,
CLUSTERCONFIG Config,
CLUSTER Cluster 
)

This routine attempts to create a prototype from the specified cluster that conforms to the distribution specified in Config. If there are too few samples in the cluster to perform a statistical analysis, then a prototype is generated but labelled as insignificant. If the dimensions of the cluster are not independent, no prototype is generated and NULL is returned. If a prototype can be found that matches the desired distribution then a pointer to it is returned, otherwise NULL is returned.

Parameters
Clustererdata structure holding cluster tree
Configparameters used to control prototype generation
Clustercluster to be made into a prototype
Returns
Pointer to new prototype or NULL
Note
Exceptions: None
History: 6/19/89, DSJ, Created.

Definition at line 969 of file cluster.cpp.

971  {
972  STATISTICS *Statistics;
973  PROTOTYPE *Proto;
974  BUCKETS *Buckets;
975 
976  // filter out clusters which contain samples from the same character
977  if (MultipleCharSamples (Clusterer, Cluster, Config->MaxIllegal))
978  return NULL;
979 
980  // compute the covariance matrix and ranges for the cluster
981  Statistics =
982  ComputeStatistics(Clusterer->SampleSize, Clusterer->ParamDesc, Cluster);
983 
984  // check for degenerate clusters which need not be analyzed further
985  // note that the MinSamples test assumes that all clusters with multiple
986  // character samples have been removed (as above)
987  Proto = MakeDegenerateProto(
988  Clusterer->SampleSize, Cluster, Statistics, Config->ProtoStyle,
989  (inT32) (Config->MinSamples * Clusterer->NumChar));
990  if (Proto != NULL) {
991  FreeStatistics(Statistics);
992  return Proto;
993  }
994  // check to ensure that all dimensions are independent
995  if (!Independent(Clusterer->ParamDesc, Clusterer->SampleSize,
996  Statistics->CoVariance, Config->Independence)) {
997  FreeStatistics(Statistics);
998  return NULL;
999  }
1000 
1001  if (HOTELLING && Config->ProtoStyle == elliptical) {
1002  Proto = TestEllipticalProto(Clusterer, Config, Cluster, Statistics);
1003  if (Proto != NULL) {
1004  FreeStatistics(Statistics);
1005  return Proto;
1006  }
1007  }
1008 
1009  // create a histogram data structure used to evaluate distributions
1010  Buckets = GetBuckets(Clusterer, normal, Cluster->SampleCount,
1011  Config->Confidence);
1012 
1013  // create a prototype based on the statistics and test it
1014  switch (Config->ProtoStyle) {
1015  case spherical:
1016  Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1017  break;
1018  case elliptical:
1019  Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1020  break;
1021  case mixed:
1022  Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
1023  Config->Confidence);
1024  break;
1025  case automatic:
1026  Proto = MakeSphericalProto(Clusterer, Cluster, Statistics, Buckets);
1027  if (Proto != NULL)
1028  break;
1029  Proto = MakeEllipticalProto(Clusterer, Cluster, Statistics, Buckets);
1030  if (Proto != NULL)
1031  break;
1032  Proto = MakeMixedProto(Clusterer, Cluster, Statistics, Buckets,
1033  Config->Confidence);
1034  break;
1035  }
1036  FreeStatistics(Statistics);
1037  return Proto;
1038 } // MakePrototype
PROTOSTYLE ProtoStyle
Definition: cluster.h:49
FLOAT32 * CoVariance
Definition: cluster.cpp:172
int32_t inT32
Definition: host.h:38
PROTOTYPE * MakeEllipticalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
Definition: cluster.cpp:1249
PROTOTYPE * MakeDegenerateProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics, PROTOSTYLE Style, inT32 MinSamples)
Definition: cluster.cpp:1063
PROTOTYPE * MakeMixedProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *NormalBuckets, FLOAT64 Confidence)
Definition: cluster.cpp:1291
PARAM_DESC * ParamDesc
Definition: cluster.h:88
FLOAT32 MinSamples
Definition: cluster.h:50
FLOAT32 MaxIllegal
Definition: cluster.h:51
PROTOTYPE * MakeSphericalProto(CLUSTERER *Clusterer, CLUSTER *Cluster, STATISTICS *Statistics, BUCKETS *Buckets)
Definition: cluster.cpp:1212
FLOAT64 Confidence
Definition: cluster.h:54
inT16 SampleSize
Definition: cluster.h:87
BUCKETS * GetBuckets(CLUSTERER *clusterer, DISTRIBUTION Distribution, uinT32 SampleCount, FLOAT64 Confidence)
Definition: cluster.cpp:1688
PROTOTYPE * TestEllipticalProto(CLUSTERER *Clusterer, CLUSTERCONFIG *Config, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1105
FLOAT32 Independence
Definition: cluster.h:53
inT32 NumChar
Definition: cluster.h:93
STATISTICS * ComputeStatistics(inT16 N, PARAM_DESC ParamDesc[], CLUSTER *Cluster)
Definition: cluster.cpp:1412
Definition: cluster.h:45
Definition: cluster.h:59
BOOL8 Independent(PARAM_DESC ParamDesc[], inT16 N, FLOAT32 *CoVariance, FLOAT32 Independence)
Definition: cluster.cpp:1641
#define HOTELLING
Definition: cluster.cpp:29
unsigned SampleCount
Definition: cluster.h:35
void FreeStatistics(STATISTICS *Statistics)
Definition: cluster.cpp:2153
BOOL8 MultipleCharSamples(CLUSTERER *Clusterer, CLUSTER *Cluster, FLOAT32 MaxIllegal)
Definition: cluster.cpp:2465

◆ MakeSample()

SAMPLE* MakeSample ( CLUSTERER Clusterer,
const FLOAT32 Feature,
inT32  CharID 
)

This routine creates a new sample data structure to hold the specified feature. This sample is added to the clusterer data structure (so that it knows which samples are to be clustered later), and a pointer to the sample is returned to the caller.

Parameters
Clustererclusterer data structure to add sample to
Featurefeature to be added to clusterer
CharIDunique ident. of char that sample came from
Returns
Pointer to the new sample data structure
Note
Exceptions: ALREADYCLUSTERED MakeSample can't be called after ClusterSamples has been called
History: 5/29/89, DSJ, Created.

Definition at line 455 of file cluster.cpp.

456  {
457  SAMPLE *Sample;
458  int i;
459 
460  // see if the samples have already been clustered - if so trap an error
461  if (Clusterer->Root != NULL)
463  "Can't add samples after they have been clustered");
464 
465  // allocate the new sample and initialize it
466  Sample = (SAMPLE *) Emalloc (sizeof (SAMPLE) +
467  (Clusterer->SampleSize -
468  1) * sizeof (FLOAT32));
469  Sample->Clustered = FALSE;
470  Sample->Prototype = FALSE;
471  Sample->SampleCount = 1;
472  Sample->Left = NULL;
473  Sample->Right = NULL;
474  Sample->CharID = CharID;
475 
476  for (i = 0; i < Clusterer->SampleSize; i++)
477  Sample->Mean[i] = Feature[i];
478 
479  // add the sample to the KD tree - keep track of the total # of samples
480  Clusterer->NumberOfSamples++;
481  KDStore (Clusterer->KDTree, Sample->Mean, (char *) Sample);
482  if (CharID >= Clusterer->NumChar)
483  Clusterer->NumChar = CharID + 1;
484 
485  // execute hook for monitoring clustering operation
486  // (*SampleCreationHook)( Sample );
487 
488  return (Sample);
489 } // MakeSample
unsigned Clustered
Definition: cluster.h:33
void KDStore(KDTREE *Tree, FLOAT32 *Key, void *Data)
Definition: kdtree.cpp:218
struct sample * Left
Definition: cluster.h:36
FLOAT32 Mean[1]
Definition: cluster.h:39
void * Emalloc(int Size)
Definition: emalloc.cpp:47
inT32 NumberOfSamples
Definition: cluster.h:89
inT16 SampleSize
Definition: cluster.h:87
inT32 CharID
Definition: cluster.h:38
#define FALSE
Definition: capi.h:46
#define ALREADYCLUSTERED
Definition: cluster.h:133
struct sample * Right
Definition: cluster.h:37
inT32 NumChar
Definition: cluster.h:93
unsigned Prototype
Definition: cluster.h:34
float FLOAT32
Definition: host.h:42
void DoError(int Error, const char *Message)
Definition: danerror.cpp:42
CLUSTER * Root
Definition: cluster.h:91
unsigned SampleCount
Definition: cluster.h:35
KDTREE * KDTree
Definition: cluster.h:90
Definition: cluster.h:32

◆ MakeSphericalProto()

PROTOTYPE * MakeSphericalProto ( CLUSTERER Clusterer,
CLUSTER Cluster,
STATISTICS Statistics,
BUCKETS Buckets 
)

This routine tests the specified cluster to see if it can be approximated by a spherical normal distribution. If it can be, then a new prototype is formed and returned to the caller. If it can't be, then NULL is returned to the caller.

Parameters
Clustererdata struct containing samples being clustered
Clustercluster to be made into a spherical prototype
Statisticsstatistical info about cluster
Bucketshistogram struct used to analyze distribution
Returns
Pointer to new spherical prototype or NULL.
Note
Exceptions: None
History: 6/1/89, DSJ, Created.

Definition at line 1212 of file cluster.cpp.

1215  {
1216  PROTOTYPE *Proto = NULL;
1217  int i;
1218 
1219  // check that each dimension is a normal distribution
1220  for (i = 0; i < Clusterer->SampleSize; i++) {
1221  if (Clusterer->ParamDesc[i].NonEssential)
1222  continue;
1223 
1224  FillBuckets (Buckets, Cluster, i, &(Clusterer->ParamDesc[i]),
1225  Cluster->Mean[i],
1226  sqrt ((FLOAT64) (Statistics->AvgVariance)));
1227  if (!DistributionOK (Buckets))
1228  break;
1229  }
1230  // if all dimensions matched a normal distribution, make a proto
1231  if (i >= Clusterer->SampleSize)
1232  Proto = NewSphericalProto (Clusterer->SampleSize, Cluster, Statistics);
1233  return (Proto);
1234 } // MakeSphericalProto
void FillBuckets(BUCKETS *Buckets, CLUSTER *Cluster, uinT16 Dim, PARAM_DESC *ParamDesc, FLOAT32 Mean, FLOAT32 StdDev)
Definition: cluster.cpp:1984
FLOAT32 Mean[1]
Definition: cluster.h:39
BOOL8 DistributionOK(BUCKETS *Buckets)
Definition: cluster.cpp:2125
PARAM_DESC * ParamDesc
Definition: cluster.h:88
PROTOTYPE * NewSphericalProto(uinT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1499
inT16 SampleSize
Definition: cluster.h:87
inT8 NonEssential
Definition: ocrfeatures.h:48
FLOAT32 AvgVariance
Definition: cluster.cpp:171
double FLOAT64
Definition: host.h:43

◆ Mean()

FLOAT32 Mean ( PROTOTYPE Proto,
uinT16  Dimension 
)

This routine returns the mean of the specified prototype in the indicated dimension.

Parameters
Protoprototype to return mean of
Dimensiondimension whose mean is to be returned
Returns
Mean of Prototype in Dimension
Note
Exceptions: none
History: 7/6/89, DSJ, Created.

Definition at line 644 of file cluster.cpp.

644  {
645  return (Proto->Mean[Dimension]);
646 } // Mean
FLOAT32 * Mean
Definition: cluster.h:78

◆ MergeClusters() [1/2]

inT32 MergeClusters ( inT16  N,
register PARAM_DESC  ParamDesc[],
register inT32  n1,
register inT32  n2,
register FLOAT32  m[],
register FLOAT32  m1[],
register FLOAT32  m2[] 
)

◆ MergeClusters() [2/2]

inT32 MergeClusters ( inT16  N,
PARAM_DESC  ParamDesc[],
inT32  n1,
inT32  n2,
FLOAT32  m[],
FLOAT32  m1[],
FLOAT32  m2[] 
)

This routine merges two clusters into one larger cluster. To do this it computes the number of samples in the new cluster and the mean of the new cluster. The ParamDesc information is used to ensure that circular dimensions are handled correctly.

Parameters
N# of dimensions (size of arrays)
ParamDescarray of dimension descriptions
n1,n2number of samples in each old cluster
marray to hold mean of new cluster
m1,m2arrays containing means of old clusters
Returns
The number of samples in the new cluster.
Note
Exceptions: None
History: 5/31/89, DSJ, Created.

Definition at line 880 of file cluster.cpp.

885  {
886  inT32 i, n;
887 
888  n = n1 + n2;
889  for (i = N; i > 0; i--, ParamDesc++, m++, m1++, m2++) {
890  if (ParamDesc->Circular) {
891  // if distance between means is greater than allowed
892  // reduce upper point by one "rotation" to compute mean
893  // then normalize the mean back into the accepted range
894  if ((*m2 - *m1) > ParamDesc->HalfRange) {
895  *m = (n1 * *m1 + n2 * (*m2 - ParamDesc->Range)) / n;
896  if (*m < ParamDesc->Min)
897  *m += ParamDesc->Range;
898  }
899  else if ((*m1 - *m2) > ParamDesc->HalfRange) {
900  *m = (n1 * (*m1 - ParamDesc->Range) + n2 * *m2) / n;
901  if (*m < ParamDesc->Min)
902  *m += ParamDesc->Range;
903  }
904  else
905  *m = (n1 * *m1 + n2 * *m2) / n;
906  }
907  else
908  *m = (n1 * *m1 + n2 * *m2) / n;
909  }
910  return n;
911 } // MergeClusters
int32_t inT32
Definition: host.h:38

◆ MultipleCharSamples()

BOOL8 MultipleCharSamples ( CLUSTERER Clusterer,
CLUSTER Cluster,
FLOAT32  MaxIllegal 
)

This routine looks at all samples in the specified cluster. It computes a running estimate of the percentage of the charaters which have more than 1 sample in the cluster. When this percentage exceeds MaxIllegal, TRUE is returned. Otherwise FALSE is returned. The CharID fields must contain integers which identify the training characters which were used to generate the sample. One integer is used for each sample. The NumChar field in the Clusterer must contain the number of characters in the training set. All CharID fields must be between 0 and NumChar-1. The main function of this routine is to help identify clusters which need to be split further, i.e. if numerous training characters have 2 or more features which are contained in the same cluster, then the cluster should be split.

Parameters
Clustererdata structure holding cluster tree
Clustercluster containing samples to be tested
MaxIllegalmax percentage of samples allowed to have more than 1 feature in the cluster
Returns
TRUE if the cluster should be split, FALSE otherwise.
Note
Exceptions: none
History: Wed Aug 30 11:13:05 1989, DSJ, Created. 2/22/90, DSJ, Added MaxIllegal control rather than always splitting illegal clusters.

Definition at line 2465 of file cluster.cpp.

2468 {
2469  static BOOL8 *CharFlags = NULL;
2470  static inT32 NumFlags = 0;
2471  int i;
2472  LIST SearchState;
2473  SAMPLE *Sample;
2474  inT32 CharID;
2475  inT32 NumCharInCluster;
2476  inT32 NumIllegalInCluster;
2477  FLOAT32 PercentIllegal;
2478 
2479  // initial estimate assumes that no illegal chars exist in the cluster
2480  NumCharInCluster = Cluster->SampleCount;
2481  NumIllegalInCluster = 0;
2482 
2483  if (Clusterer->NumChar > NumFlags) {
2484  free(CharFlags);
2485  NumFlags = Clusterer->NumChar;
2486  CharFlags = (BOOL8 *) Emalloc (NumFlags * sizeof (BOOL8));
2487  }
2488 
2489  for (i = 0; i < NumFlags; i++)
2490  CharFlags[i] = FALSE;
2491 
2492  // find each sample in the cluster and check if we have seen it before
2493  InitSampleSearch(SearchState, Cluster);
2494  while ((Sample = NextSample (&SearchState)) != NULL) {
2495  CharID = Sample->CharID;
2496  if (CharFlags[CharID] == FALSE) {
2497  CharFlags[CharID] = TRUE;
2498  }
2499  else {
2500  if (CharFlags[CharID] == TRUE) {
2501  NumIllegalInCluster++;
2502  CharFlags[CharID] = ILLEGAL_CHAR;
2503  }
2504  NumCharInCluster--;
2505  PercentIllegal = (FLOAT32) NumIllegalInCluster / NumCharInCluster;
2506  if (PercentIllegal > MaxIllegal) {
2507  destroy(SearchState);
2508  return (TRUE);
2509  }
2510  }
2511  }
2512  return (FALSE);
2513 
2514 } // MultipleCharSamples
#define TRUE
Definition: capi.h:45
int32_t inT32
Definition: host.h:38
LIST destroy(LIST list)
Definition: oldlist.cpp:182
void * Emalloc(int Size)
Definition: emalloc.cpp:47
#define InitSampleSearch(S, C)
Definition: cluster.h:105
unsigned char BOOL8
Definition: host.h:44
inT32 CharID
Definition: cluster.h:38
#define FALSE
Definition: capi.h:46
inT32 NumChar
Definition: cluster.h:93
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:620
float FLOAT32
Definition: host.h:42
unsigned SampleCount
Definition: cluster.h:35
Definition: cluster.h:32
#define ILLEGAL_CHAR

◆ NewChiStruct()

CHISTRUCT * NewChiStruct ( uinT16  DegreesOfFreedom,
FLOAT64  Alpha 
)

This routine allocates a new data structure which is used to hold a chi-squared value along with its associated number of degrees of freedom and alpha value.

Parameters
DegreesOfFreedomdegrees of freedom for new chi value
Alphaconfidence level for new chi value
Returns
none
Note
Exceptions: none
History: Fri Aug 4 11:04:59 1989, DSJ, Created.

Definition at line 2326 of file cluster.cpp.

2326  {
2328 
2329  NewChiStruct = (CHISTRUCT *) Emalloc (sizeof (CHISTRUCT));
2330  NewChiStruct->DegreesOfFreedom = DegreesOfFreedom;
2331  NewChiStruct->Alpha = Alpha;
2332  return (NewChiStruct);
2333 
2334 } // NewChiStruct
void * Emalloc(int Size)
Definition: emalloc.cpp:47
uinT16 DegreesOfFreedom
Definition: cluster.cpp:189
uinT16 DegreesOfFreedom(DISTRIBUTION Distribution, uinT16 HistogramBuckets)
Definition: cluster.cpp:2205
FLOAT64 Alpha
Definition: cluster.cpp:190
CHISTRUCT * NewChiStruct(uinT16 DegreesOfFreedom, FLOAT64 Alpha)
Definition: cluster.cpp:2326

◆ NewEllipticalProto()

PROTOTYPE * NewEllipticalProto ( inT16  N,
CLUSTER Cluster,
STATISTICS Statistics 
)

This routine creates an elliptical prototype data structure to approximate the samples in the specified cluster. Elliptical prototypes have a variance for each dimension. All dimensions are normally distributed and independent.

Parameters
Nnumber of dimensions
Clustercluster to be made into an elliptical prototype
Statisticsstatistical info about samples in cluster
Returns
Pointer to a new elliptical prototype data structure
Note
Exceptions: None
History: 6/19/89, DSJ, Created.

Definition at line 1532 of file cluster.cpp.

1534  {
1535  PROTOTYPE *Proto;
1536  FLOAT32 *CoVariance;
1537  int i;
1538 
1539  Proto = NewSimpleProto (N, Cluster);
1540  Proto->Variance.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1541  Proto->Magnitude.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1542  Proto->Weight.Elliptical = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1543 
1544  CoVariance = Statistics->CoVariance;
1545  Proto->TotalMagnitude = 1.0;
1546  for (i = 0; i < N; i++, CoVariance += N + 1) {
1547  Proto->Variance.Elliptical[i] = *CoVariance;
1548  if (Proto->Variance.Elliptical[i] < MINVARIANCE)
1549  Proto->Variance.Elliptical[i] = MINVARIANCE;
1550 
1551  Proto->Magnitude.Elliptical[i] =
1552  1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Elliptical[i]));
1553  Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
1554  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
1555  }
1556  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1557  Proto->Style = elliptical;
1558  return (Proto);
1559 } // NewEllipticalProto
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOAT32 * CoVariance
Definition: cluster.cpp:172
FLOAT32 * Elliptical
Definition: cluster.h:64
void * Emalloc(int Size)
Definition: emalloc.cpp:47
FLOATUNION Weight
Definition: cluster.h:83
PROTOTYPE * NewSimpleProto(inT16 N, CLUSTER *Cluster)
Definition: cluster.cpp:1600
FLOATUNION Variance
Definition: cluster.h:81
#define MINVARIANCE
Definition: cluster.cpp:141
unsigned Style
Definition: cluster.h:74
#define PI
Definition: const.h:19
float FLOAT32
Definition: host.h:42
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 TotalMagnitude
Definition: cluster.h:79

◆ NewMixedProto()

PROTOTYPE * NewMixedProto ( inT16  N,
CLUSTER Cluster,
STATISTICS Statistics 
)

This routine creates a mixed prototype data structure to approximate the samples in the specified cluster. Mixed prototypes can have different distributions for each dimension. All dimensions are independent. The structure is initially filled in as though it were an elliptical prototype. The actual distributions of the dimensions can be altered by other routines.

Parameters
Nnumber of dimensions
Clustercluster to be made into a mixed prototype
Statisticsstatistical info about samples in cluster
Returns
Pointer to a new mixed prototype data structure
Note
Exceptions: None
History: 6/19/89, DSJ, Created.

Definition at line 1576 of file cluster.cpp.

1576  {
1577  PROTOTYPE *Proto;
1578  int i;
1579 
1580  Proto = NewEllipticalProto (N, Cluster, Statistics);
1581  Proto->Distrib = (DISTRIBUTION *) Emalloc (N * sizeof (DISTRIBUTION));
1582 
1583  for (i = 0; i < N; i++) {
1584  Proto->Distrib[i] = normal;
1585  }
1586  Proto->Style = mixed;
1587  return (Proto);
1588 } // NewMixedProto
DISTRIBUTION * Distrib
Definition: cluster.h:77
void * Emalloc(int Size)
Definition: emalloc.cpp:47
unsigned Style
Definition: cluster.h:74
DISTRIBUTION
Definition: cluster.h:58
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1532
Definition: cluster.h:45
Definition: cluster.h:59

◆ NewSimpleProto()

PROTOTYPE * NewSimpleProto ( inT16  N,
CLUSTER Cluster 
)

This routine allocates memory to hold a simple prototype data structure, i.e. one without independent distributions and variances for each dimension.

Parameters
Nnumber of dimensions
Clustercluster to be made into a prototype
Returns
Pointer to new simple prototype
Note
Exceptions: None
History: 6/19/89, DSJ, Created.

Definition at line 1600 of file cluster.cpp.

1600  {
1601  PROTOTYPE *Proto;
1602  int i;
1603 
1604  Proto = (PROTOTYPE *) Emalloc (sizeof (PROTOTYPE));
1605  Proto->Mean = (FLOAT32 *) Emalloc (N * sizeof (FLOAT32));
1606 
1607  for (i = 0; i < N; i++)
1608  Proto->Mean[i] = Cluster->Mean[i];
1609  Proto->Distrib = NULL;
1610 
1611  Proto->Significant = TRUE;
1612  Proto->Merged = FALSE;
1613  Proto->Style = spherical;
1614  Proto->NumSamples = Cluster->SampleCount;
1615  Proto->Cluster = Cluster;
1616  Proto->Cluster->Prototype = TRUE;
1617  return (Proto);
1618 } // NewSimpleProto
DISTRIBUTION * Distrib
Definition: cluster.h:77
#define TRUE
Definition: capi.h:45
FLOAT32 Mean[1]
Definition: cluster.h:39
unsigned Merged
Definition: cluster.h:69
void * Emalloc(int Size)
Definition: emalloc.cpp:47
unsigned Style
Definition: cluster.h:74
#define FALSE
Definition: capi.h:46
FLOAT32 * Mean
Definition: cluster.h:78
unsigned Prototype
Definition: cluster.h:34
float FLOAT32
Definition: host.h:42
unsigned NumSamples
Definition: cluster.h:75
unsigned Significant
Definition: cluster.h:68
CLUSTER * Cluster
Definition: cluster.h:76
unsigned SampleCount
Definition: cluster.h:35

◆ NewSphericalProto()

PROTOTYPE * NewSphericalProto ( uinT16  N,
CLUSTER Cluster,
STATISTICS Statistics 
)

This routine creates a spherical prototype data structure to approximate the samples in the specified cluster. Spherical prototypes have a single variance which is common across all dimensions. All dimensions are normally distributed and independent.

Parameters
Nnumber of dimensions
Clustercluster to be made into a spherical prototype
Statisticsstatistical info about samples in cluster
Returns
Pointer to a new spherical prototype data structure
Note
Exceptions: None
History: 6/19/89, DSJ, Created.

Definition at line 1499 of file cluster.cpp.

1501  {
1502  PROTOTYPE *Proto;
1503 
1504  Proto = NewSimpleProto (N, Cluster);
1505 
1506  Proto->Variance.Spherical = Statistics->AvgVariance;
1507  if (Proto->Variance.Spherical < MINVARIANCE)
1508  Proto->Variance.Spherical = MINVARIANCE;
1509 
1510  Proto->Magnitude.Spherical =
1511  1.0 / sqrt ((double) (2.0 * PI * Proto->Variance.Spherical));
1512  Proto->TotalMagnitude = (float)pow((double)Proto->Magnitude.Spherical,
1513  (double) N);
1514  Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
1515  Proto->LogMagnitude = log ((double) Proto->TotalMagnitude);
1516 
1517  return (Proto);
1518 } // NewSphericalProto
FLOAT32 LogMagnitude
Definition: cluster.h:80
FLOATUNION Weight
Definition: cluster.h:83
PROTOTYPE * NewSimpleProto(inT16 N, CLUSTER *Cluster)
Definition: cluster.cpp:1600
FLOATUNION Variance
Definition: cluster.h:81
#define MINVARIANCE
Definition: cluster.cpp:141
#define PI
Definition: const.h:19
FLOAT32 AvgVariance
Definition: cluster.cpp:171
FLOAT32 Spherical
Definition: cluster.h:63
FLOATUNION Magnitude
Definition: cluster.h:82
FLOAT32 TotalMagnitude
Definition: cluster.h:79

◆ NextSample()

CLUSTER* NextSample ( LIST SearchState)

This routine is used to find all of the samples which belong to a cluster. It starts by removing the top cluster on the cluster list (SearchState). If this cluster is a leaf it is returned. Otherwise, the right subcluster is pushed on the list and we continue the search in the left subcluster. This continues until a leaf is found. If all samples have been found, NULL is returned. InitSampleSearch() must be called before NextSample() to initialize the search.

Parameters
SearchStateptr to list containing clusters to be searched
Returns
Pointer to the next leaf cluster (sample) or NULL.
Note
Exceptions: None
History: 6/16/89, DSJ, Created.

Definition at line 620 of file cluster.cpp.

620  {
621  CLUSTER *Cluster;
622 
623  if (*SearchState == NIL_LIST)
624  return (NULL);
625  Cluster = (CLUSTER *) first_node (*SearchState);
626  *SearchState = pop (*SearchState);
627  while (TRUE) {
628  if (Cluster->Left == NULL)
629  return (Cluster);
630  *SearchState = push (*SearchState, Cluster->Right);
631  Cluster = Cluster->Left;
632  }
633 } // NextSample
#define TRUE
Definition: capi.h:45
struct sample * Left
Definition: cluster.h:36
#define NIL_LIST
Definition: oldlist.h:126
LIST pop(LIST list)
Definition: oldlist.cpp:299
struct sample * Right
Definition: cluster.h:37
#define first_node(l)
Definition: oldlist.h:139
Definition: cluster.h:32
LIST push(LIST list, void *element)
Definition: oldlist.cpp:317

◆ NormalBucket()

uinT16 NormalBucket ( PARAM_DESC ParamDesc,
FLOAT32  x,
FLOAT32  Mean,
FLOAT32  StdDev 
)

This routine determines which bucket x falls into in the discrete normal distribution defined by kNormalMean and kNormalStdDev. x values which exceed the range of the discrete distribution are clipped.

Parameters
ParamDescused to identify circular dimensions
xvalue to be normalized
Meanmean of normal distribution
StdDevstandard deviation of normal distribution
Returns
Bucket number into which x falls
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 2056 of file cluster.cpp.

2059  {
2060  FLOAT32 X;
2061 
2062  // wraparound circular parameters if necessary
2063  if (ParamDesc->Circular) {
2064  if (x - Mean > ParamDesc->HalfRange)
2065  x -= ParamDesc->Range;
2066  else if (x - Mean < -ParamDesc->HalfRange)
2067  x += ParamDesc->Range;
2068  }
2069 
2070  X = ((x - Mean) / StdDev) * kNormalStdDev + kNormalMean;
2071  if (X < 0)
2072  return 0;
2073  if (X > BUCKETTABLESIZE - 1)
2074  return ((uinT16) (BUCKETTABLESIZE - 1));
2075  return (uinT16) floor((FLOAT64) X);
2076 } // NormalBucket
FLOAT32 Range
Definition: ocrfeatures.h:51
inT8 Circular
Definition: ocrfeatures.h:47
#define BUCKETTABLESIZE
Definition: cluster.cpp:159
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:644
float FLOAT32
Definition: host.h:42
uint16_t uinT16
Definition: host.h:37
double FLOAT64
Definition: host.h:43
FLOAT32 HalfRange
Definition: ocrfeatures.h:52

◆ NormalDensity()

FLOAT64 NormalDensity ( inT32  x)

This routine computes the probability density function of a discrete normal distribution defined by the global variables kNormalMean, kNormalVariance, and kNormalMagnitude. Normal magnitude could, of course, be computed in terms of the normal variance but it is precomputed for efficiency.

Parameters
xnumber to compute the normal probability density for
Note
Globals: kNormalMean mean of a discrete normal distribution kNormalVariance variance of a discrete normal distribution kNormalMagnitude magnitude of a discrete normal distribution
Returns
The value of the normal distribution at x.
Note
Exceptions: None
History: 6/4/89, DSJ, Created.

Definition at line 1923 of file cluster.cpp.

1923  {
1924  FLOAT64 Distance;
1925 
1926  Distance = x - kNormalMean;
1927  return kNormalMagnitude * exp(-0.5 * Distance * Distance / kNormalVariance);
1928 } // NormalDensity
double FLOAT64
Definition: host.h:43

◆ NumBucketsMatch()

int NumBucketsMatch ( void *  arg1,
void *  arg2 
)

This routine is used to search a list of histogram data structures to find one with the specified number of buckets. It is called by the list search routines.

Parameters
arg1current histogram being tested for a match
arg2match key
Returns
TRUE if arg1 matches arg2
Note
Exceptions: none
History: Thu Aug 3 14:17:33 1989, DSJ, Created.

Definition at line 2227 of file cluster.cpp.

2228  { // uinT16 *DesiredNumberOfBuckets)
2229  BUCKETS *Histogram = (BUCKETS *) arg1;
2230  uinT16 *DesiredNumberOfBuckets = (uinT16 *) arg2;
2231 
2232  return (*DesiredNumberOfBuckets == Histogram->NumberOfBuckets);
2233 
2234 } // NumBucketsMatch
uinT16 NumberOfBuckets
Definition: cluster.cpp:182
uint16_t uinT16
Definition: host.h:37

◆ OptimumNumberOfBuckets()

uinT16 OptimumNumberOfBuckets ( uinT32  SampleCount)

This routine computes the optimum number of histogram buckets that should be used in a chi-squared goodness of fit test for the specified number of samples. The optimum number is computed based on Table 4.1 on pg. 147 of "Measurement and Analysis of Random Data" by Bendat & Piersol. Linear interpolation is used to interpolate between table values. The table is intended for a 0.05 level of significance (alpha). This routine assumes that it is equally valid for other alpha's, which may not be true.

Parameters
SampleCountnumber of samples to be tested
Returns
Optimum number of histogram buckets
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 1832 of file cluster.cpp.

1832  {
1833  uinT8 Last, Next;
1834  FLOAT32 Slope;
1835 
1836  if (SampleCount < kCountTable[0])
1837  return kBucketsTable[0];
1838 
1839  for (Last = 0, Next = 1; Next < LOOKUPTABLESIZE; Last++, Next++) {
1840  if (SampleCount <= kCountTable[Next]) {
1841  Slope = (FLOAT32) (kBucketsTable[Next] - kBucketsTable[Last]) /
1842  (FLOAT32) (kCountTable[Next] - kCountTable[Last]);
1843  return ((uinT16) (kBucketsTable[Last] +
1844  Slope * (SampleCount - kCountTable[Last])));
1845  }
1846  }
1847  return kBucketsTable[Last];
1848 } // OptimumNumberOfBuckets
#define LOOKUPTABLESIZE
Definition: cluster.cpp:227
float FLOAT32
Definition: host.h:42
uint8_t uinT8
Definition: host.h:35
uint16_t uinT16
Definition: host.h:37

◆ Solve()

FLOAT64 Solve ( SOLVEFUNC  Function,
void *  FunctionParams,
FLOAT64  InitialGuess,
FLOAT64  Accuracy 
)

This routine attempts to find an x value at which Function goes to zero (i.e. a root of the function ). It will only work correctly if a solution actually exists and there are no extrema between the solution and the InitialGuess. The algorithms used are extremely primitive.

Parameters
Functionfunction whose zero is to be found
FunctionParamsarbitrary data to pass to function
InitialGuesspoint to start solution search at
Accuracymaximum allowed error
Returns
Solution of function ( x for which f(x) = 0 ).
Note
Exceptions: none
History: Fri Aug 4 11:08:59 1989, DSJ, Created.

Definition at line 2352 of file cluster.cpp.

2356 {
2357  FLOAT64 x;
2358  FLOAT64 f;
2359  FLOAT64 Slope;
2360  FLOAT64 Delta;
2361  FLOAT64 NewDelta;
2362  FLOAT64 xDelta;
2363  FLOAT64 LastPosX, LastNegX;
2364 
2365  x = InitialGuess;
2366  Delta = INITIALDELTA;
2367  LastPosX = MAX_FLOAT32;
2368  LastNegX = -MAX_FLOAT32;
2369  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2370  while (Abs (LastPosX - LastNegX) > Accuracy) {
2371  // keep track of outer bounds of current estimate
2372  if (f < 0)
2373  LastNegX = x;
2374  else
2375  LastPosX = x;
2376 
2377  // compute the approx. slope of f(x) at the current point
2378  Slope =
2379  ((*Function) ((CHISTRUCT *) FunctionParams, x + Delta) - f) / Delta;
2380 
2381  // compute the next solution guess */
2382  xDelta = f / Slope;
2383  x -= xDelta;
2384 
2385  // reduce the delta used for computing slope to be a fraction of
2386  //the amount moved to get to the new guess
2387  NewDelta = Abs (xDelta) * DELTARATIO;
2388  if (NewDelta < Delta)
2389  Delta = NewDelta;
2390 
2391  // compute the value of the function at the new guess
2392  f = (*Function) ((CHISTRUCT *) FunctionParams, x);
2393  }
2394  return (x);
2395 
2396 } // Solve
#define INITIALDELTA
#define Abs(N)
Definition: cluster.cpp:207
#define DELTARATIO
#define MAX_FLOAT32
Definition: host.h:66
double FLOAT64
Definition: host.h:43

◆ StandardDeviation()

FLOAT32 StandardDeviation ( PROTOTYPE Proto,
uinT16  Dimension 
)

This routine returns the standard deviation of the prototype in the indicated dimension.

Parameters
Protoprototype to return standard deviation of
Dimensiondimension whose stddev is to be returned
Returns
Standard deviation of Prototype in Dimension
Note
Exceptions: none
History: 7/6/89, DSJ, Created.

Definition at line 657 of file cluster.cpp.

657  {
658  switch (Proto->Style) {
659  case spherical:
660  return ((FLOAT32) sqrt ((double) Proto->Variance.Spherical));
661  case elliptical:
662  return ((FLOAT32)
663  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
664  case mixed:
665  switch (Proto->Distrib[Dimension]) {
666  case normal:
667  return ((FLOAT32)
668  sqrt ((double) Proto->Variance.Elliptical[Dimension]));
669  case uniform:
670  case D_random:
671  return (Proto->Variance.Elliptical[Dimension]);
672  case DISTRIBUTION_COUNT:
673  ASSERT_HOST(!"Distribution count not allowed!");
674  }
675  }
676  return 0.0f;
677 } // StandardDeviation
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 * Elliptical
Definition: cluster.h:64
#define ASSERT_HOST(x)
Definition: errcode.h:84
FLOATUNION Variance
Definition: cluster.h:81
unsigned Style
Definition: cluster.h:74
float FLOAT32
Definition: host.h:42
Definition: cluster.h:45
FLOAT32 Spherical
Definition: cluster.h:63
Definition: cluster.h:59

◆ TestEllipticalProto()

PROTOTYPE * TestEllipticalProto ( CLUSTERER Clusterer,
CLUSTERCONFIG Config,
CLUSTER Cluster,
STATISTICS Statistics 
)

This routine tests the specified cluster to see if ** there is a statistically significant difference between the sub-clusters that would be made if the cluster were to be split. If not, then a new prototype is formed and returned to the caller. If there is, then NULL is returned to the caller.

Parameters
Clustererdata struct containing samples being clustered
Configprovides the magic number of samples that make a good cluster
Clustercluster to be made into an elliptical prototype
Statisticsstatistical info about cluster
Returns
Pointer to new elliptical prototype or NULL.

Definition at line 1105 of file cluster.cpp.

1108  {
1109  // Fraction of the number of samples used as a range around 1 within
1110  // which a cluster has the magic size that allows a boost to the
1111  // FTable by kFTableBoostMargin, thus allowing clusters near the
1112  // magic size (equal to the number of sample characters) to be more
1113  // likely to stay together.
1114  const double kMagicSampleMargin = 0.0625;
1115  const double kFTableBoostMargin = 2.0;
1116 
1117  int N = Clusterer->SampleSize;
1118  CLUSTER* Left = Cluster->Left;
1119  CLUSTER* Right = Cluster->Right;
1120  if (Left == NULL || Right == NULL)
1121  return NULL;
1122  int TotalDims = Left->SampleCount + Right->SampleCount;
1123  if (TotalDims < N + 1 || TotalDims < 2)
1124  return NULL;
1125  const int kMatrixSize = N * N * sizeof(FLOAT32);
1126  FLOAT32* Covariance = static_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1127  FLOAT32* Inverse = static_cast<FLOAT32 *>(Emalloc(kMatrixSize));
1128  FLOAT32* Delta = static_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
1129  // Compute a new covariance matrix that only uses essential features.
1130  for (int i = 0; i < N; ++i) {
1131  int row_offset = i * N;
1132  if (!Clusterer->ParamDesc[i].NonEssential) {
1133  for (int j = 0; j < N; ++j) {
1134  if (!Clusterer->ParamDesc[j].NonEssential)
1135  Covariance[j + row_offset] = Statistics->CoVariance[j + row_offset];
1136  else
1137  Covariance[j + row_offset] = 0.0f;
1138  }
1139  } else {
1140  for (int j = 0; j < N; ++j) {
1141  if (i == j)
1142  Covariance[j + row_offset] = 1.0f;
1143  else
1144  Covariance[j + row_offset] = 0.0f;
1145  }
1146  }
1147  }
1148  double err = InvertMatrix(Covariance, N, Inverse);
1149  if (err > 1) {
1150  tprintf("Clustering error: Matrix inverse failed with error %g\n", err);
1151  }
1152  int EssentialN = 0;
1153  for (int dim = 0; dim < N; ++dim) {
1154  if (!Clusterer->ParamDesc[dim].NonEssential) {
1155  Delta[dim] = Left->Mean[dim] - Right->Mean[dim];
1156  ++EssentialN;
1157  } else {
1158  Delta[dim] = 0.0f;
1159  }
1160  }
1161  // Compute Hotelling's T-squared.
1162  double Tsq = 0.0;
1163  for (int x = 0; x < N; ++x) {
1164  double temp = 0.0;
1165  for (int y = 0; y < N; ++y) {
1166  temp += Inverse[y + N*x] * Delta[y];
1167  }
1168  Tsq += Delta[x] * temp;
1169  }
1170  free(Covariance);
1171  free(Inverse);
1172  free(Delta);
1173  // Changed this function to match the formula in
1174  // Statistical Methods in Medical Research p 473
1175  // By Peter Armitage, Geoffrey Berry, J. N. S. Matthews.
1176  // Tsq *= Left->SampleCount * Right->SampleCount / TotalDims;
1177  double F = Tsq * (TotalDims - EssentialN - 1) / ((TotalDims - 2)*EssentialN);
1178  int Fx = EssentialN;
1179  if (Fx > FTABLE_X)
1180  Fx = FTABLE_X;
1181  --Fx;
1182  int Fy = TotalDims - EssentialN - 1;
1183  if (Fy > FTABLE_Y)
1184  Fy = FTABLE_Y;
1185  --Fy;
1186  double FTarget = FTable[Fy][Fx];
1187  if (Config->MagicSamples > 0 &&
1188  TotalDims >= Config->MagicSamples * (1.0 - kMagicSampleMargin) &&
1189  TotalDims <= Config->MagicSamples * (1.0 + kMagicSampleMargin)) {
1190  // Give magic-sized clusters a magic FTable boost.
1191  FTarget += kFTableBoostMargin;
1192  }
1193  if (F < FTarget) {
1194  return NewEllipticalProto (Clusterer->SampleSize, Cluster, Statistics);
1195  }
1196  return NULL;
1197 }
FLOAT32 * CoVariance
Definition: cluster.cpp:172
struct sample * Left
Definition: cluster.h:36
FLOAT32 Mean[1]
Definition: cluster.h:39
void * Emalloc(int Size)
Definition: emalloc.cpp:47
#define tprintf(...)
Definition: tprintf.h:31
PARAM_DESC * ParamDesc
Definition: cluster.h:88
int MagicSamples
Definition: cluster.h:55
inT16 SampleSize
Definition: cluster.h:87
inT8 NonEssential
Definition: ocrfeatures.h:48
struct sample * Right
Definition: cluster.h:37
PROTOTYPE * NewEllipticalProto(inT16 N, CLUSTER *Cluster, STATISTICS *Statistics)
Definition: cluster.cpp:1532
float FLOAT32
Definition: host.h:42
#define FTABLE_Y
Definition: cluster.cpp:31
double InvertMatrix(const float *input, int size, float *inv)
Definition: cluster.cpp:2521
#define FTABLE_X
Definition: cluster.cpp:30
unsigned SampleCount
Definition: cluster.h:35
Definition: cluster.h:32
const double FTable[FTABLE_Y][FTABLE_X]
Definition: cluster.cpp:34

◆ UniformBucket()

uinT16 UniformBucket ( PARAM_DESC ParamDesc,
FLOAT32  x,
FLOAT32  Mean,
FLOAT32  StdDev 
)

This routine determines which bucket x falls into in the discrete uniform distribution defined by BUCKETTABLESIZE. x values which exceed the range of the discrete distribution are clipped.

Parameters
ParamDescused to identify circular dimensions
xvalue to be normalized
Meancenter of range of uniform distribution
StdDev1/2 the range of the uniform distribution
Returns
Bucket number into which x falls
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 2091 of file cluster.cpp.

2094  {
2095  FLOAT32 X;
2096 
2097  // wraparound circular parameters if necessary
2098  if (ParamDesc->Circular) {
2099  if (x - Mean > ParamDesc->HalfRange)
2100  x -= ParamDesc->Range;
2101  else if (x - Mean < -ParamDesc->HalfRange)
2102  x += ParamDesc->Range;
2103  }
2104 
2105  X = ((x - Mean) / (2 * StdDev) * BUCKETTABLESIZE + BUCKETTABLESIZE / 2.0);
2106  if (X < 0)
2107  return 0;
2108  if (X > BUCKETTABLESIZE - 1)
2109  return (uinT16) (BUCKETTABLESIZE - 1);
2110  return (uinT16) floor((FLOAT64) X);
2111 } // UniformBucket
FLOAT32 Range
Definition: ocrfeatures.h:51
inT8 Circular
Definition: ocrfeatures.h:47
#define BUCKETTABLESIZE
Definition: cluster.cpp:159
FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:644
float FLOAT32
Definition: host.h:42
uint16_t uinT16
Definition: host.h:37
double FLOAT64
Definition: host.h:43
FLOAT32 HalfRange
Definition: ocrfeatures.h:52

◆ UniformDensity()

FLOAT64 UniformDensity ( inT32  x)

This routine computes the probability density function of a uniform distribution at the specified point. The range of the distribution is from 0 to BUCKETTABLESIZE.

Parameters
xnumber to compute the uniform probability density for
Returns
The value of the uniform distribution at x.
Note
Exceptions: None
History: 6/5/89, DSJ, Created.

Definition at line 1939 of file cluster.cpp.

1939  {
1940  static FLOAT64 UniformDistributionDensity = (FLOAT64) 1.0 / BUCKETTABLESIZE;
1941 
1942  if ((x >= 0.0) && (x <= BUCKETTABLESIZE))
1943  return UniformDistributionDensity;
1944  else
1945  return (FLOAT64) 0.0;
1946 } // UniformDensity
#define BUCKETTABLESIZE
Definition: cluster.cpp:159
double FLOAT64
Definition: host.h:43

Variable Documentation

◆ FTable

const double FTable[FTABLE_Y][FTABLE_X]

Definition at line 34 of file cluster.cpp.