tesseract  4.00.00dev
cluster.h
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: cluster.h
3  ** Purpose: Definition of feature space clustering routines
4  ** Author: Dan Johnson
5  ** History: 5/29/89, DSJ, Created.
6  **
7  ** (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
18 #ifndef CLUSTER_H
19 #define CLUSTER_H
20 
21 #include "kdtree.h"
22 #include "oldlist.h"
23 
// Forward declaration only: BUCKETS is not defined anywhere in this header.
24 struct BUCKETS;
25 
// Lower and upper limits on a bucket count. No use of either macro is visible
// in this listing.
// NOTE(review): presumably these bound the histogram sizes of the cache that
// the comment inside CLUSTERER mentions - confirm against cluster.cpp.
26 #define MINBUCKETS 5
27 #define MAXBUCKETS 39
28 
29 /*----------------------------------------------------------------------
30  Types
31 ----------------------------------------------------------------------*/
// Node of the cluster tree. The same layout serves both as an individual
// sample and as a cluster with Left/Right sub-clusters (see the SAMPLE alias
// directly after the struct). The three bit-fields add up to 32 bits.
32 typedef struct sample {
33  unsigned Clustered:1; // TRUE if included in a higher cluster
34  unsigned Prototype:1; // TRUE if cluster represented by a proto
35  unsigned SampleCount:30; // number of samples in this cluster (30-bit field)
36  struct sample *Left; // ptr to left sub-cluster
37  struct sample *Right; // ptr to right sub-cluster
38  inT32 CharID; // identifier of char sample came from
39  FLOAT32 Mean[1]; // mean of cluster - SampleSize floats
 // NOTE(review): Mean is declared with one element but documented as holding
 // SampleSize floats, so nodes are presumably over-allocated by whoever
 // creates them - confirm in cluster.cpp before using sizeof(CLUSTER).
40 } CLUSTER;
41 
42 typedef CLUSTER SAMPLE; // can refer to as either sample or cluster
43 
// Enumeration used by CLUSTERCONFIG::ProtoStyle to specify the types of
// protos to be made.
// NOTE(review): this listing jumps from line 44 to line 46, so the enumerator
// list (original line 45) is not visible here. It must be restored from the
// original header; as shown, the enum is empty and will not compile.
44 typedef enum {
46 } PROTOSTYLE;
47 
// Parameters that control clustering; passed to ClusterSamples() as
// CLUSTERCONFIG *Config.
48 typedef struct { // parameters to control clustering
49  PROTOSTYLE ProtoStyle; // specifies types of protos to be made
50  FLOAT32 MinSamples; // min # of samples per proto - % of total
51  FLOAT32 MaxIllegal; // max percentage of samples in a cluster which have
52  // more than 1 feature in that cluster
53  FLOAT32 Independence; // desired independence between dimensions
54  FLOAT64 Confidence; // desired confidence in prototypes created
55  int MagicSamples; // Ideal number of samples in a cluster.
56 } CLUSTERCONFIG;
57 
// Enumeration of distribution kinds; PROTOTYPE::Distrib points at values of
// this type ("different distribution for each dimension").
// NOTE(review): this listing jumps from line 58 to line 60, so the enumerator
// list (original line 59) is not visible here. It must be restored from the
// original header; as shown, the enum is empty and will not compile.
58 typedef enum {
60 } DISTRIBUTION;
61 
// Storage for a prototype parameter (PROTOTYPE::Variance, ::Magnitude and
// ::Weight): either one scalar or a pointer to FLOAT32 data. Only one member
// is valid at a time, since this is a union.
// NOTE(review): which member is live presumably follows PROTOTYPE::Style, and
// Elliptical presumably points at one value per dimension - confirm in
// cluster.cpp.
62 typedef union {
63  FLOAT32 Spherical; // single value
64  FLOAT32 *Elliptical; // pointer to FLOAT32 values
65 } FLOATUNION;
66 
// Prototype derived from a cluster. The four bit-fields add up to 32 bits.
// Variance, Magnitude and Weight are FLOATUNIONs, so each holds either a
// scalar or a pointer.
67 typedef struct {
68  unsigned Significant:1; // TRUE if prototype is significant
69  unsigned Merged:1; // Merged after clustering so do not output
70  // but kept for display purposes. If it has no
71  // samples then it was actually merged.
72  // Otherwise it matched an already significant
73  // cluster.
74  unsigned Style:2; // spherical, elliptical, or mixed (2-bit field)
75  unsigned NumSamples:28; // number of samples in the cluster (28-bit field)
76  CLUSTER *Cluster; // ptr to cluster which made prototype
77  DISTRIBUTION *Distrib; // different distribution for each dimension
78  FLOAT32 *Mean; // prototype mean
79  FLOAT32 TotalMagnitude; // total magnitude over all dimensions
80  FLOAT32 LogMagnitude; // log base e of TotalMagnitude
81  FLOATUNION Variance; // prototype variance
82  FLOATUNION Magnitude; // magnitude of density function
83  FLOATUNION Weight; // weight of density function
84 } PROTOTYPE;
85 
// State of one clustering run: the sample description, the samples themselves
// (KDTree / Root) and the resulting prototype list.
// NOTE(review): this listing jumps from line 94 to line 96, so the member that
// the "cache of reusable histograms" comment describes (original line 95) is
// not visible here. It must be restored from the original header.
86 typedef struct {
87  inT16 SampleSize; // number of parameters per sample
88  PARAM_DESC *ParamDesc; // description of each parameter
89  inT32 NumberOfSamples; // total number of samples being clustered
90  KDTREE *KDTree; // for optimal nearest neighbor searching
91  CLUSTER *Root; // ptr to root cluster of cluster tree
92  LIST ProtoList; // list of prototypes
93  inT32 NumChar; // # of characters represented by samples
94  // cache of reusable histograms by distribution type and number of buckets.
96 } CLUSTERER;
97 
// Counted list of pointers to SAMPLE structures.
98 typedef struct {
99  inT32 NumSamples; // number of samples in list
100  inT32 MaxNumSamples; // maximum size of list
101  SAMPLE *Sample[1]; // array of ptrs to sample data structures
 // NOTE(review): Sample is declared with one element while MaxNumSamples
 // gives the list capacity, so the struct is presumably over-allocated by
 // its creator - confirm before relying on sizeof(SAMPLELIST).
102 } SAMPLELIST;
103 
104 // low level cluster tree analysis routines.
// InitSampleSearch(S, C): assigns NIL_LIST to S when C is NULL, otherwise
// assigns push(NIL_LIST, (C)) to S. The whole macro is one conditional
// expression whose value is the value assigned. S is assigned to without
// parentheses, so it must be a plain modifiable lvalue, and C is evaluated
// twice on the non-NULL path.
// NOTE(review): S is presumably the search state later handed to
// NextSample(LIST *SearchState) - confirm at the call sites.
105 #define InitSampleSearch(S,C) (((C)==NULL)?(S=NIL_LIST):(S=push(NIL_LIST,(C))))
106 
107 /*--------------------------------------------------------------------------
108  Public Function Prototypes
109 --------------------------------------------------------------------------*/
// Declared here, defined in cluster.cpp. Takes the number of parameters per
// sample and a read-only array of parameter descriptions; returns a pointer
// to a CLUSTERER.
// NOTE(review): ownership of the result is not visible here; presumably it is
// released with FreeClusterer() - confirm.
110 CLUSTERER *MakeClusterer (inT16 SampleSize, const PARAM_DESC ParamDesc[]);
111 
// Declared here, defined in cluster.cpp. Takes a clusterer, a read-only
// FLOAT32 feature array and the id of the character the sample came from;
// returns a pointer to a SAMPLE.
// NOTE(review): Feature presumably holds Clusterer->SampleSize floats -
// confirm.
112 SAMPLE *MakeSample(CLUSTERER * Clusterer, const FLOAT32* Feature, inT32 CharID);
113 
// Declared here, defined in cluster.cpp. Takes a clusterer and a
// CLUSTERCONFIG holding the clustering parameters; returns a LIST.
// NOTE(review): the result is presumably the list of prototypes (compare
// CLUSTERER::ProtoList) - confirm in cluster.cpp.
114 LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config);
115 
// Declared here, defined in cluster.cpp. Takes the clusterer to release.
116 void FreeClusterer(CLUSTERER *Clusterer);
117 
// Declared here, defined in cluster.cpp. Takes a pointer to a LIST, so the
// callee is able to modify the caller's list variable.
118 void FreeProtoList(LIST *ProtoList);
119 
// Declared here, defined in cluster.cpp. The parameter is untyped (void *);
// per the trailing comment the argument is a PROTOTYPE *.
// NOTE(review): the void * signature presumably lets this be passed as a
// generic list-element destructor - confirm.
120 void FreePrototype(void *arg); // PROTOTYPE *Prototype);
121 
// Declared here, defined in cluster.cpp. Takes a pointer to a search-state
// LIST and returns a CLUSTER pointer.
// NOTE(review): presumably the state is set up with InitSampleSearch() and a
// NULL return ends the search - confirm.
122 CLUSTER *NextSample(LIST *SearchState);
123 
// Takes a prototype and a dimension index; returns a FLOAT32.
// NOTE(review): presumably the prototype mean along Dimension - confirm.
124 FLOAT32 Mean(PROTOTYPE *Proto, uinT16 Dimension);
125 
// Declared here, defined in cluster.cpp. Takes a prototype and a dimension
// index; returns a FLOAT32.
// NOTE(review): presumably the standard deviation along Dimension - confirm.
126 FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension);
127 
// Declared here, defined in cluster.cpp. Takes a dimension count N, the
// parameter descriptions, two integer counts n1/n2 and three FLOAT32 arrays
// m, m1, m2; returns an inT32.
// NOTE(review): presumably m receives the merge of means m1 (n1 samples) and
// m2 (n2 samples) and the return value is the merged sample count - confirm.
128 inT32 MergeClusters(inT16 N, PARAM_DESC ParamDesc[], inT32 n1, inT32 n2,
129  FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[]);
130 
131 //--------------Global Data Definitions and Declarations---------------------------
132 // define errors that can be trapped
// Numeric error code; nothing in this header uses it.
// NOTE(review): presumably raised when samples are added after clustering has
// already run - confirm in cluster.cpp.
133 #define ALREADYCLUSTERED 4000
134 #endif
LIST ProtoList
Definition: cluster.h:92
PROTOSTYLE ProtoStyle
Definition: cluster.h:49
inT32 NumSamples
Definition: cluster.h:99
DISTRIBUTION * Distrib
Definition: cluster.h:77
FLOAT32 LogMagnitude
Definition: cluster.h:80
inT32 MaxNumSamples
Definition: cluster.h:100
#define MAXBUCKETS
Definition: cluster.h:27
unsigned Clustered
Definition: cluster.h:33
Definition: kdtree.h:49
FLOAT32 * Elliptical
Definition: cluster.h:64
int32_t inT32
Definition: host.h:38
struct sample * Left
Definition: cluster.h:36
FLOAT32 Mean[1]
Definition: cluster.h:39
unsigned Merged
Definition: cluster.h:69
#define MINBUCKETS
Definition: cluster.h:26
PARAM_DESC * ParamDesc
Definition: cluster.h:88
FLOAT32 MinSamples
Definition: cluster.h:50
FLOAT32 MaxIllegal
Definition: cluster.h:51
CLUSTERCONFIG Config
FLOATUNION Weight
Definition: cluster.h:83
struct sample CLUSTER
Definition: cluster.h:40
int16_t inT16
Definition: host.h:36
FLOAT64 Confidence
Definition: cluster.h:54
int MagicSamples
Definition: cluster.h:55
FLOAT32 StandardDeviation(PROTOTYPE *Proto, uinT16 Dimension)
Definition: cluster.cpp:657
FLOATUNION Variance
Definition: cluster.h:81
inT32 NumberOfSamples
Definition: cluster.h:89
inT16 SampleSize
Definition: cluster.h:87
unsigned Style
Definition: cluster.h:74
inT32 MergeClusters(inT16 N, PARAM_DESC ParamDesc[], inT32 n1, inT32 n2, FLOAT32 m[], FLOAT32 m1[], FLOAT32 m2[])
Definition: cluster.cpp:880
void FreePrototype(void *arg)
Definition: cluster.cpp:587
CLUSTER * NextSample(LIST *SearchState)
Definition: cluster.cpp:620
void FreeProtoList(LIST *ProtoList)
Definition: cluster.cpp:573
inT32 CharID
Definition: cluster.h:38
DISTRIBUTION
Definition: cluster.h:58
struct sample * Right
Definition: cluster.h:37
PROTOSTYLE
Definition: cluster.h:44
FLOAT32 * Mean
Definition: cluster.h:78
FLOAT32 Independence
Definition: cluster.h:53
inT32 NumChar
Definition: cluster.h:93
LIST ClusterSamples(CLUSTERER *Clusterer, CLUSTERCONFIG *Config)
Definition: cluster.cpp:512
unsigned Prototype
Definition: cluster.h:34
float FLOAT32
Definition: host.h:42
void FreeClusterer(CLUSTERER *Clusterer)
Definition: cluster.cpp:546
CLUSTER SAMPLE
Definition: cluster.h:42
Definition: cluster.h:45
FLOAT32 Spherical
Definition: cluster.h:63
Definition: cluster.h:59
unsigned NumSamples
Definition: cluster.h:75
unsigned Significant
Definition: cluster.h:68
CLUSTER * Cluster
Definition: cluster.h:76
CLUSTER * Root
Definition: cluster.h:91
unsigned SampleCount
Definition: cluster.h:35
KDTREE * KDTree
Definition: cluster.h:90
sample
Definition: cluster.h:32
FLOATUNION Magnitude
Definition: cluster.h:82
uint16_t uinT16
Definition: host.h:37
FLOAT32 TotalMagnitude
Definition: cluster.h:79
double FLOAT64
Definition: host.h:43
SAMPLE * MakeSample(CLUSTERER *Clusterer, const FLOAT32 *Feature, inT32 CharID)
Definition: cluster.cpp:455
CLUSTERER * MakeClusterer(inT16 SampleSize, const PARAM_DESC ParamDesc[])
Definition: cluster.cpp:399