tesseract  4.00.00dev
clusttool.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: clusttool.cpp
3  ** Purpose: Misc. tools for use with the clustering routines
4  ** Author: Dan Johnson
5  ** History: 6/6/89, DSJ, Created.
6  **
7  ** (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
18 
19 //--------------------------Include Files----------------------------------
20 #include "clusttool.h"
21 #include "const.h"
22 #include "danerror.h"
23 #include "emalloc.h"
24 #include "scanutils.h"
25 #include <stdio.h>
26 #include <math.h>
27 
28 using tesseract::TFile;
29 
30 //---------------Global Data Definitions and Declarations--------------------
31 #define TOKENSIZE 80 //< max size of tokens read from an input file
32 #define QUOTED_TOKENSIZE "79"
33 #define MAXSAMPLESIZE 65535 //< max num of dimensions in feature space
34 //#define MAXBLOCKSIZE 65535 //< max num of samples in a character (block
35 // size)
36 
48  int SampleSize = 0;
49 
50  const int kMaxLineSize = 100;
51  char line[kMaxLineSize];
52  if (fp->FGets(line, kMaxLineSize) == nullptr ||
53  sscanf(line, "%d", &SampleSize) != 1 || (SampleSize < 0) ||
54  (SampleSize > MAXSAMPLESIZE))
55  DoError (ILLEGALSAMPLESIZE, "Illegal sample size");
56  return (SampleSize);
57 }
58 
74  PARAM_DESC *ParamDesc;
75  char linear_token[TOKENSIZE], essential_token[TOKENSIZE];
76 
77  ParamDesc = (PARAM_DESC *) Emalloc (N * sizeof (PARAM_DESC));
78  for (int i = 0; i < N; i++) {
79  const int kMaxLineSize = TOKENSIZE * 4;
80  char line[kMaxLineSize];
81  if (fp->FGets(line, kMaxLineSize) == nullptr ||
82  sscanf(line, "%" QUOTED_TOKENSIZE "s %" QUOTED_TOKENSIZE "s %f %f",
83  linear_token, essential_token, &ParamDesc[i].Min,
84  &ParamDesc[i].Max) != 4)
85  DoError(ILLEGALCIRCULARSPEC, "Illegal Parameter specification");
86  if (linear_token[0] == 'c')
87  ParamDesc[i].Circular = TRUE;
88  else
89  ParamDesc[i].Circular = FALSE;
90 
91  if (linear_token[0] == 'e')
92  ParamDesc[i].NonEssential = FALSE;
93  else
94  ParamDesc[i].NonEssential = TRUE;
95  ParamDesc[i].Range = ParamDesc[i].Max - ParamDesc[i].Min;
96  ParamDesc[i].HalfRange = ParamDesc[i].Range / 2;
97  ParamDesc[i].MidRange = (ParamDesc[i].Max + ParamDesc[i].Min) / 2;
98  }
99  return (ParamDesc);
100 }
101 
119  char sig_token[TOKENSIZE], shape_token[TOKENSIZE];
120  PROTOTYPE *Proto;
121  int SampleCount;
122  int i;
123 
124  const int kMaxLineSize = TOKENSIZE * 4;
125  char line[kMaxLineSize];
126  if (fp->FGets(line, kMaxLineSize) == nullptr ||
127  sscanf(line, "%" QUOTED_TOKENSIZE "s %" QUOTED_TOKENSIZE "s %d",
128  sig_token, shape_token, &SampleCount) != 3) {
129  tprintf("Invalid prototype: %s\n", line);
130  return nullptr;
131  }
132  Proto = (PROTOTYPE *)Emalloc(sizeof(PROTOTYPE));
133  Proto->Cluster = NULL;
134  if (sig_token[0] == 's')
135  Proto->Significant = TRUE;
136  else
137  Proto->Significant = FALSE;
138 
139  Proto->Style = ReadProtoStyle(shape_token);
140 
141  if (SampleCount < 0) DoError(ILLEGALSAMPLECOUNT, "Illegal sample count");
142  Proto->NumSamples = SampleCount;
143 
144  Proto->Mean = ReadNFloats(fp, N, NULL);
145  if (Proto->Mean == NULL) DoError(ILLEGALMEANSPEC, "Illegal prototype mean");
146 
147  switch (Proto->Style) {
148  case spherical:
149  if (ReadNFloats(fp, 1, &(Proto->Variance.Spherical)) == NULL)
150  DoError(ILLEGALVARIANCESPEC, "Illegal prototype variance");
151  Proto->Magnitude.Spherical =
152  1.0 / sqrt((double)(2.0 * PI * Proto->Variance.Spherical));
153  Proto->TotalMagnitude = pow(Proto->Magnitude.Spherical, (float)N);
154  Proto->LogMagnitude = log((double)Proto->TotalMagnitude);
155  Proto->Weight.Spherical = 1.0 / Proto->Variance.Spherical;
156  Proto->Distrib = NULL;
157  break;
158  case elliptical:
159  Proto->Variance.Elliptical = ReadNFloats(fp, N, NULL);
160  if (Proto->Variance.Elliptical == NULL)
161  DoError(ILLEGALVARIANCESPEC, "Illegal prototype variance");
162  Proto->Magnitude.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
163  Proto->Weight.Elliptical = (FLOAT32 *)Emalloc(N * sizeof(FLOAT32));
164  Proto->TotalMagnitude = 1.0;
165  for (i = 0; i < N; i++) {
166  Proto->Magnitude.Elliptical[i] =
167  1.0 / sqrt((double)(2.0 * PI * Proto->Variance.Elliptical[i]));
168  Proto->Weight.Elliptical[i] = 1.0 / Proto->Variance.Elliptical[i];
169  Proto->TotalMagnitude *= Proto->Magnitude.Elliptical[i];
170  }
171  Proto->LogMagnitude = log((double)Proto->TotalMagnitude);
172  Proto->Distrib = NULL;
173  break;
174  default:
175  Efree(Proto);
176  tprintf("Invalid prototype style\n");
177  return nullptr;
178  }
179  return Proto;
180 }
181 
191 PROTOSTYLE ReadProtoStyle(const char *shape) {
192  switch (shape[0]) {
193  case 's':
194  return spherical;
195  case 'e':
196  return elliptical;
197  case 'a':
198  return automatic;
199  default:
200  break;
201  }
202  tprintf("Invalid prototype style specification:%s\n", shape);
203  return elliptical;
204 }
205 
220 FLOAT32 *ReadNFloats(TFile *fp, uinT16 N, FLOAT32 Buffer[]) {
221  const int kMaxLineSize = 1024;
222  char line[kMaxLineSize];
223  if (fp->FGets(line, kMaxLineSize) == nullptr) {
224  tprintf("Hit EOF in ReadNFloats!\n");
225  return nullptr;
226  }
227  bool needs_free = false;
228 
229  if (Buffer == NULL) {
230  Buffer = static_cast<FLOAT32*>(Emalloc(N * sizeof(FLOAT32)));
231  needs_free = true;
232  }
233 
234  char *startptr = line;
235  for (int i = 0; i < N; i++) {
236  char *endptr;
237  Buffer[i] = strtof(startptr, &endptr);
238  if (endptr == startptr) {
239  tprintf("Read of %d floats failed!\n", N);
240  if (needs_free) Efree(Buffer);
241  return nullptr;
242  }
243  startptr = endptr;
244  }
245  return Buffer;
246 }
247 
259 void WriteParamDesc(FILE *File, uinT16 N, const PARAM_DESC ParamDesc[]) {
260  int i;
261 
262  for (i = 0; i < N; i++) {
263  if (ParamDesc[i].Circular)
264  fprintf (File, "circular ");
265  else
266  fprintf (File, "linear ");
267 
268  if (ParamDesc[i].NonEssential)
269  fprintf (File, "non-essential ");
270  else
271  fprintf (File, "essential ");
272 
273  fprintf (File, "%10.6f %10.6f\n", ParamDesc[i].Min, ParamDesc[i].Max);
274  }
275 }
276 
288 void WritePrototype(FILE *File, uinT16 N, PROTOTYPE *Proto) {
289  int i;
290 
291  if (Proto->Significant)
292  fprintf (File, "significant ");
293  else
294  fprintf (File, "insignificant ");
295  WriteProtoStyle (File, (PROTOSTYLE) Proto->Style);
296  fprintf (File, "%6d\n\t", Proto->NumSamples);
297  WriteNFloats (File, N, Proto->Mean);
298  fprintf (File, "\t");
299 
300  switch (Proto->Style) {
301  case spherical:
302  WriteNFloats (File, 1, &(Proto->Variance.Spherical));
303  break;
304  case elliptical:
305  WriteNFloats (File, N, Proto->Variance.Elliptical);
306  break;
307  case mixed:
308  for (i = 0; i < N; i++)
309  switch (Proto->Distrib[i]) {
310  case normal:
311  fprintf (File, " %9s", "normal");
312  break;
313  case uniform:
314  fprintf (File, " %9s", "uniform");
315  break;
316  case D_random:
317  fprintf (File, " %9s", "random");
318  break;
319  case DISTRIBUTION_COUNT:
320  ASSERT_HOST(!"Distribution count not allowed!");
321  }
322  fprintf (File, "\n\t");
323  WriteNFloats (File, N, Proto->Variance.Elliptical);
324  }
325 }
326 
338 void WriteNFloats(FILE * File, uinT16 N, FLOAT32 Array[]) {
339  for (int i = 0; i < N; i++)
340  fprintf(File, " %9.6f", Array[i]);
341  fprintf(File, "\n");
342 }
343 
355 void WriteProtoStyle(FILE *File, PROTOSTYLE ProtoStyle) {
356  switch (ProtoStyle) {
357  case spherical:
358  fprintf (File, "spherical");
359  break;
360  case elliptical:
361  fprintf (File, "elliptical");
362  break;
363  case mixed:
364  fprintf (File, "mixed");
365  break;
366  case automatic:
367  fprintf (File, "automatic");
368  break;
369  }
370 }
371 
389 void WriteProtoList(FILE *File, uinT16 N, PARAM_DESC ParamDesc[],
390  LIST ProtoList, BOOL8 WriteSigProtos,
391  BOOL8 WriteInsigProtos) {
392  PROTOTYPE *Proto;
393 
394  /* write file header */
395  fprintf(File,"%0d\n",N);
396  WriteParamDesc(File,N,ParamDesc);
397 
398  /* write prototypes */
399  iterate(ProtoList)
400  {
401  Proto = (PROTOTYPE *) first_node ( ProtoList );
402  if ((Proto->Significant && WriteSigProtos) ||
403  (!Proto->Significant && WriteInsigProtos))
404  WritePrototype(File, N, Proto);
405  }
406 }
DISTRIBUTION * Distrib
Definition: cluster.h:77
void WriteProtoStyle(FILE *File, PROTOSTYLE ProtoStyle)
Definition: clusttool.cpp:355
FLOAT32 LogMagnitude
Definition: cluster.h:80
#define TRUE
Definition: capi.h:45
FLOAT32 * Elliptical
Definition: cluster.h:64
PROTOTYPE * ReadPrototype(TFile *fp, uinT16 N)
Definition: clusttool.cpp:118
#define QUOTED_TOKENSIZE
Definition: clusttool.cpp:32
void WriteParamDesc(FILE *File, uinT16 N, const PARAM_DESC ParamDesc[])
Definition: clusttool.cpp:259
#define ILLEGALSAMPLECOUNT
Definition: clusttool.h:60
void WritePrototype(FILE *File, uinT16 N, PROTOTYPE *Proto)
Definition: clusttool.cpp:288
#define MAXSAMPLESIZE
Definition: clusttool.cpp:33
FLOAT32 MidRange
Definition: ocrfeatures.h:53
void * Emalloc(int Size)
Definition: emalloc.cpp:47
FLOAT32 Range
Definition: ocrfeatures.h:51
#define tprintf(...)
Definition: tprintf.h:31
FLOATUNION Weight
Definition: cluster.h:83
inT8 Circular
Definition: ocrfeatures.h:47
FLOAT32 Min
Definition: ocrfeatures.h:49
#define ASSERT_HOST(x)
Definition: errcode.h:84
FLOATUNION Variance
Definition: cluster.h:81
#define ILLEGALCIRCULARSPEC
Definition: clusttool.h:56
inT8 NonEssential
Definition: ocrfeatures.h:48
unsigned Style
Definition: cluster.h:74
void WriteNFloats(FILE *File, uinT16 N, FLOAT32 Array[])
Definition: clusttool.cpp:338
#define PI
Definition: const.h:19
FLOAT32 * ReadNFloats(TFile *fp, uinT16 N, FLOAT32 Buffer[])
Definition: clusttool.cpp:220
uinT16 ReadSampleSize(TFile *fp)
Definition: clusttool.cpp:47
unsigned char BOOL8
Definition: host.h:44
#define FALSE
Definition: capi.h:46
PROTOSTYLE
Definition: cluster.h:44
FLOAT32 * Mean
Definition: cluster.h:78
#define TOKENSIZE
Definition: clusttool.cpp:31
#define first_node(l)
Definition: oldlist.h:139
PROTOSTYLE ReadProtoStyle(const char *shape)
Definition: clusttool.cpp:191
#define ILLEGALSAMPLESIZE
Definition: clusttool.h:55
float FLOAT32
Definition: host.h:42
Definition: cluster.h:45
FLOAT32 Spherical
Definition: cluster.h:63
Definition: cluster.h:59
void DoError(int Error, const char *Message)
Definition: danerror.cpp:42
FLOAT32 Max
Definition: ocrfeatures.h:50
unsigned NumSamples
Definition: cluster.h:75
PARAM_DESC * ReadParamDesc(TFile *fp, uinT16 N)
Definition: clusttool.cpp:73
unsigned Significant
Definition: cluster.h:68
CLUSTER * Cluster
Definition: cluster.h:76
#define ILLEGALVARIANCESPEC
Definition: clusttool.h:62
void Efree(void *ptr)
Definition: emalloc.cpp:79
#define ILLEGALMEANSPEC
Definition: clusttool.h:61
void WriteProtoList(FILE *File, uinT16 N, PARAM_DESC ParamDesc[], LIST ProtoList, BOOL8 WriteSigProtos, BOOL8 WriteInsigProtos)
Definition: clusttool.cpp:389
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:86
#define iterate(l)
Definition: oldlist.h:159
FLOATUNION Magnitude
Definition: cluster.h:82
uint16_t uinT16
Definition: host.h:37
FLOAT32 TotalMagnitude
Definition: cluster.h:79
FLOAT32 HalfRange
Definition: ocrfeatures.h:52