tesseract  4.00.00dev
statistc.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: statistc.h (Formerly stats.h)
3  * Description: Class description for STATS class.
4  * Author: Ray Smith
5  * Created: Mon Feb 04 16:19:07 GMT 1991
6  *
7  * (C) Copyright 1991, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifndef TESSERACT_CCSTRUCT_STATISTC_H_
21 #define TESSERACT_CCSTRUCT_STATISTC_H_
22 
23 #include <stdio.h>
24 #include "host.h"
25 #include "kdpair.h"
26 #include "scrollview.h"
27 
28 template <typename T> class GenericVector;
29 
30 
31 // Simple histogram-based statistics for integer values in a known
32 // range, such that the range is small compared to the number of samples.
33 class STATS {
34  public:
35  // The histogram buckets are in the range
36  // [min_bucket_value, max_bucket_value_plus_1 - 1] i.e.
37  // [min_bucket_value, max_bucket_value].
38  // Any data under min_bucket value is silently mapped to min_bucket_value,
39  // and likewise, any data over max_bucket_value is silently mapped to
40  // max_bucket_value.
41  // In the internal array, min_bucket_value maps to 0 and
42  // max_bucket_value_plus_1 - min_bucket_value to the array size.
43  // TODO(rays) This is ugly. Convert the second argument to
44  // max_bucket_value and all the code that uses it.
45  STATS(inT32 min_bucket_value, inT32 max_bucket_value_plus_1);
46  STATS(); // empty for arrays
47 
48  ~STATS();
49 
50  // (Re)Sets the range and clears the counts.
51  // See the constructor for info on max and min values.
52  bool set_range(inT32 min_bucket_value, inT32 max_bucket_value_plus_1);
53 
54  void clear(); // empty buckets
55 
56  void add(inT32 value, inT32 count);
57 
58  // "Accessors" return various statistics on the data.
59  inT32 mode() const; // get mode of samples
60  double mean() const; // get mean of samples
61  double sd() const; // standard deviation
62  // Returns the fractile value such that frac fraction (in [0,1]) of samples
63  // has a value less than the return value.
64  double ile(double frac) const;
65  // Returns the minimum used entry in the histogram (ie the minimum of the
66  // data, NOT the minimum of the supplied range, nor is it an index.)
67  // Would normally be called min(), but that is a reserved word in VC++.
68  inT32 min_bucket() const; // Find min
69  // Returns the maximum used entry in the histogram (ie the maximum of the
70  // data, NOT the maximum of the supplied range, nor is it an index.)
71  inT32 max_bucket() const; // Find max
72  // Finds a more useful estimate of median than ile(0.5).
73  // Overcomes a problem with ile() - if the samples are, for example,
74  // 6,6,13,14 ile(0.5) return 7.0 - when a more useful value would be midway
75  // between 6 and 13 = 9.5
76  double median() const; // get median of samples
77  // Returns the count of the given value.
78  inT32 pile_count(inT32 value ) const {
79  if (value <= rangemin_)
80  return buckets_[0];
81  if (value >= rangemax_ - 1)
82  return buckets_[rangemax_ - rangemin_ - 1];
83  return buckets_[value - rangemin_];
84  }
85  // Returns the total count of all buckets.
86  inT32 get_total() const {
87  return total_count_; // total of all piles
88  }
89  // Returns true if x is a local min.
90  bool local_min(inT32 x) const;
91 
92  // Apply a triangular smoothing filter to the stats.
93  // This makes the modes a bit more useful.
94  // The factor gives the height of the triangle, i.e. the weight of the
95  // centre.
96  void smooth(inT32 factor);
97 
98  // Cluster the samples into max_cluster clusters.
99  // Each call runs one iteration. The array of clusters must be
100  // max_clusters+1 in size as cluster 0 is used to indicate which samples
101  // have been used.
102  // The return value is the current number of clusters.
103  inT32 cluster(float lower, // thresholds
104  float upper,
105  float multiple, // distance threshold
106  inT32 max_clusters, // max no to make
107  STATS *clusters); // array of clusters
108 
109 // Finds (at most) the top max_modes modes, well actually the whole peak around
110 // each mode, returning them in the given modes vector as a <mean of peak,
111 // total count of peak> pair in order of decreasing total count.
112 // Since the mean is the key and the count the data in the pair, a single call
113 // to sort on the output will re-sort by increasing mean of peak if that is
114 // more useful than decreasing total count.
115 // Returns the actual number of modes found.
116  int top_n_modes(
117  int max_modes,
119 
120  // Prints a summary and table of the histogram.
121  void print() const;
122  // Prints summary stats only of the histogram.
123  void print_summary() const;
124 
125  #ifndef GRAPHICS_DISABLED
126  // Draws the histogram as a series of rectangles.
127  void plot(ScrollView* window, // window to draw in
128  float xorigin, // origin of histo
129  float yorigin, // gram
130  float xscale, // size of one unit
131  float yscale, // size of one uint
132  ScrollView::Color colour) const; // colour to draw in
133 
134  // Draws a line graph of the histogram.
135  void plotline(ScrollView* window, // window to draw in
136  float xorigin, // origin of histo
137  float yorigin, // gram
138  float xscale, // size of one unit
139  float yscale, // size of one uint
140  ScrollView::Color colour) const; // colour to draw in
141  #endif // GRAPHICS_DISABLED
142 
143  private:
144  inT32 rangemin_; // min of range
145  // rangemax_ is not well named as it is really one past the max.
146  inT32 rangemax_; // max of range
147  inT32 total_count_; // no of samples
148  inT32* buckets_; // array of cells
149 };
150 
151 // Selects the index-th item of the array in sorted order without fully
152 // sorting it, in linear time. The array does get shuffled!
153 // NOTE(review): returns inT32 for a float array - presumably an index; confirm.
154 inT32 choose_nth_item(inT32 index, // index to choose
155  float *array, // array of items
156  inT32 count); // no of items
157 // Generic version: items are size bytes each, ordered by compar (qsort semantics).
158 inT32 choose_nth_item(inT32 index, // index to choose
159  void *array, // array of items
160  inT32 count, // no of items
161  size_t size, // element size
162  int (*compar)(const void*, const void*)); // comparator
163 // Swaps the entries at index1 and index2 (each size bytes) of array in-place.
164 void swap_entries(void *array, // array of entries
165  size_t size, // size of entry
166  inT32 index1, // entries to swap
167  inT32 index2);
168 
169 #endif // TESSERACT_CCSTRUCT_STATISTC_H_
inT32 get_total() const
Definition: statistc.h:86
int32_t inT32
Definition: host.h:38
voidpf void uLong size
Definition: ioapi.h:39
void plot(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:585
inT32 pile_count(inT32 value) const
Definition: statistc.h:78
double median() const
Definition: statistc.cpp:239
void print_summary() const
Definition: statistc.cpp:560
inT32 min_bucket() const
Definition: statistc.cpp:206
void smooth(inT32 factor)
Definition: statistc.cpp:289
int top_n_modes(int max_modes, GenericVector< tesseract::KDPairInc< float, int > > *modes) const
Definition: statistc.cpp:469
void plotline(ScrollView *window, float xorigin, float yorigin, float xscale, float yscale, ScrollView::Color colour) const
Definition: statistc.cpp:612
void add(inT32 value, inT32 count)
Definition: statistc.cpp:101
STATS()
Definition: statistc.cpp:51
inT32 cluster(float lower, float upper, float multiple, inT32 max_clusters, STATS *clusters)
Definition: statistc.cpp:320
inT32 mode() const
Definition: statistc.cpp:115
void swap_entries(void *array, size_t size, inT32 index1, inT32 index2)
Definition: statistc.cpp:766
STATS
Definition: statistc.h:33
double ile(double frac) const
Definition: statistc.cpp:174
void print() const
Definition: statistc.cpp:534
double mean() const
Definition: statistc.cpp:135
bool local_min(inT32 x) const
Definition: statistc.cpp:262
inT32 choose_nth_item(inT32 index, float *array, inT32 count)
Definition: statistc.cpp:638
~STATS()
Definition: statistc.cpp:92
int count(LIST var_list)
Definition: oldlist.cpp:103
bool set_range(inT32 min_bucket_value, inT32 max_bucket_value_plus_1)
Definition: statistc.cpp:62
double sd() const
Definition: statistc.cpp:151
inT32 max_bucket() const
Definition: statistc.cpp:221
void clear()
Definition: statistc.cpp:81