tesseract  4.00.00dev
linlsq.h
Go to the documentation of this file.
1 /**********************************************************************
2  * File: linlsq.h (Formerly llsq.h)
3  * Description: Linear Least squares fitting code.
4  * Author: Ray Smith
5  * Created: Thu Sep 12 08:44:51 BST 1991
6  *
7  * (C) Copyright 1991, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #ifndef TESSERACT_CCSTRUCT_LINLSQ_H_
21 #define TESSERACT_CCSTRUCT_LINLSQ_H_
22 
23 #include "points.h"
24 #include "params.h"
25 
26 class LLSQ {
27  public:
28  LLSQ() { // constructor
29  clear(); // set to zeros
30  }
31  void clear(); // initialize
32 
33  // Adds an element with a weight of 1.
34  void add(double x, double y);
35  // Adds an element with a specified weight.
36  void add(double x, double y, double weight);
37  // Adds a whole LLSQ.
38  void add(const LLSQ& other);
39  // Deletes an element with a weight of 1.
40  void remove(double x, double y);
41  inT32 count() const { // no of elements
42  return static_cast<int>(total_weight + 0.5);
43  }
44 
45  double m() const; // get gradient
46  double c(double m) const; // get constant
47  double rms(double m, double c) const; // get error
48  double pearson() const; // get correlation coefficient.
49 
50  // Returns the x,y means as an FCOORD.
51  FCOORD mean_point() const;
52 
53  // Returns the average sum of squared perpendicular error from a line
54  // through mean_point() in the direction dir.
55  double rms_orth(const FCOORD &dir) const;
56 
57  // Returns the direction of the fitted line as a unit vector, using the
58  // least mean squared perpendicular distance. The line runs through the
59  // mean_point, i.e. a point p on the line is given by:
60  // p = mean_point() + lambda * vector_fit() for some real number lambda.
61  // Note that the result (0<=x<=1, -1<=y<=-1) is directionally ambiguous
62  // and may be negated without changing its meaning, since a line is only
63  // unique to a range of pi radians.
64  // Modernists prefer to think of this as an Eigenvalue problem, but
65  // Pearson had the simple solution in 1901.
66  //
67  // Note that this is equivalent to returning the Principal Component in PCA,
68  // or the eigenvector corresponding to the largest eigenvalue in the
69  // covariance matrix.
70  FCOORD vector_fit() const;
71 
72  // Returns the covariance.
73  double covariance() const {
74  if (total_weight > 0.0)
75  return (sigxy - sigx * sigy / total_weight) / total_weight;
76  else
77  return 0.0;
78  }
79  double x_variance() const {
80  if (total_weight > 0.0)
81  return (sigxx - sigx * sigx / total_weight) / total_weight;
82  else
83  return 0.0;
84  }
85  double y_variance() const {
86  if (total_weight > 0.0)
87  return (sigyy - sigy * sigy / total_weight) / total_weight;
88  else
89  return 0.0;
90  }
91 
92  private:
93  double total_weight; // no of elements or sum of weights.
94  double sigx; // sum of x
95  double sigy; // sum of y
96  double sigxx; // sum x squared
97  double sigxy; // sum of xy
98  double sigyy; // sum y squared
99 };
100 
101 
102 // Returns the median value of the vector, given that the values are
103 // circular, with the given modulus. Values may be signed or unsigned,
104 // eg range from -pi to pi (modulus 2pi) or from 0 to 2pi (modulus 2pi).
105 // NOTE that the array is shuffled, but the time taken is linear.
106 // An assumption is made that most of the values are spread over no more than
107 // half the range, but wrap-around is accounted for if the median is near
108 // the wrap-around point.
109 // Cannot be a member of GenericVector, as it makes heavy used of LLSQ.
110 // T must be an integer or float/double type.
111 template<typename T> T MedianOfCircularValues(T modulus, GenericVector<T>* v) {
112  LLSQ stats;
113  T halfrange = static_cast<T>(modulus / 2);
114  int num_elements = v->size();
115  for (int i = 0; i < num_elements; ++i) {
116  stats.add((*v)[i], (*v)[i] + halfrange);
117  }
118  bool offset_needed = stats.y_variance() < stats.x_variance();
119  if (offset_needed) {
120  for (int i = 0; i < num_elements; ++i) {
121  (*v)[i] += halfrange;
122  }
123  }
124  int median_index = v->choose_nth_item(num_elements / 2);
125  if (offset_needed) {
126  for (int i = 0; i < num_elements; ++i) {
127  (*v)[i] -= halfrange;
128  }
129  }
130  return (*v)[median_index];
131 }
132 
133 
134 #endif // TESSERACT_CCSTRUCT_LINLSQ_H_
double x_variance() const
Definition: linlsq.h:79
Definition: points.h:189
int32_t inT32
Definition: host.h:38
double m() const
Definition: linlsq.cpp:101
FCOORD mean_point() const
Definition: linlsq.cpp:167
inT32 count() const
Definition: linlsq.h:41
double covariance() const
Definition: linlsq.h:73
void add(double x, double y)
Definition: linlsq.cpp:49
FCOORD vector_fit() const
Definition: linlsq.cpp:252
int size() const
Definition: genericvector.h:72
void clear()
Definition: linlsq.cpp:33
LLSQ()
Definition: linlsq.h:28
double rms(double m, double c) const
Definition: linlsq.cpp:131
double c(double m) const
Definition: linlsq.cpp:117
Definition: linlsq.h:26
double y_variance() const
Definition: linlsq.h:85
double rms_orth(const FCOORD &dir) const
Definition: linlsq.cpp:196
int choose_nth_item(int target_index)
T MedianOfCircularValues(T modulus, GenericVector< T > *v)
Definition: linlsq.h:111
double v[max]
double pearson() const
Definition: linlsq.cpp:154