tesseract  4.00.00dev
linlsq.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: linlsq.cpp (Formerly llsq.c)
3  * Description: Linear Least squares fitting code.
4  * Author: Ray Smith
5  * Created: Thu Sep 12 08:44:51 BST 1991
6  *
7  * (C) Copyright 1991, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <stdio.h>
21 #include <math.h>
22 #include "errcode.h"
23 #include "linlsq.h"
24 
25 const ERRCODE EMPTY_LLSQ = "Can't delete from an empty LLSQ";
26 
27 /**********************************************************************
28  * LLSQ::clear
29  *
30  * Function to initialize a LLSQ.
31  **********************************************************************/
32 
33 void LLSQ::clear() { // initialize
34  total_weight = 0.0; // no elements
35  sigx = 0.0; // update accumulators
36  sigy = 0.0;
37  sigxx = 0.0;
38  sigxy = 0.0;
39  sigyy = 0.0;
40 }
41 
42 
43 /**********************************************************************
44  * LLSQ::add
45  *
46  * Add an element to the accumulator.
47  **********************************************************************/
48 
49 void LLSQ::add(double x, double y) { // add an element
50  total_weight++; // count elements
51  sigx += x; // update accumulators
52  sigy += y;
53  sigxx += x * x;
54  sigxy += x * y;
55  sigyy += y * y;
56 }
57 // Adds an element with a specified weight.
58 void LLSQ::add(double x, double y, double weight) {
59  total_weight += weight;
60  sigx += x * weight; // update accumulators
61  sigy += y * weight;
62  sigxx += x * x * weight;
63  sigxy += x * y * weight;
64  sigyy += y * y * weight;
65 }
66 // Adds a whole LLSQ.
67 void LLSQ::add(const LLSQ& other) {
68  total_weight += other.total_weight;
69  sigx += other.sigx; // update accumulators
70  sigy += other.sigy;
71  sigxx += other.sigxx;
72  sigxy += other.sigxy;
73  sigyy += other.sigyy;
74 }
75 
76 
77 /**********************************************************************
78  * LLSQ::remove
79  *
80  * Delete an element from the acculuator.
81  **********************************************************************/
82 
83 void LLSQ::remove(double x, double y) { // delete an element
84  if (total_weight <= 0.0) // illegal
85  EMPTY_LLSQ.error("LLSQ::remove", ABORT, NULL);
86  total_weight--; // count elements
87  sigx -= x; // update accumulators
88  sigy -= y;
89  sigxx -= x * x;
90  sigxy -= x * y;
91  sigyy -= y * y;
92 }
93 
94 
95 /**********************************************************************
96  * LLSQ::m
97  *
98  * Return the gradient of the line fit.
99  **********************************************************************/
100 
101 double LLSQ::m() const { // get gradient
102  double covar = covariance();
103  double x_var = x_variance();
104  if (x_var != 0.0)
105  return covar / x_var;
106  else
107  return 0.0; // too little
108 }
109 
110 
111 /**********************************************************************
112  * LLSQ::c
113  *
114  * Return the constant of the line fit.
115  **********************************************************************/
116 
117 double LLSQ::c(double m) const { // get constant
118  if (total_weight > 0.0)
119  return (sigy - m * sigx) / total_weight;
120  else
121  return 0; // too little
122 }
123 
124 
125 /**********************************************************************
126  * LLSQ::rms
127  *
128  * Return the rms error of the fit.
129  **********************************************************************/
130 
131 double LLSQ::rms(double m, double c) const { // get error
132  double error; // total error
133 
134  if (total_weight > 0) {
135  error = sigyy + m * (m * sigxx + 2 * (c * sigx - sigxy)) + c *
136  (total_weight * c - 2 * sigy);
137  if (error >= 0)
138  error = sqrt(error / total_weight); // sqrt of mean
139  else
140  error = 0;
141  } else {
142  error = 0; // too little
143  }
144  return error;
145 }
146 
147 
148 /**********************************************************************
149  * LLSQ::pearson
150  *
151  * Return the pearson product moment correlation coefficient.
152  **********************************************************************/
153 
154 double LLSQ::pearson() const { // get correlation
155  double r = 0.0; // Correlation is 0 if insufficent data.
156 
157  double covar = covariance();
158  if (covar != 0.0) {
159  double var_product = x_variance() * y_variance();
160  if (var_product > 0.0)
161  r = covar / sqrt(var_product);
162  }
163  return r;
164 }
165 
166 // Returns the x,y means as an FCOORD.
168  if (total_weight > 0.0) {
169  return FCOORD(sigx / total_weight, sigy / total_weight);
170  } else {
171  return FCOORD(0.0f, 0.0f);
172  }
173 }
174 
175 // Returns the sqrt of the mean squared error measured perpendicular from the
176 // line through mean_point() in the direction dir.
177 //
178 // Derivation:
179 // Lemma: Let v and x_i (i=1..N) be a k-dimensional vectors (1xk matrices).
180 // Let % be dot product and ' be transpose. Note that:
181 // Sum[i=1..N] (v % x_i)^2
182 // = v * [x_1' x_2' ... x_N'] * [x_1' x_2' .. x_N']' * v'
183 // If x_i have average 0 we have:
184 // = v * (N * COVARIANCE_MATRIX(X)) * v'
185 // Expanded for the case that k = 2, where we treat the dimensions
186 // as x_i and y_i, this is:
187 // = v * (N * [VAR(X), COV(X,Y); COV(X,Y) VAR(Y)]) * v'
188 // Now, we are trying to calculate the mean squared error, where v is
189 // perpendicular to our line of interest:
190 // Mean squared error
191 // = E [ (v % (x_i - x_avg))) ^2 ]
192 // = Sum (v % (x_i - x_avg))^2 / N
193 // = v * N * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] / N * v'
194 // = v * [VAR(X) COV(X,Y); COV(X,Y) VAR(Y)] * v'
195 // = code below
196 double LLSQ::rms_orth(const FCOORD &dir) const {
197  FCOORD v = !dir;
198  v.normalise();
199  return sqrt(v.x() * v.x() * x_variance() +
200  2 * v.x() * v.y() * covariance() +
201  v.y() * v.y() * y_variance());
202 }
203 
204 // Returns the direction of the fitted line as a unit vector, using the
205 // least mean squared perpendicular distance. The line runs through the
206 // mean_point, i.e. a point p on the line is given by:
207 // p = mean_point() + lambda * vector_fit() for some real number lambda.
208 // Note that the result (0<=x<=1, -1<=y<=-1) is directionally ambiguous
209 // and may be negated without changing its meaning.
210 // Fitting a line m + ๐œ†v to a set of N points Pi = (xi, yi), where
211 // m is the mean point (๐, ๐‚) and
212 // v is the direction vector (cos๐œƒ, sin๐œƒ)
213 // The perpendicular distance of each Pi from the line is:
214 // (Pi - m) x v, where x is the scalar cross product.
215 // Total squared error is thus:
216 // E = โˆ‘((xi - ๐)sin๐œƒ - (yi - ๐‚)cos๐œƒ)ยฒ
217 // = โˆ‘(xi - ๐)ยฒsinยฒ๐œƒ - 2โˆ‘(xi - ๐)(yi - ๐‚)sin๐œƒ cos๐œƒ + โˆ‘(yi - ๐‚)ยฒcosยฒ๐œƒ
218 // = NVar(xi)sinยฒ๐œƒ - 2NCovar(xi, yi)sin๐œƒ cos๐œƒ + NVar(yi)cosยฒ๐œƒ (Eq 1)
219 // where Var(xi) is the variance of xi,
220 // and Covar(xi, yi) is the covariance of xi, yi.
221 // Taking the derivative wrt ๐œƒ and setting to 0 to obtain the min/max:
222 // 0 = 2NVar(xi)sin๐œƒ cos๐œƒ -2NCovar(xi, yi)(cosยฒ๐œƒ - sinยฒ๐œƒ) -2NVar(yi)sin๐œƒ cos๐œƒ
223 // => Covar(xi, yi)(cosยฒ๐œƒ - sinยฒ๐œƒ) = (Var(xi) - Var(yi))sin๐œƒ cos๐œƒ
224 // Using double angles:
225 // 2Covar(xi, yi)cos2๐œƒ = (Var(xi) - Var(yi))sin2๐œƒ (Eq 2)
226 // So ๐œƒ = 0.5 atan2(2Covar(xi, yi), Var(xi) - Var(yi)) (Eq 3)
227 
228 // Because it involves 2๐œƒ , Eq 2 has 2 solutions 90 degrees apart, but which
229 // is the min and which is the max? From Eq1:
230 // E/N = Var(xi)sinยฒ๐œƒ - 2Covar(xi, yi)sin๐œƒ cos๐œƒ + Var(yi)cosยฒ๐œƒ
231 // and 90 degrees away, using sin/cos equivalences:
232 // E'/N = Var(xi)cosยฒ๐œƒ + 2Covar(xi, yi)sin๐œƒ cos๐œƒ + Var(yi)sinยฒ๐œƒ
233 // The second error is smaller (making it the minimum) iff
234 // E'/N < E/N ie:
235 // (Var(xi) - Var(yi))(cosยฒ๐œƒ - sinยฒ๐œƒ) < -4Covar(xi, yi)sin๐œƒ cos๐œƒ
236 // Using double angles:
237 // (Var(xi) - Var(yi))cos2๐œƒ < -2Covar(xi, yi)sin2๐œƒ (InEq 1)
238 // But atan2(2Covar(xi, yi), Var(xi) - Var(yi)) picks 2๐œƒ such that:
239 // sgn(cos2๐œƒ) = sgn(Var(xi) - Var(yi)) and sgn(sin2๐œƒ) = sgn(Covar(xi, yi))
240 // so InEq1 can *never* be true, making the atan2 result *always* the min!
241 // In the degenerate case, where Covar(xi, yi) = 0 AND Var(xi) = Var(yi),
242 // the 2 solutions have equal error and the inequality is still false.
243 // Therefore the solution really is as trivial as Eq 3.
244 
245 // This is equivalent to returning the Principal Component in PCA, or the
246 // eigenvector corresponding to the largest eigenvalue in the covariance
247 // matrix. However, atan2 is much simpler! The one reference I found that
248 // uses this formula is http://web.mit.edu/18.06/www/Essays/tlsfit.pdf but
249 // that is still a much more complex derivation. It seems Pearson had already
250 // found this simple solution in 1901.
251 // http://books.google.com/books?id=WXwvAQAAIAAJ&pg=PA559
253  double x_var = x_variance();
254  double y_var = y_variance();
255  double covar = covariance();
256  double theta = 0.5 * atan2(2.0 * covar, x_var - y_var);
257  FCOORD result(cos(theta), sin(theta));
258  return result;
259 }
double x_variance() const
Definition: linlsq.h:79
Definition: points.h:189
double m() const
Definition: linlsq.cpp:101
FCOORD mean_point() const
Definition: linlsq.cpp:167
double covariance() const
Definition: linlsq.h:73
void add(double x, double y)
Definition: linlsq.cpp:49
FCOORD vector_fit() const
Definition: linlsq.cpp:252
bool normalise()
Convert to unit vec.
const ERRCODE EMPTY_LLSQ
Definition: linlsq.cpp:25
void clear()
Definition: linlsq.cpp:33
double rms(double m, double c) const
Definition: linlsq.cpp:131
void error(const char *caller, TessErrorLogCode action, const char *format,...) const
Definition: errcode.cpp:40
double c(double m) const
Definition: linlsq.cpp:117
Definition: linlsq.h:26
float y() const
Definition: points.h:212
double y_variance() const
Definition: linlsq.h:85
void remove(double x, double y)
Definition: linlsq.cpp:83
Definition: errcode.h:30
double rms_orth(const FCOORD &dir) const
Definition: linlsq.cpp:196
double v[max]
float x() const
Definition: points.h:209
double pearson() const
Definition: linlsq.cpp:154