tesseract  4.00.00dev
weightmatrix.cpp
// File:        weightmatrix.cpp
// Description: Hides distinction between float/int implementations.
// Author:      Ray Smith
// Created:     Tue Jun 17 11:46:20 PST 2014
//
// (C) Copyright 2014, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "weightmatrix.h"

#include "dotproductavx.h"
#include "dotproductsse.h"
#include "simddetect.h"
#include "statistc.h"
#include "tprintf.h"

namespace tesseract {

// Copies the whole input transposed, converted to double, into *this.
void TransposedArray::Transpose(const GENERIC_2D_ARRAY<double>& input) {
  int width = input.dim1();
  int num_features = input.dim2();
  ResizeNoInit(num_features, width);
  for (int t = 0; t < width; ++t) WriteStrided(t, input[t]);
}
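
// Shape note (illustrative sizes): if `input` is 3 timesteps (dim1) by 4
// features (dim2), then after Transpose() *this is 4 x 3, with column t
// holding a copy of input[t], i.e. row f holds feature f across all timesteps.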

// Sets up the network for training. Initializes weights using weights of
// scale `range` picked according to the random number generator `randomizer`.
int WeightMatrix::InitWeightsFloat(int no, int ni, bool ada_grad,
                                   float weight_range, TRand* randomizer) {
  int_mode_ = false;
  wf_.Resize(no, ni, 0.0);
  if (randomizer != NULL) {
    for (int i = 0; i < no; ++i) {
      for (int j = 0; j < ni; ++j) {
        wf_[i][j] = randomizer->SignedRand(weight_range);
      }
    }
  }
  use_ada_grad_ = ada_grad;
  InitBackward();
  return ni * no;
}
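
// A minimal usage sketch (hypothetical sizes; the extra +1 input slot is the
// bias weight consumed by MatrixDotVector):
//   WeightMatrix w;
//   TRand randomizer;
//   int num_weights = w.InitWeightsFloat(/*no=*/16, /*ni=*/8 + 1,
//                                        /*ada_grad=*/false,
//                                        /*weight_range=*/0.1f, &randomizer);
//   // num_weights == 16 * 9 == 144; each weight is roughly uniform in
//   // [-0.1, 0.1]. Passing NULL as the randomizer leaves all weights at zero.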

// Converts a float network to an int network. Each set of input weights that
// corresponds to a single output weight is converted independently:
// Compute the max absolute value of the weight set.
// Scale so the max absolute value becomes MAX_INT8.
// Round to integer.
// Store a multiplicative scale factor (as a double) that will reproduce
// the original value, subject to rounding errors.
void WeightMatrix::ConvertToInt() {
  wi_.ResizeNoInit(wf_.dim1(), wf_.dim2());
  scales_.init_to_size(wi_.dim1(), 0.0);
  int dim2 = wi_.dim2();
  for (int t = 0; t < wi_.dim1(); ++t) {
    double* f_line = wf_[t];
    inT8* i_line = wi_[t];
    double max_abs = 0.0;
    for (int f = 0; f < dim2; ++f) {
      double abs_val = fabs(f_line[f]);
      if (abs_val > max_abs) max_abs = abs_val;
    }
    double scale = max_abs / MAX_INT8;
    scales_[t] = scale;
    if (scale == 0.0) scale = 1.0;
    for (int f = 0; f < dim2; ++f) {
      i_line[f] = IntCastRounded(f_line[f] / scale);
    }
  }
  wf_.Resize(1, 1, 0.0);
  int_mode_ = true;
}
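
// Worked example (illustrative numbers): if the largest |weight| in a row is
// 0.5, then scale = 0.5 / 127 ~= 0.003937. A weight of 0.25 is stored as
// IntCastRounded(0.25 / 0.003937) == 64, and multiplying back by the scale
// recovers 64 * 0.003937 ~= 0.252, i.e. the original value to within
// quantization error. The largest weight itself maps exactly to 127 (MAX_INT8).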

// Allocates any needed memory for running Backward, and zeroes the deltas,
// thus eliminating any existing momentum.
void WeightMatrix::InitBackward() {
  int no = int_mode_ ? wi_.dim1() : wf_.dim1();
  int ni = int_mode_ ? wi_.dim2() : wf_.dim2();
  dw_.Resize(no, ni, 0.0);
  updates_.Resize(no, ni, 0.0);
  wf_t_.Transpose(wf_);
  if (use_ada_grad_) dw_sq_sum_.Resize(no, ni, 0.0);
}

// Flag on mode to indicate that this weightmatrix uses inT8.
const int kInt8Flag = 1;
// Flag on mode to indicate that this weightmatrix uses ada grad.
const int kAdaGradFlag = 4;
// Flag on mode to indicate that this weightmatrix uses double. Set
// independently of kInt8Flag as even in int mode the scales can
// be float or double.
const int kDoubleFlag = 128;
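
// Decoding example: a matrix written by Serialize below always has kDoubleFlag
// set, so an int-mode, non-adagrad matrix carries mode == 1 + 128 == 129, and
// a float-mode adagrad matrix carries mode == 4 + 128 == 132. Files written by
// older code lack the 128 bit, which is how DeSerialize spots the old format.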

// Writes to the given file. Returns false in case of error.
bool WeightMatrix::Serialize(bool training, TFile* fp) const {
  // For backward compatibility, add kDoubleFlag to mode to indicate the
  // doubles format, without errs, so we can detect and read old format
  // weight matrices.
  uinT8 mode = (int_mode_ ? kInt8Flag : 0) |
               (use_ada_grad_ ? kAdaGradFlag : 0) | kDoubleFlag;
  if (fp->FWrite(&mode, sizeof(mode), 1) != 1) return false;
  if (int_mode_) {
    if (!wi_.Serialize(fp)) return false;
    if (!scales_.Serialize(fp)) return false;
  } else {
    if (!wf_.Serialize(fp)) return false;
    if (training && !updates_.Serialize(fp)) return false;
    if (training && use_ada_grad_ && !dw_sq_sum_.Serialize(fp)) return false;
  }
  return true;
}

// Reads from the given file. Returns false in case of error.
bool WeightMatrix::DeSerialize(bool training, TFile* fp) {
  uinT8 mode = 0;
  if (fp->FRead(&mode, sizeof(mode), 1) != 1) return false;
  int_mode_ = (mode & kInt8Flag) != 0;
  use_ada_grad_ = (mode & kAdaGradFlag) != 0;
  if ((mode & kDoubleFlag) == 0) return DeSerializeOld(training, fp);
  if (int_mode_) {
    if (!wi_.DeSerialize(fp)) return false;
    if (!scales_.DeSerialize(fp)) return false;
  } else {
    if (!wf_.DeSerialize(fp)) return false;
    if (training) {
      InitBackward();
      if (!updates_.DeSerialize(fp)) return false;
      if (use_ada_grad_ && !dw_sq_sum_.DeSerialize(fp)) return false;
    }
  }
  return true;
}

// As DeSerialize, but reads an old (float) format WeightMatrix for
// backward compatibility.
bool WeightMatrix::DeSerializeOld(bool training, TFile* fp) {
  GENERIC_2D_ARRAY<float> float_array;
  if (int_mode_) {
    if (!wi_.DeSerialize(fp)) return false;
    GenericVector<float> old_scales;
    if (!old_scales.DeSerialize(fp)) return false;
    scales_.resize_no_init(old_scales.size());
    for (int i = 0; i < old_scales.size(); ++i) scales_[i] = old_scales[i];
  } else {
    if (!float_array.DeSerialize(fp)) return false;
    FloatToDouble(float_array, &wf_);
  }
  if (training) {
    InitBackward();
    if (!float_array.DeSerialize(fp)) return false;
    FloatToDouble(float_array, &updates_);
    // Errs was only used in int training, which is now dead.
    if (!float_array.DeSerialize(fp)) return false;
  }
  return true;
}

// Computes matrix.vector v = Wu.
// u is of size W.dim2() - 1 and the output v is of size W.dim1().
// u is imagined to have an extra element at the end with value 1, to
// implement the bias, but it doesn't actually have it.
// Asserts that the call matches what we have.
void WeightMatrix::MatrixDotVector(const double* u, double* v) const {
  ASSERT_HOST(!int_mode_);
  MatrixDotVectorInternal(wf_, true, false, u, v);
}

void WeightMatrix::MatrixDotVector(const inT8* u, double* v) const {
  ASSERT_HOST(int_mode_);
  int num_out = wi_.dim1();
  int num_in = wi_.dim2() - 1;
  for (int i = 0; i < num_out; ++i) {
    const inT8* Wi = wi_[i];
    int total = 0;
    if (SIMDDetect::IsSSEAvailable()) {
      total = IntDotProductSSE(u, Wi, num_in);
    } else {
      for (int j = 0; j < num_in; ++j) total += Wi[j] * u[j];
    }
    // Add in the bias and correct for integer values.
    v[i] = (static_cast<double>(total) / MAX_INT8 + Wi[num_in]) * scales_[i];
  }
}
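
// Worked example for one output row (illustrative numbers): with num_in == 2,
// Wi == {64, -32}, bias Wi[2] == 10, u == {127, 127} and scales_[i] == s, the
// integer total is 64 * 127 - 32 * 127 == 4064, so
// v[i] == (4064 / 127.0 + 10) * s == (32 + 10) * s == 42 * s.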

// MatrixDotVector for peep weights, MultiplyAccumulate adds the
// component-wise products of *this[0] and v to inout.
void WeightMatrix::MultiplyAccumulate(const double* v, double* inout) {
  ASSERT_HOST(!int_mode_);
  ASSERT_HOST(wf_.dim1() == 1);
  int n = wf_.dim2();
  const double* u = wf_[0];
  for (int i = 0; i < n; ++i) {
    inout[i] += u[i] * v[i];
  }
}

// Computes vector.matrix v = uW.
// u is of size W.dim1() and the output v is of size W.dim2() - 1.
// The last result is discarded, as v is assumed to have an imaginary
// last value of 1, as with MatrixDotVector.
void WeightMatrix::VectorDotMatrix(const double* u, double* v) const {
  ASSERT_HOST(!int_mode_);
  MatrixDotVectorInternal(wf_t_, false, true, u, v);
}

// Fills dw_[i][j] with the dot product u[i][] . v[j][], using elements from
// u and v. In terms of the neural network, u is the gradients and v is the
// inputs.
// Note that (matching MatrixDotVector) v[last][] is missing, presumed 1.0.
// Runs parallel if requested. Note that u and v must be transposed.
void WeightMatrix::SumOuterTransposed(const TransposedArray& u,
                                      const TransposedArray& v,
                                      bool in_parallel) {
  ASSERT_HOST(!int_mode_);
  int num_outputs = dw_.dim1();
  ASSERT_HOST(u.dim1() == num_outputs);
  ASSERT_HOST(u.dim2() == v.dim2());
  int num_inputs = dw_.dim2() - 1;
  int num_samples = u.dim2();
  // v is missing the last element in dim1.
  ASSERT_HOST(v.dim1() == num_inputs);
#ifdef _OPENMP
#pragma omp parallel for num_threads(4) if (in_parallel)
#endif
  for (int i = 0; i < num_outputs; ++i) {
    double* dwi = dw_[i];
    const double* ui = u[i];
    for (int j = 0; j < num_inputs; ++j) {
      dwi[j] = DotProduct(ui, v[j], num_samples);
    }
    // The last element of v is missing, presumed 1.0f.
    double total = 0.0;
    for (int k = 0; k < num_samples; ++k) total += ui[k];
    dwi[num_inputs] = total;
  }
}
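
// In equation form, with t indexing the num_samples columns of the transposed
// arrays: dw_[i][j] = sum_t u[i][t] * v[j][t] for j < num_inputs, and the bias
// column dw_[i][num_inputs] = sum_t u[i][t], since the missing last input is
// treated as a constant 1.0.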

// Updates the weights using the given learning rate and momentum.
// num_samples is the quotient to be used in the adagrad computation iff
// use_ada_grad_ is true.
void WeightMatrix::Update(double learning_rate, double momentum,
                          int num_samples) {
  ASSERT_HOST(!int_mode_);
  if (use_ada_grad_ && num_samples > 0) {
    dw_sq_sum_.SumSquares(dw_);
    dw_.AdaGradScaling(dw_sq_sum_, num_samples);
  }
  dw_ *= learning_rate;
  updates_ += dw_;
  if (momentum > 0.0) wf_ += updates_;
  if (momentum >= 0.0) updates_ *= momentum;
  wf_t_.Transpose(wf_);
}
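
// Reading off the arithmetic above for momentum > 0: if delta_t is the amount
// added to wf_ at step t, then delta_t == momentum * delta_(t-1) +
// learning_rate * dw_t (with dw_t already AdaGrad-scaled when use_ada_grad_
// is set), i.e. the classic momentum update. When momentum is not positive,
// wf_ itself is left unchanged by this call.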

// Adds the dw_ in other to the dw_ in *this.
void WeightMatrix::AddDeltas(const WeightMatrix& other) {
  ASSERT_HOST(dw_.dim1() == other.dw_.dim1());
  ASSERT_HOST(dw_.dim2() == other.dw_.dim2());
  dw_ += other.dw_;
}

// Sums the products of weight updates in *this and other, splitting into
// positive (same direction) in *same and negative (different direction) in
// *changed.
void WeightMatrix::CountAlternators(const WeightMatrix& other, double* same,
                                    double* changed) const {
  int num_outputs = updates_.dim1();
  int num_inputs = updates_.dim2();
  ASSERT_HOST(num_outputs == other.updates_.dim1());
  ASSERT_HOST(num_inputs == other.updates_.dim2());
  for (int i = 0; i < num_outputs; ++i) {
    const double* this_i = updates_[i];
    const double* other_i = other.updates_[i];
    for (int j = 0; j < num_inputs; ++j) {
      double product = this_i[j] * other_i[j];
      if (product < 0.0)
        *changed -= product;
      else
        *same += product;
    }
  }
}
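
// Example: if this matrix last moved a weight by +0.2 and `other` moved the
// same weight by +0.3, the product 0.06 is added to *same; had the signs
// differed, 0.06 would be added to *changed instead. Both outputs are only
// accumulated into, so callers presumably zero them first.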

// Helper computes an integer histogram bucket for a weight and adds it
// to the histogram.
const int kHistogramBuckets = 16;
static void HistogramWeight(double weight, STATS* histogram) {
  int bucket = kHistogramBuckets - 1;
  if (weight != 0.0) {
    double logval = -log2(fabs(weight));
    bucket = ClipToRange(IntCastRounded(logval), 0, kHistogramBuckets - 1);
  }
  histogram->add(bucket, 1);
}
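
// Bucket examples: |weight| == 0.25 gives -log2(0.25) == 2, hence bucket 2;
// |weight| == 1.0 gives bucket 0; magnitudes above sqrt(2) would round to a
// negative bucket and are clipped to 0; magnitudes below roughly 2^-14.5
// (about 4e-5), and exact zeros, land in the last bucket (15).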

void WeightMatrix::Debug2D(const char* msg) {
  STATS histogram(0, kHistogramBuckets);
  if (int_mode_) {
    for (int i = 0; i < wi_.dim1(); ++i) {
      for (int j = 0; j < wi_.dim2(); ++j) {
        HistogramWeight(wi_[i][j] * scales_[i], &histogram);
      }
    }
  } else {
    for (int i = 0; i < wf_.dim1(); ++i) {
      for (int j = 0; j < wf_.dim2(); ++j) {
        HistogramWeight(wf_[i][j], &histogram);
      }
    }
  }
  tprintf("%s\n", msg);
  histogram.print();
}

// Computes and returns the dot product of the two n-vectors u and v.
/* static */
double WeightMatrix::DotProduct(const double* u, const double* v, int n) {
  // Note: because the order of addition is different among the 3 DotProduct
  // functions, the results can (and do) vary slightly (although they agree
  // to within about 4e-15). This produces different results when running
  // training, despite all random inputs being precisely equal.
  // To get consistent results, use just one of these DotProduct functions.
  // On a test multi-layer network, serial is 57% slower than sse, and avx
  // is about 8% faster than sse. This suggests that the time is memory
  // bandwidth constrained and could benefit from holding the reused vector
  // in AVX registers.
  if (SIMDDetect::IsAVXAvailable()) return DotProductAVX(u, v, n);
  if (SIMDDetect::IsSSEAvailable()) return DotProductSSE(u, v, n);
  double total = 0.0;
  for (int k = 0; k < n; ++k) total += u[k] * v[k];
  return total;
}
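
// Tiny sanity example: the dot product of {1, 2, 3} and {4, 5, 6} is 32 on
// every code path; the ~4e-15 discrepancies mentioned above only show up with
// long vectors, where the AVX/SSE/serial paths associate the additions
// differently.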

// Utility function converts an array of float to the corresponding array
// of double.
/* static */
void WeightMatrix::FloatToDouble(const GENERIC_2D_ARRAY<float>& wf,
                                 GENERIC_2D_ARRAY<double>* wd) {
  int dim1 = wf.dim1();
  int dim2 = wf.dim2();
  wd->ResizeNoInit(dim1, dim2);
  for (int i = 0; i < dim1; ++i) {
    const float* wfi = wf[i];
    double* wdi = (*wd)[i];
    for (int j = 0; j < dim2; ++j) wdi[j] = static_cast<double>(wfi[j]);
  }
}

// Computes matrix.vector v = Wu.
// u is of size W.dim2() - add_bias_fwd and the output v is of size
// W.dim1() - skip_bias_back.
// If add_bias_fwd, u is imagined to have an extra element at the end with
// value 1, to implement the bias weight.
// If skip_bias_back, we are actually performing the backwards product on a
// transposed matrix, so we need to drop the v output corresponding to the last
// element in dim1.
void WeightMatrix::MatrixDotVectorInternal(const GENERIC_2D_ARRAY<double>& w,
                                           bool add_bias_fwd,
                                           bool skip_bias_back, const double* u,
                                           double* v) {
  int num_results = w.dim1() - skip_bias_back;
  int extent = w.dim2() - add_bias_fwd;
  for (int i = 0; i < num_results; ++i) {
    const double* wi = w[i];
    double total = DotProduct(wi, u, extent);
    if (add_bias_fwd) total += wi[extent];  // The bias value.
    v[i] = total;
  }
}
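
// Dimension summary: the forward call (add_bias_fwd == true, via wf_) uses a
// num_outputs x (num_inputs + 1) matrix, takes num_inputs inputs and writes
// num_outputs results, adding wi[num_inputs] as the bias; the backward call
// (skip_bias_back == true, via wf_t_) uses the (num_inputs + 1) x num_outputs
// transpose, takes num_outputs inputs and writes num_inputs results, silently
// dropping the row that corresponds to the bias.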

}  // namespace tesseract.