tesseract  4.00.00dev
dotproductavx.cpp
Go to the documentation of this file.
1 // File: dotproductavx.cpp
3 // Description: Architecture-specific dot-product function.
4 // Author: Ray Smith
5 // Created: Wed Jul 22 10:48:05 PDT 2015
6 //
7 // (C) Copyright 2015, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
18 
19 #if !defined(__AVX__)
20 // Implementation for non-avx archs.
21 
22 #include "dotproductavx.h"
23 #include <stdio.h>
24 #include <stdlib.h>
25 
26 namespace tesseract {
// Stub used when this file is compiled without AVX support (__AVX__ is not
// defined). The arguments are ignored: the function reports the misuse on
// stderr and terminates the process with abort(), so it never returns a value.
double DotProductAVX(const double* u, const double* v, int n) {
  // This branch is selected by the missing __AVX__ macro, not by the target
  // OS, so the diagnostic names the real cause.
  fprintf(stderr, "DotProductAVX can't be used without AVX support\n");
  abort();
}
31 } // namespace tesseract
32 
33 #else // !defined(__AVX__)
34 // Implementation for avx capable archs.
35 #include <immintrin.h>
36 #include <stdint.h>
37 #include "dotproductavx.h"
38 #include "host.h"
39 
40 namespace tesseract {
41 
42 // Computes and returns the dot product of the n-vectors u and v.
43 // Uses Intel AVX intrinsics to access the SIMD instruction set.
44 double DotProductAVX(const double* u, const double* v, int n) {
45  int max_offset = n - 4;
46  int offset = 0;
47  // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
48  // v, and multiplying them together in parallel.
49  __m256d sum = _mm256_setzero_pd();
50  if (offset <= max_offset) {
51  offset = 4;
52  // Aligned load is reputedly faster but requires 32 byte aligned input.
53  if ((reinterpret_cast<const uintptr_t>(u) & 31) == 0 &&
54  (reinterpret_cast<const uintptr_t>(v) & 31) == 0) {
55  // Use aligned load.
56  __m256d floats1 = _mm256_load_pd(u);
57  __m256d floats2 = _mm256_load_pd(v);
58  // Multiply.
59  sum = _mm256_mul_pd(floats1, floats2);
60  while (offset <= max_offset) {
61  floats1 = _mm256_load_pd(u + offset);
62  floats2 = _mm256_load_pd(v + offset);
63  offset += 4;
64  __m256d product = _mm256_mul_pd(floats1, floats2);
65  sum = _mm256_add_pd(sum, product);
66  }
67  } else {
68  // Use unaligned load.
69  __m256d floats1 = _mm256_loadu_pd(u);
70  __m256d floats2 = _mm256_loadu_pd(v);
71  // Multiply.
72  sum = _mm256_mul_pd(floats1, floats2);
73  while (offset <= max_offset) {
74  floats1 = _mm256_loadu_pd(u + offset);
75  floats2 = _mm256_loadu_pd(v + offset);
76  offset += 4;
77  __m256d product = _mm256_mul_pd(floats1, floats2);
78  sum = _mm256_add_pd(sum, product);
79  }
80  }
81  }
82  // Add the 4 product sums together horizontally. Not so easy as with sse, as
83  // there is no add across the upper/lower 128 bit boundary, so permute to
84  // move the upper 128 bits to lower in another register.
85  __m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1);
86  sum = _mm256_hadd_pd(sum, sum2);
87  sum = _mm256_hadd_pd(sum, sum);
88  double result;
89  // _mm256_extract_f64 doesn't exist, but resist the temptation to use an sse
90  // instruction, as that introduces a 70 cycle delay. All this casting is to
91  // fool the intrinsics into thinking we are extracting the bottom int64.
92  auto cast_sum = _mm256_castpd_si256(sum);
93  *(reinterpret_cast<int64_t*>(&result)) =
94 #if defined(_WIN32) || defined(__i386__)
95  // This is a very simple workaround that is activated
96  // for all platforms that do not have _mm256_extract_epi64.
97  // _mm256_extract_epi64(X, Y) == ((uint64_t*)&X)[Y]
98  ((uint64_t*)&cast_sum)[0]
99 #else
100  _mm256_extract_epi64(cast_sum, 0)
101 #endif
102  ;
103  while (offset < n) {
104  result += u[offset] * v[offset];
105  ++offset;
106  }
107  return result;
108 }
109 
110 } // namespace tesseract.
111 
112 #endif // !defined(__AVX__)
double u[max]
voidpf uLong offset
Definition: ioapi.h:42
double DotProductAVX(const double *u, const double *v, int n)
double v[max]