tesseract  4.00.00dev
/home/stefan/src/github/tesseract-ocr/tesseract/dotproduct-test.cpp
Go to the documentation of this file.
1 #include <stdint.h>
2 #include <immintrin.h>
3 
4 double DotProductAVX(const double* u, const double* v, int n) {
5  int max_offset = n - 3;
6  int offset = 0;
7  // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
8  // v, and multiplying them together in parallel.
9  __m256d sum = _mm256_setzero_pd();
10  if (max_offset > 0) {
11  // Aligned load is reputedly faster but requires 32 byte aligned input.
12  if ((reinterpret_cast<const uintptr_t>(u) & 31) == 0 &&
13  (reinterpret_cast<const uintptr_t>(v) & 31) == 0) {
14  do {
15  // Use aligned load.
16  __m256d floats1 = _mm256_load_pd(u + offset);
17  __m256d floats2 = _mm256_load_pd(v + offset);
18  offset += 4;
19  //~ __builtin_prefetch(u + offset);
20  //~ __builtin_prefetch(u + offset + 1);
21  //~ __builtin_prefetch(u + offset + 2);
22  //~ __builtin_prefetch(u + offset + 3);
23  //~ __builtin_prefetch(v + offset);
24  //~ __builtin_prefetch(v + offset + 1);
25  //~ __builtin_prefetch(v + offset + 2);
26  //~ __builtin_prefetch(v + offset + 3);
27  // Multiply.
28  __m256d product = _mm256_mul_pd(floats1, floats2);
29  sum = _mm256_add_pd(sum, product);
30  } while (offset < max_offset);
31  } else {
32  do {
33  // Use unaligned load.
34  __m256d floats1 = _mm256_loadu_pd(u + offset);
35  __m256d floats2 = _mm256_loadu_pd(v + offset);
36  offset += 4;
37  __builtin_prefetch(u + offset);
38  __builtin_prefetch(u + offset + 1);
39  __builtin_prefetch(u + offset + 2);
40  __builtin_prefetch(u + offset + 3);
41  __builtin_prefetch(v + offset);
42  __builtin_prefetch(v + offset + 1);
43  __builtin_prefetch(v + offset + 2);
44  __builtin_prefetch(v + offset + 3);
45  // Multiply.
46  __m256d product = _mm256_mul_pd(floats1, floats2);
47  sum = _mm256_add_pd(sum, product);
48  } while (offset <= max_offset);
49  }
50  }
51  // Add the 4 product sums together horizontally. Not so easy as with sse, as
52  // there is no add across the upper/lower 128 bit boundary, so permute to
53  // move the upper 128 bits to lower in another register.
54  __m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1);
55  sum = _mm256_hadd_pd(sum, sum2);
56  sum = _mm256_hadd_pd(sum, sum);
57  double result;
58  // _mm256_extract_f64 doesn't exist, but resist the temptation to use an sse
59  // instruction, as that introduces a 70 cycle delay. All this casting is to
60  // fool the intrinsics into thinking we are extracting the bottom int64.
61  __m256i cast_sum = _mm256_castpd_si256(sum);
62 #if defined(_WIN32) || defined(__i386__) || 1
63  // This is a very simple workaround that is activated
64  // for all platforms that do not have _mm256_extract_epi64.
65  // _mm256_extract_epi64(X, Y) == ((uint64_t*)&X)[Y]
66  *(reinterpret_cast<int64_t*>(&result)) = ((uint64_t*)&cast_sum)[0];
67 #else
68  *(reinterpret_cast<int64_t*>(&result)) = _mm256_extract_epi64(cast_sum, 0);
69 #endif
70  while (offset < n) {
71  //~ __builtin_prefetch(u + offset + 1);
72  //~ __builtin_prefetch(v + offset + 1);
73  result += u[offset] * v[offset];
74  ++offset;
75  }
76  return result;
77 }
78 
// Computes and returns the dot product of the n-element vectors u and v.
// When built with OpenMP, the loop is vectorized via `omp simd`; the
// aligned(u, v: 32) clause promises the compiler 32-byte alignment of both
// pointers, so callers must uphold that or the vectorized build has
// undefined behavior. Without OpenMP the same loop compiles as plain
// scalar code.
double DotProductSIMD(const double* u, const double* v, int n) {
  double total = 0.0;
#ifdef _OPENMP
  // BUG FIX: reduction(+:total) was missing — `total +=` is a loop-carried
  // dependence, so the simd construct was non-conforming without it. Also,
  // the #else branch was a hard #error, which made this file unbuildable
  // without OpenMP; the correct fallback is simply the un-pragma'd loop.
#pragma omp simd reduction(+:total) aligned(u, v: 32)
#endif
  for (int k = 0; k < n; ++k) total += u[k] * v[k];
  return total;
}
89 
// Plain scalar reference implementation of the dot product of the
// n-element vectors u and v. Returns 0.0 when n <= 0.
double DotProduct(const double* u, const double* v, int n) {
  double total = 0.0;
  for (int remaining = n; remaining > 0; --remaining) {
    total += *u * *v;
    ++u;
    ++v;
  }
  return total;
}
double u[max]
double DotProductSIMD(const double *u, const double *v, int n)
voidpf uLong offset
Definition: ioapi.h:42
double DotProduct(const double *u, const double *v, int n)
double DotProductAVX(const double *u, const double *v, int n)
double v[max]