tesseract  4.00.00dev
/home/stefan/src/github/tesseract-ocr/tesseract/dotproduct-test.cpp
Go to the documentation of this file.
1 #include <stdint.h>
2 #include <immintrin.h>
3 
4 double DotProductAVX(const double* u, const double* v, int n) {
5  int max_offset = n - 3;
6  int offset = 0;
7  // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
8  // v, and multiplying them together in parallel.
9  __m256d sum = _mm256_setzero_pd();
10  if (max_offset > 0) {
11  // Aligned load is reputedly faster but requires 32 byte aligned input.
12  if ((reinterpret_cast<const uintptr_t>(u) & 31) == 0 &&
13  (reinterpret_cast<const uintptr_t>(v) & 31) == 0) {
14  do {
15  // Use aligned load.
16  __m256d floats1 = _mm256_load_pd(u + offset);
17  __m256d floats2 = _mm256_load_pd(v + offset);
18  offset += 4;
19  //~ __builtin_prefetch(u + offset);
20  //~ __builtin_prefetch(u + offset + 1);
21  //~ __builtin_prefetch(u + offset + 2);
22  //~ __builtin_prefetch(u + offset + 3);
23  //~ __builtin_prefetch(v + offset);
24  //~ __builtin_prefetch(v + offset + 1);
25  //~ __builtin_prefetch(v + offset + 2);
26  //~ __builtin_prefetch(v + offset + 3);
27  // Multiply.
28  __m256d product = _mm256_mul_pd(floats1, floats2);
29  sum = _mm256_add_pd(sum, product);
30  } while (offset < max_offset);
31  } else {
32  do {
33  // Use unaligned load.
34  __m256d floats1 = _mm256_loadu_pd(u + offset);
35  __m256d floats2 = _mm256_loadu_pd(v + offset);
36  offset += 4;
37  __builtin_prefetch(u + offset);
38  __builtin_prefetch(u + offset + 1);
39  __builtin_prefetch(u + offset + 2);
40  __builtin_prefetch(u + offset + 3);
41  __builtin_prefetch(v + offset);
42  __builtin_prefetch(v + offset + 1);
43  __builtin_prefetch(v + offset + 2);
44  __builtin_prefetch(v + offset + 3);
45  // Multiply.
46  __m256d product = _mm256_mul_pd(floats1, floats2);
47  sum = _mm256_add_pd(sum, product);
48  } while (offset <= max_offset);
49  }
50  }
51  // Add the 4 product sums together horizontally. Not so easy as with sse, as
52  // there is no add across the upper/lower 128 bit boundary, so permute to
53  // move the upper 128 bits to lower in another register.
54  __m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1);
55  sum = _mm256_hadd_pd(sum, sum2);
56  sum = _mm256_hadd_pd(sum, sum);
57  double result;
58  // _mm256_extract_f64 doesn't exist, but resist the temptation to use an sse
59  // instruction, as that introduces a 70 cycle delay. All this casting is to
60  // fool the intrinsics into thinking we are extracting the bottom int64.
61  __m256i cast_sum = _mm256_castpd_si256(sum);
62 #if defined(_WIN32) || defined(__i386__) || 1
63  // This is a very simple workaround that is activated
64  // for all platforms that do not have _mm256_extract_epi64.
65  // _mm256_extract_epi64(X, Y) == ((uint64_t*)&X)[Y]
66  *(reinterpret_cast<int64_t*>(&result)) = ((uint64_t*)&cast_sum)[0];
67 #else
68  *(reinterpret_cast<int64_t*>(&result)) = _mm256_extract_epi64(cast_sum, 0);
69 #endif
70  while (offset < n) {
71  //~ __builtin_prefetch(u + offset + 1);
72  //~ __builtin_prefetch(v + offset + 1);
73  result += u[offset] * v[offset];
74  ++offset;
75  }
76  return result;
77 }
78 
// Computes and returns the dot product of the n-element vectors u and v.
// When built with OpenMP, the loop is vectorized via `omp simd`; the
// aligned(u, v: 32) clause promises the compiler 32-byte alignment of both
// pointers, so callers must uphold that or the vectorized build has
// undefined behavior. Without OpenMP the same loop compiles as plain
// scalar code.
double DotProductSIMD(const double* u, const double* v, int n) {
  double total = 0.0;
#ifdef _OPENMP
  // BUG FIX: reduction(+:total) was missing — `total +=` is a loop-carried
  // dependence, so the simd construct was non-conforming without it. Also,
  // the #else branch was a hard #error, which made this file unbuildable
  // without OpenMP; the correct fallback is simply the un-pragma'd loop.
#pragma omp simd reduction(+:total) aligned(u, v: 32)
#endif
  for (int k = 0; k < n; ++k) total += u[k] * v[k];
  return total;
}
89 
// Plain scalar reference implementation of the dot product of the
// n-element vectors u and v. Returns 0.0 when n <= 0.
double DotProduct(const double* u, const double* v, int n) {
  double total = 0.0;
  for (int remaining = n; remaining > 0; --remaining) {
    total += *u * *v;
    ++u;
    ++v;
  }
  return total;
}
double u[max]
double DotProductSIMD(const double *u, const double *v, int n)
voidpf uLong offset
Definition: ioapi.h:42
double DotProduct(const double *u, const double *v, int n)
double DotProductAVX(const double *u, const double *v, int n)
double v[max]