5 int max_offset = n - 3;
9 __m256d sum = _mm256_setzero_pd();
12 if ((reinterpret_cast<const uintptr_t>(u) & 31) == 0 &&
13 (reinterpret_cast<const uintptr_t>(v) & 31) == 0) {
16 __m256d floats1 = _mm256_load_pd(u + offset);
17 __m256d floats2 = _mm256_load_pd(v + offset);
28 __m256d product = _mm256_mul_pd(floats1, floats2);
29 sum = _mm256_add_pd(sum, product);
30 }
while (offset < max_offset);
34 __m256d floats1 = _mm256_loadu_pd(u + offset);
35 __m256d floats2 = _mm256_loadu_pd(v + offset);
37 __builtin_prefetch(u + offset);
38 __builtin_prefetch(u + offset + 1);
39 __builtin_prefetch(u + offset + 2);
40 __builtin_prefetch(u + offset + 3);
41 __builtin_prefetch(v + offset);
42 __builtin_prefetch(v + offset + 1);
43 __builtin_prefetch(v + offset + 2);
44 __builtin_prefetch(v + offset + 3);
46 __m256d product = _mm256_mul_pd(floats1, floats2);
47 sum = _mm256_add_pd(sum, product);
48 }
while (offset <= max_offset);
54 __m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1);
55 sum = _mm256_hadd_pd(sum, sum2);
56 sum = _mm256_hadd_pd(sum, sum);
61 __m256i cast_sum = _mm256_castpd_si256(sum);
62 #if defined(_WIN32) || defined(__i386__) || 1 66 *(
reinterpret_cast<int64_t*
>(&result)) = ((uint64_t*)&cast_sum)[0];
68 *(
reinterpret_cast<int64_t*
>(&result)) = _mm256_extract_epi64(cast_sum, 0);
82 #pragma omp simd aligned(u, v: 32) 86 for (
int k = 0; k < n; ++k) total += u[k] * v[k];
92 for (
int k = 0; k < n; ++k) total += u[k] * v[k];
double DotProductSIMD(const double *u, const double *v, int n)
double DotProduct(const double *u, const double *v, int n)
double DotProductAVX(const double *u, const double *v, int n)