// NOTE(review): this excerpt looks like a line-sampled extract of a larger
// file. The leading integers on each line appear to be line numbers from the
// original file, and the gaps between them suggest omitted lines (no
// declaration of `offset` or `result`, no loop-counter update, no closing
// braces and no `return` are visible here). Confirm against the full file
// before changing any code.

// Reports on stderr that DotProductAVX is unavailable in this build
// configuration.
28 fprintf(stderr,
"DotProductAVX can't be used on Android\n");
// NOTE(review): as rendered, everything after `//` on the next line (the
// <immintrin.h> include and the `max_offset = n - 4` initialisation) is part
// of a line comment; presumably these were separate lines in the original.
// `n - 4` would be the last start index from which four doubles can still be
// read, since each 256-bit load below consumes four doubles.
33 #else // !defined(__AVX__) 35 #include <immintrin.h> 45 int max_offset = n - 4;
// Accumulator holding four partial sums, initialised to zero.
49 __m256d sum = _mm256_setzero_pd();
// Take the vector path only when at least one full group of four elements
// fits. `offset` is declared outside the visible lines.
50 if (offset <= max_offset) {
// Both input pointers have their low five address bits clear, i.e. they are
// 32-byte aligned, which is what the aligned _mm256_load_pd loads require.
53 if ((reinterpret_cast<const uintptr_t>(u) & 31) == 0 &&
54 (reinterpret_cast<const uintptr_t>(v) & 31) == 0) {
// Aligned variant: the first group of four products seeds the accumulator.
56 __m256d floats1 = _mm256_load_pd(u);
57 __m256d floats2 = _mm256_load_pd(v);
59 sum = _mm256_mul_pd(floats1, floats2);
// Multiply each further group of four element pairs and add it to `sum`.
// NOTE(review): no update of `offset` is visible inside this loop; presumably
// it sits on an omitted line (original 63) -- confirm.
60 while (offset <= max_offset) {
61 floats1 = _mm256_load_pd(u + offset);
62 floats2 = _mm256_load_pd(v + offset);
64 __m256d product = _mm256_mul_pd(floats1, floats2);
65 sum = _mm256_add_pd(sum, product);
// Unaligned variant: the same computation using _mm256_loadu_pd, which has no
// alignment requirement. Presumably the `else` branch of the alignment test;
// the `else` itself is not visible here.
69 __m256d floats1 = _mm256_loadu_pd(u);
70 __m256d floats2 = _mm256_loadu_pd(v);
72 sum = _mm256_mul_pd(floats1, floats2);
// NOTE(review): as above, the `offset` update is not visible in this loop.
73 while (offset <= max_offset) {
74 floats1 = _mm256_loadu_pd(u + offset);
75 floats2 = _mm256_loadu_pd(v + offset);
77 __m256d product = _mm256_mul_pd(floats1, floats2);
78 sum = _mm256_add_pd(sum, product);
// Horizontal reduction of the four partial sums. hadd only adds within each
// 128-bit half, so `sum2` is built as `sum` with its halves swapped (control
// value 1). After the two hadd steps, element 0 holds the total of all four.
85 __m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1);
86 sum = _mm256_hadd_pd(sum, sum2);
87 sum = _mm256_hadd_pd(sum, sum);
// Copy the bit pattern of the lowest 64-bit element into `result` by viewing
// the vector as integers and writing through an int64_t pointer. `result` is
// declared outside the visible lines. When _WIN32 or __i386__ is defined the
// low element is read through a pointer cast; otherwise (presumably under an
// `#else` that is not visible) _mm256_extract_epi64 is used.
92 auto cast_sum = _mm256_castpd_si256(sum);
93 *(
reinterpret_cast<int64_t*
>(&result)) =
94 #if defined(_WIN32) || defined(__i386__) 98 ((uint64_t*)&cast_sum)[0]
100 _mm256_extract_epi64(cast_sum, 0)
112 #endif // ANDROID_BUILD
// Signature: takes two read-only double arrays `u` and `v` plus an element
// count `n`, and returns a double.
// NOTE(review): no body follows this line in the excerpt; it presumably
// belongs above the statements shown earlier -- confirm against the full file.
double DotProductAVX(const double *u, const double *v, int n)