#include <stdint.h>
#include <immintrin.h>

Functions
double	DotProductAVX (const double *u, const double *v, int n)

double	DotProductSIMD (const double *u, const double *v, int n)

double	DotProduct (const double *u, const double *v, int n)

Function Documentation

◆ DotProduct()

// Reference scalar dot product.
// u, v: input arrays; elements [0, n) of each are read.
// n:    number of elements to combine; n <= 0 yields 0.0.
// Returns the sum of u[k] * v[k], accumulated in increasing order of k.
// (Doxygen: definition at line 90 of file dotproduct-test.cpp.)
double DotProduct(const double* u, const double* v, int n) {
  double accumulator = 0.0;
  int index = 0;
  while (index < n) {
    accumulator += u[index] * v[index];
    ++index;
  }
  return accumulator;
}

◆ DotProductAVX()

// Computes the dot product of the first n elements of u and v with AVX,
// processing 4 doubles per step and finishing any remainder in scalar code.
// u, v: input arrays of at least n doubles each. No alignment is required;
//       aligned loads are used only when both pointers are 32-byte aligned.
// n:    number of elements; n <= 0 yields 0.0.
// Returns the sum of u[k] * v[k] for k in [0, n).
// The function executes AVX instructions, so the caller must make sure the
// CPU supports AVX before calling it.
// (Doxygen: definition at line 4 of file dotproduct-test.cpp.)
#if defined(__GNUC__) || defined(__clang__)
// Lets the function be compiled even when the translation unit itself is not
// built with -mavx.
__attribute__((target("avx")))
#endif
double DotProductAVX(const double* u, const double* v, int n) {
  int offset = 0;
  // Accumulate a set of 4 sums in sum, by loading groups of 4 values from u
  // and v, and multiplying them together in parallel.
  __m256d sum = _mm256_setzero_pd();
  // A vector step is only taken while at least 4 whole elements remain, so a
  // load never touches memory past element n - 1. The test is spelled
  // "n - offset >= 4" identically in both loops so that the aligned and the
  // unaligned path always stop at the same place.
  if (n >= 4) {
    // Aligned load is reputedly faster but requires 32 byte aligned input.
    const bool both_aligned =
        ((reinterpret_cast<uintptr_t>(u) | reinterpret_cast<uintptr_t>(v)) &
         31) == 0;
    // No software prefetch is issued: the only candidate addresses are the
    // ones the very next iteration loads anyway.
    if (both_aligned) {
      do {
        const __m256d floats1 = _mm256_load_pd(u + offset);
        const __m256d floats2 = _mm256_load_pd(v + offset);
        offset += 4;
        sum = _mm256_add_pd(sum, _mm256_mul_pd(floats1, floats2));
      } while (n - offset >= 4);
    } else {
      do {
        const __m256d floats1 = _mm256_loadu_pd(u + offset);
        const __m256d floats2 = _mm256_loadu_pd(v + offset);
        offset += 4;
        sum = _mm256_add_pd(sum, _mm256_mul_pd(floats1, floats2));
      } while (n - offset >= 4);
    }
  }
  // Add the 4 product sums together horizontally. Not so easy as with sse, as
  // there is no add across the upper/lower 128 bit boundary, so permute to
  // move the upper 128 bits to lower in another register.
  const __m256d swapped = _mm256_permute2f128_pd(sum, sum, 1);
  sum = _mm256_hadd_pd(sum, swapped);
  sum = _mm256_hadd_pd(sum, sum);
  // Every lane now holds the full total; read lane 0. The 256->128 cast is a
  // pure reinterpretation, and reading the low double this way avoids
  // type-punning the result through an integer pointer.
  double result = _mm_cvtsd_f64(_mm256_castpd256_pd128(sum));
  // Scalar tail for the 0-3 elements that did not fill a vector.
  for (; offset < n; ++offset) {
    result += u[offset] * v[offset];
  }
  return result;
}

◆ DotProductSIMD()

// Computes the dot product of the first n elements of u and v with a plain
// loop that is annotated for OpenMP SIMD vectorization when OpenMP is enabled.
// u, v: input arrays of at least n doubles each; no alignment is assumed.
// n:    number of elements; n <= 0 yields 0.0.
// Returns the sum of u[k] * v[k] for k in [0, n). With the pragma active the
// partial sums may be combined in a different order than a sequential loop.
// (Doxygen: definition at line 79 of file dotproduct-test.cpp.)
double DotProductSIMD(const double* u, const double* v, int n) {
  double total = 0.0;
  // total is carried from one iteration to the next, so it has to be declared
  // as a reduction for the simd construct to be valid. Without OpenMP the
  // loop simply runs as ordinary scalar code.
#ifdef _OPENMP
#pragma omp simd reduction(+ : total)
#endif
  for (int k = 0; k < n; ++k) {
    total += u[k] * v[k];
  }
  return total;
}

Functions

Function Documentation

◆ DotProduct()

◆ DotProductAVX()

◆ DotProductSIMD()