// Branch compiled when the compiler does not define __SSE4_1__.
// Only the two diagnostic writes to stderr are visible in this excerpt; the
// enclosing function definitions are not shown here.
// NOTE(review): the guard tests __SSE4_1__, yet both messages refer to
// Android -- confirm whether the wording is still intended.
19 #if !defined(__SSE4_1__) 28 fprintf(stderr,
"DotProductSSE can't be used on Android\n");
32 fprintf(stderr,
"IntDotProductSSE can't be used on Android\n");
// SSE4.1 branch: double-precision dot product, processing two doubles per
// 128-bit register. The excerpt omits several lines (the declaration and
// increments of `offset`, the `else`, closing braces, any tail handling), so
// the notes below describe only the statements that are visible.
37 #else // !defined(__SSE4_1__) 40 #include <emmintrin.h> 41 #include <smmintrin.h> 51 int max_offset = n - 2;
// Accumulator starts as two zero lanes.
55 __m128d sum = _mm_setzero_pd();
// Vector work is attempted only if `offset` has not passed n - 2.
56 if (offset <= max_offset) {
// Aligned path: taken when the low four address bits of both u and v are
// zero, i.e. both pointers are 16-byte aligned, so _mm_load_pd is used.
59 if ((reinterpret_cast<const uintptr_t>(u) & 15) == 0 &&
60 (reinterpret_cast<const uintptr_t>(v) & 15) == 0) {
// NOTE(review): the matching initial load of `sum` from u is not visible in
// this excerpt; presumably it mirrors the unaligned path below -- confirm.
63 __m128d floats2 = _mm_load_pd(v);
// First pair of products seeds the accumulator.
65 sum = _mm_mul_pd(sum, floats2);
66 while (offset <= max_offset) {
67 __m128d floats1 = _mm_load_pd(u + offset);
68 floats2 = _mm_load_pd(v + offset);
// Lane-wise multiply, then accumulate into sum.
70 floats1 = _mm_mul_pd(floats1, floats2);
71 sum = _mm_add_pd(sum, floats1);
// Unaligned path: same sequence using _mm_loadu_pd for the loads.
75 sum = _mm_loadu_pd(u);
76 __m128d floats2 = _mm_loadu_pd(v);
78 sum = _mm_mul_pd(sum, floats2);
79 while (offset <= max_offset) {
80 __m128d floats1 = _mm_loadu_pd(u + offset);
81 floats2 = _mm_loadu_pd(v + offset);
83 floats1 = _mm_mul_pd(floats1, floats2);
84 sum = _mm_add_pd(sum, floats1);
// Horizontal add combines the two lanes of the accumulator.
89 sum = _mm_hadd_pd(sum, sum);
// Take the low lane as the scalar result of the vectorised part.
91 double result = _mm_cvtsd_f64(sum);
// SSE4.1 integer dot product, consuming eight elements per iteration.
// The excerpt omits the declaration and increments of `offset`, closing
// braces and any tail handling; only the visible statements are described.
103 int max_offset = n - 8;
// Accumulator starts as all-zero bits.
107 __m128i sum = _mm_setzero_si128();
// Vector work is attempted only if `offset` has not passed n - 8.
108 if (offset <= max_offset) {
// Load the low 64 bits (eight bytes) from each input.
110 __m128i packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u));
111 __m128i packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v));
// Sign-extend the eight 8-bit values to 16-bit lanes.
112 sum = _mm_cvtepi8_epi16(packed1);
113 packed2 = _mm_cvtepi8_epi16(packed2);
// Multiply 16-bit lanes and add adjacent products into 32-bit lanes; this
// first result seeds the accumulator.
117 sum = _mm_madd_epi16(sum, packed2);
118 while (offset <= max_offset) {
119 packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u + offset));
120 packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v + offset));
// Same widen / multiply-add sequence, accumulated with 32-bit adds.
122 packed1 = _mm_cvtepi8_epi16(packed1);
123 packed2 = _mm_cvtepi8_epi16(packed2);
124 packed1 = _mm_madd_epi16(packed1, packed2);
125 sum = _mm_add_epi32(sum, packed1);
// Two horizontal adds fold the four 32-bit lanes into the low lane.
129 sum = _mm_hadd_epi32(sum, sum);
130 sum = _mm_hadd_epi32(sum, sum);
// Extract the low 32 bits as the scalar result of the vectorised part.
131 int32_t result = _mm_cvtsi128_si32(sum);
141 #endif // !defined(__SSE4_1__)
// Signatures of the two dot-product entry points; their bodies are not
// attached to these lines in this excerpt.
// NOTE(review): presumably u and v are the input vectors and n their
// element count -- confirm against the header.
// Double-precision variant: takes two const double arrays, returns double.
double DotProductSSE(const double *u, const double *v, int n)
// 8-bit integer variant: takes two const int8_t arrays, returns int32_t.
int32_t IntDotProductSSE(const int8_t *u, const int8_t *v, int n)