tesseract/a00029_source.html

 // File:        dotproductavx.cpp
 // Description: Architecture-specific dot-product function.
 // Author:      Ray Smith
 // Created:     Wed Jul 22 10:48:05 PDT 2015
 //
 // (C) Copyright 2015, Google Inc.
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.

 #if !defined(__AVX__)
 // Implementation for non-avx archs.

 #include "dotproductavx.h"
 #include <stdio.h>
 #include <stdlib.h>

 namespace tesseract {
 // Stub used when this file is compiled without AVX support (__AVX__ is not
 // defined). There is no AVX code path to run, so calling it is treated as a
 // fatal error: a diagnostic is written to stderr and the process is aborted.
 // The arguments are ignored and the function never returns.
 double DotProductAVX(const double* u, const double* v, int n) {
   // The guard for this stub is !defined(__AVX__), not an Android check, so
   // the message names the actual cause.
   fprintf(stderr, "DotProductAVX can't be used without AVX support\n");
   abort();
 }
 }  // namespace tesseract

 #else  // !defined(__AVX__)
 // Implementation for avx capable archs.
 #include <immintrin.h>
 #include <stdint.h>
 #include "dotproductavx.h"
 #include "host.h"

 namespace tesseract {

 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel AVX intrinsics to access the SIMD instruction set.
 // Computes and returns the dot product of the n-vectors u and v.
 // Uses Intel AVX intrinsics to access the SIMD instruction set.
 //   u, v: input vectors; n doubles are read from each.
 //   n:    number of elements. For n <= 0 nothing is read and 0 is returned.
 // Whole groups of 4 elements are multiplied and accumulated in 4 parallel
 // lanes; the remaining (fewer than 4) elements are added with scalar code.
 double DotProductAVX(const double* u, const double* v, int n) {
   // Largest offset at which a full group of 4 doubles can still be loaded.
   int max_offset = n - 4;
   int offset = 0;
   // Accumulate a set of 4 sums in sum, by loading pairs of 4 values from u and
   // v, and multiplying them together in parallel.
   __m256d sum = _mm256_setzero_pd();
   if (offset <= max_offset) {
     offset = 4;
     // Aligned load is reputedly faster but requires 32 byte aligned input.
     if ((reinterpret_cast<uintptr_t>(u) & 31) == 0 &&
         (reinterpret_cast<uintptr_t>(v) & 31) == 0) {
       // Use aligned load. The first group initializes sum directly instead
       // of being added to the zero vector.
       __m256d floats1 = _mm256_load_pd(u);
       __m256d floats2 = _mm256_load_pd(v);
       sum = _mm256_mul_pd(floats1, floats2);
       while (offset <= max_offset) {
         floats1 = _mm256_load_pd(u + offset);
         floats2 = _mm256_load_pd(v + offset);
         offset += 4;
         __m256d product = _mm256_mul_pd(floats1, floats2);
         sum = _mm256_add_pd(sum, product);
       }
     } else {
       // Use unaligned load. Same computation as the aligned branch.
       __m256d floats1 = _mm256_loadu_pd(u);
       __m256d floats2 = _mm256_loadu_pd(v);
       sum = _mm256_mul_pd(floats1, floats2);
       while (offset <= max_offset) {
         floats1 = _mm256_loadu_pd(u + offset);
         floats2 = _mm256_loadu_pd(v + offset);
         offset += 4;
         __m256d product = _mm256_mul_pd(floats1, floats2);
         sum = _mm256_add_pd(sum, product);
       }
     }
   }
   // Add the 4 product sums together horizontally. Not so easy as with sse, as
   // there is no add across the upper/lower 128 bit boundary, so permute to
   // move the upper 128 bits to lower in another register.
   __m256d sum2 = _mm256_permute2f128_pd(sum, sum, 1);
   sum = _mm256_hadd_pd(sum, sum2);
   sum = _mm256_hadd_pd(sum, sum);
   // Every lane now holds the total; read it from lane 0. The 256->128 bit
   // cast is a compile-time reinterpretation, and _mm_cvtsd_f64 returns the
   // low double directly, so no type-punning through an integer pointer (which
   // breaks strict aliasing) and no per-platform workaround is needed. This
   // code is only built when AVX is enabled, where compilers are expected to
   // use VEX encodings, so no SSE/AVX transition penalty should be incurred.
   double result = _mm_cvtsd_f64(_mm256_castpd256_pd128(sum));
   // Scalar tail for the elements that did not fill a group of 4.
   while (offset < n) {
     result += u[offset] * v[offset];
     ++offset;
   }
   return result;
 }

 }  // namespace tesseract.

 #endif  // !defined(__AVX__)
u
double u[max]
Definition: dotproduct-main.cpp:5

host.h

offset
voidpf uLong offset
Definition: ioapi.h:42

tesseract
Definition: baseapi.cpp:82

dotproductavx.h

tesseract::DotProductAVX
double DotProductAVX(const double *u, const double *v, int n)
Definition: dotproductavx.cpp:27

v
double v[max]
Definition: dotproduct-main.cpp:6