// File:        dotproductsse.cpp
// Description: Architecture-specific dot-product function.
// Author:      Ray Smith
// Created:     Wed Jul 22 10:57:45 PDT 2015
//
// (C) Copyright 2015, Google Inc.
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
// http://www.apache.org/licenses/LICENSE-2.0
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#if !defined(__SSE4_1__)
// Without "-msse4.1" the SSE4.1 intrinsics below can't be compiled, so use
// dummy stubs.

#include "dotproductsse.h"
#include <stdio.h>
#include <stdlib.h>

namespace tesseract {
double DotProductSSE(const double* u, const double* v, int n) {
  fprintf(stderr, "DotProductSSE can't be used (SSE4.1 unavailable)\n");
  abort();
}
int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n) {
  fprintf(stderr, "IntDotProductSSE can't be used (SSE4.1 unavailable)\n");
  abort();
}
}  // namespace tesseract

#else  // !defined(__SSE4_1__)
// SSE4.1 is available: compile the real implementations.

#include <emmintrin.h>
#include <smmintrin.h>
#include <stdint.h>
#include "dotproductsse.h"
#include "host.h"

namespace tesseract {

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel SSE intrinsics to access the SIMD instruction set.
double DotProductSSE(const double* u, const double* v, int n) {
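  // Process the inputs 2 doubles at a time; max_offset is the last offset at
  // which a full pair can still be loaded.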
  int max_offset = n - 2;
  int offset = 0;
  // Accumulate a set of 2 sums in sum, by loading pairs of 2 values from u and
  // v, and multiplying them together in parallel.
  __m128d sum = _mm_setzero_pd();
  if (offset <= max_offset) {
    offset = 2;
    // Aligned load is reputedly faster but requires 16 byte aligned input.
    if ((reinterpret_cast<uintptr_t>(u) & 15) == 0 &&
        (reinterpret_cast<uintptr_t>(v) & 15) == 0) {
      // Use aligned load.
      sum = _mm_load_pd(u);
      __m128d floats2 = _mm_load_pd(v);
      // Multiply.
      sum = _mm_mul_pd(sum, floats2);
      while (offset <= max_offset) {
        __m128d floats1 = _mm_load_pd(u + offset);
        floats2 = _mm_load_pd(v + offset);
        offset += 2;
        floats1 = _mm_mul_pd(floats1, floats2);
        sum = _mm_add_pd(sum, floats1);
      }
    } else {
      // Use unaligned load.
      sum = _mm_loadu_pd(u);
      __m128d floats2 = _mm_loadu_pd(v);
      // Multiply.
      sum = _mm_mul_pd(sum, floats2);
      while (offset <= max_offset) {
        __m128d floats1 = _mm_loadu_pd(u + offset);
        floats2 = _mm_loadu_pd(v + offset);
        offset += 2;
        floats1 = _mm_mul_pd(floats1, floats2);
        sum = _mm_add_pd(sum, floats1);
      }
    }
  }
  // Add the 2 sums in sum horizontally.
  sum = _mm_hadd_pd(sum, sum);
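  // Both lanes of sum now hold the same total.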
  // Extract the low result.
  double result = _mm_cvtsd_f64(sum);
  // Add on any left-over products.
  while (offset < n) {
    result += u[offset] * v[offset];
    ++offset;
  }
  return result;
}

// Computes and returns the dot product of the n-vectors u and v.
// Uses Intel SSE intrinsics to access the SIMD instruction set.
int32_t IntDotProductSSE(const int8_t* u, const int8_t* v, int n) {
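  // Process the inputs 8 int8_t values at a time; max_offset is the last
  // offset at which a full group of 8 remains.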
  int max_offset = n - 8;
  int offset = 0;
  // Accumulate a set of 4 32-bit sums in sum, by loading 8 pairs of 8-bit
  // values, extending to 16 bit, multiplying to make 32 bit results.
  __m128i sum = _mm_setzero_si128();
  if (offset <= max_offset) {
    offset = 8;
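    // _mm_loadl_epi64 loads just 64 bits (8 int8_t values) into the low half
    // of the register, zeroing the upper half.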
    __m128i packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u));
    __m128i packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v));
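    // _mm_cvtepi8_epi16 sign-extends the 8 low int8 lanes to 8 int16 lanes.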
    sum = _mm_cvtepi8_epi16(packed1);
    packed2 = _mm_cvtepi8_epi16(packed2);
    // The magic _mm_madd_epi16 is perfect here. It multiplies 8 pairs of 16 bit
    // ints to make 32 bit results, which are then horizontally added in pairs
    // to make 4 32 bit results that still fit in a 128 bit register.
    sum = _mm_madd_epi16(sum, packed2);
    while (offset <= max_offset) {
      packed1 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(u + offset));
      packed2 = _mm_loadl_epi64(reinterpret_cast<const __m128i*>(v + offset));
      offset += 8;
      packed1 = _mm_cvtepi8_epi16(packed1);
      packed2 = _mm_cvtepi8_epi16(packed2);
      packed1 = _mm_madd_epi16(packed1, packed2);
      sum = _mm_add_epi32(sum, packed1);
    }
  }
  // Sum the 4 packed 32 bit sums and extract the low result.
  sum = _mm_hadd_epi32(sum, sum);
  sum = _mm_hadd_epi32(sum, sum);
  int32_t result = _mm_cvtsi128_si32(sum);
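  // Add on any left-over products; the int8 operands promote to int, so the
  // multiply cannot overflow.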
  while (offset < n) {
    result += u[offset] * v[offset];
    ++offset;
  }
  return result;
}

}  // namespace tesseract.

#endif  // !defined(__SSE4_1__)
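
// A minimal usage sketch (illustrative only, not part of the original file;
// the names main, a, b, p, q below are invented for the example). Either
// entry point takes two raw arrays and a length; lengths that are not a
// multiple of the vector width fall through to the scalar tail loops above.
// Guarded with #if 0 so it does not affect the build.
#if 0
#include <cstdint>
#include <cstdio>

int main() {
  // 5 doubles: the SIMD loop consumes 4, the tail loop adds the 5th.
  double a[5] = {1.0, 2.0, 3.0, 4.0, 5.0};
  double b[5] = {5.0, 4.0, 3.0, 2.0, 1.0};
  // 1*5 + 2*4 + 3*3 + 4*2 + 5*1 = 35.
  printf("%g\n", tesseract::DotProductSSE(a, b, 5));

  // 9 int8 values: one group of 8 through the SIMD loop, 1 left over.
  int8_t p[9] = {1, 2, 3, 4, 5, 6, 7, 8, 9};
  int8_t q[9] = {9, 8, 7, 6, 5, 4, 3, 2, 1};
  // Expected result: 165.
  printf("%d\n", tesseract::IntDotProductSSE(p, q, 9));
  return 0;
}
#endif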