tesseract  4.00.00dev
openclwrapper.cpp
Go to the documentation of this file.
1 // Licensed under the Apache License, Version 2.0 (the "License");
2 // you may not use this file except in compliance with the License.
3 // You may obtain a copy of the License at
4 // http://www.apache.org/licenses/LICENSE-2.0
5 // Unless required by applicable law or agreed to in writing, software
6 // distributed under the License is distributed on an "AS IS" BASIS,
7 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
8 // See the License for the specific language governing permissions and
9 // limitations under the License.
10 #ifdef _WIN32
11 #include <io.h>
12 #else
13 #include <sys/types.h>
14 #include <unistd.h>
15 #endif
16 #include <float.h>
17 
18 #include "openclwrapper.h"
19 #include "oclkernels.h"
20 
21 // for micro-benchmark
22 #include "otsuthr.h"
23 #include "thresholder.h"
24 
25 #if ON_APPLE
26 #include <mach/mach_time.h>
27 #include <stdio.h>
28 #endif
29 
30 #define CALLOC LEPT_CALLOC
31 #define FREE LEPT_FREE
32 
33 #ifdef USE_OPENCL
34 
36 GPUEnv OpenclDevice::gpuEnv;
37 
38 bool OpenclDevice::deviceIsSelected = false;
39 ds_device OpenclDevice::selectedDevice;
40 
41 int OpenclDevice::isInited = 0;
42 
43 static l_int32 MORPH_BC = ASYMMETRIC_MORPH_BC;
44 
45 static const l_uint32 lmask32[] = {
46  0x80000000, 0xc0000000, 0xe0000000, 0xf0000000, 0xf8000000, 0xfc000000,
47  0xfe000000, 0xff000000, 0xff800000, 0xffc00000, 0xffe00000, 0xfff00000,
48  0xfff80000, 0xfffc0000, 0xfffe0000, 0xffff0000, 0xffff8000, 0xffffc000,
49  0xffffe000, 0xfffff000, 0xfffff800, 0xfffffc00, 0xfffffe00, 0xffffff00,
50  0xffffff80, 0xffffffc0, 0xffffffe0, 0xfffffff0, 0xfffffff8, 0xfffffffc,
51  0xfffffffe, 0xffffffff};
52 
53 static const l_uint32 rmask32[] = {
54  0x00000001, 0x00000003, 0x00000007, 0x0000000f, 0x0000001f, 0x0000003f,
55  0x0000007f, 0x000000ff, 0x000001ff, 0x000003ff, 0x000007ff, 0x00000fff,
56  0x00001fff, 0x00003fff, 0x00007fff, 0x0000ffff, 0x0001ffff, 0x0003ffff,
57  0x0007ffff, 0x000fffff, 0x001fffff, 0x003fffff, 0x007fffff, 0x00ffffff,
58  0x01ffffff, 0x03ffffff, 0x07ffffff, 0x0fffffff, 0x1fffffff, 0x3fffffff,
59  0x7fffffff, 0xffffffff};
60 
61 struct tiff_transform {
62  int vflip; /* if non-zero, image needs a vertical fip */
63  int hflip; /* if non-zero, image needs a horizontal flip */
64  int rotate; /* -1 -> counterclockwise 90-degree rotation,
65  0 -> no rotation
66  1 -> clockwise 90-degree rotation */
67 };
68 
69 static struct tiff_transform tiff_orientation_transforms[] = {
70  {0, 0, 0},
71  {0, 1, 0},
72  {1, 1, 0},
73  {1, 0, 0},
74  {0, 1, -1},
75  {0, 0, 1},
76  {0, 1, 1},
77  {0, 0, -1}
78 };
79 
80 static const l_int32 MAX_PAGES_IN_TIFF_FILE = 3000;
81 
82 static cl_mem pixsCLBuffer, pixdCLBuffer, pixdCLIntermediate; //Morph operations buffers
83 static cl_mem pixThBuffer; //output from thresholdtopix calculation
84 static cl_int clStatus;
85 static KernelEnv rEnv;
86 
87 #define DS_TAG_VERSION "<version>"
88 #define DS_TAG_VERSION_END "</version>"
89 #define DS_TAG_DEVICE "<device>"
90 #define DS_TAG_DEVICE_END "</device>"
91 #define DS_TAG_SCORE "<score>"
92 #define DS_TAG_SCORE_END "</score>"
93 #define DS_TAG_DEVICE_TYPE "<type>"
94 #define DS_TAG_DEVICE_TYPE_END "</type>"
95 #define DS_TAG_DEVICE_NAME "<name>"
96 #define DS_TAG_DEVICE_NAME_END "</name>"
97 #define DS_TAG_DEVICE_DRIVER_VERSION "<driver>"
98 #define DS_TAG_DEVICE_DRIVER_VERSION_END "</driver>"
99 
100 #define DS_DEVICE_NATIVE_CPU_STRING "native_cpu"
101 
102 #define DS_DEVICE_NAME_LENGTH 256
103 
104 typedef enum { DS_EVALUATE_ALL, DS_EVALUATE_NEW_ONLY } ds_evaluation_type;
105 
106 typedef struct {
107  unsigned int numDevices;
108  ds_device *devices;
109  const char *version;
110 } ds_profile;
111 
112 typedef enum {
113  DS_SUCCESS = 0,
114  DS_INVALID_PROFILE = 1000,
115  DS_MEMORY_ERROR,
116  DS_INVALID_PERF_EVALUATOR_TYPE,
117  DS_INVALID_PERF_EVALUATOR,
118  DS_PERF_EVALUATOR_ERROR,
119  DS_FILE_ERROR,
120  DS_UNKNOWN_DEVICE_TYPE,
121  DS_PROFILE_FILE_ERROR,
122  DS_SCORE_SERIALIZER_ERROR,
123  DS_SCORE_DESERIALIZER_ERROR
124 } ds_status;
125 
126 // Pointer to a function that calculates the score of a device (ex:
127 // device->score) update the data size of score. The encoding and the format
128 // of the score data is implementation defined. The function should return
129 // DS_SUCCESS if there's no error to be reported.
130 typedef ds_status (*ds_perf_evaluator)(ds_device *device, void *data);
131 
132 // deallocate memory used by score
133 typedef ds_status (*ds_score_release)(void *score);
134 static ds_status releaseDSProfile(ds_profile *profile, ds_score_release sr) {
135  ds_status status = DS_SUCCESS;
136  if (profile != nullptr) {
137  if (profile->devices != nullptr && sr != nullptr) {
138  unsigned int i;
139  for (i = 0; i < profile->numDevices; i++) {
140  free(profile->devices[i].oclDeviceName);
141  free(profile->devices[i].oclDriverVersion);
142  status = sr(profile->devices[i].score);
143  if (status != DS_SUCCESS) break;
144  }
145  free(profile->devices);
146  }
147  free(profile);
148  }
149  return status;
150 }
151 
152 static ds_status initDSProfile(ds_profile **p, const char *version) {
153  int numDevices;
154  cl_uint numPlatforms;
155  cl_platform_id *platforms = nullptr;
156  cl_device_id *devices = nullptr;
157  ds_status status = DS_SUCCESS;
158  unsigned int next;
159  unsigned int i;
160 
161  if (p == nullptr) return DS_INVALID_PROFILE;
162 
163  ds_profile *profile = (ds_profile *)malloc(sizeof(ds_profile));
164  if (profile == nullptr) return DS_MEMORY_ERROR;
165 
166  memset(profile, 0, sizeof(ds_profile));
167 
168  clGetPlatformIDs(0, nullptr, &numPlatforms);
169 
170  if (numPlatforms > 0) {
171  platforms = (cl_platform_id *)malloc(numPlatforms * sizeof(cl_platform_id));
172  if (platforms == nullptr) {
173  status = DS_MEMORY_ERROR;
174  goto cleanup;
175  }
176  clGetPlatformIDs(numPlatforms, platforms, nullptr);
177  }
178 
179  numDevices = 0;
180  for (i = 0; i < (unsigned int)numPlatforms; i++) {
181  cl_uint num;
182  clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, 0, nullptr, &num);
183  numDevices += num;
184  }
185 
186  if (numDevices > 0) {
187  devices = (cl_device_id *)malloc(numDevices * sizeof(cl_device_id));
188  if (devices == nullptr) {
189  status = DS_MEMORY_ERROR;
190  goto cleanup;
191  }
192  }
193 
194  profile->numDevices =
195  numDevices + 1; // +1 to numDevices to include the native CPU
196  profile->devices =
197  (ds_device *)malloc(profile->numDevices * sizeof(ds_device));
198  if (profile->devices == nullptr) {
199  profile->numDevices = 0;
200  status = DS_MEMORY_ERROR;
201  goto cleanup;
202  }
203  memset(profile->devices, 0, profile->numDevices * sizeof(ds_device));
204 
205  next = 0;
206  for (i = 0; i < (unsigned int)numPlatforms; i++) {
207  cl_uint num;
208  unsigned j;
209  clGetDeviceIDs(platforms[i], CL_DEVICE_TYPE_ALL, numDevices, devices, &num);
210  for (j = 0; j < num; j++, next++) {
211  char buffer[DS_DEVICE_NAME_LENGTH];
212  size_t length;
213 
214  profile->devices[next].type = DS_DEVICE_OPENCL_DEVICE;
215  profile->devices[next].oclDeviceID = devices[j];
216 
217  clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DEVICE_NAME,
218  DS_DEVICE_NAME_LENGTH, &buffer, nullptr);
219  length = strlen(buffer);
220  profile->devices[next].oclDeviceName = (char *)malloc(length + 1);
221  memcpy(profile->devices[next].oclDeviceName, buffer, length + 1);
222 
223  clGetDeviceInfo(profile->devices[next].oclDeviceID, CL_DRIVER_VERSION,
224  DS_DEVICE_NAME_LENGTH, &buffer, nullptr);
225  length = strlen(buffer);
226  profile->devices[next].oclDriverVersion = (char *)malloc(length + 1);
227  memcpy(profile->devices[next].oclDriverVersion, buffer, length + 1);
228  }
229  }
230  profile->devices[next].type = DS_DEVICE_NATIVE_CPU;
231  profile->version = version;
232 
233 cleanup:
234  free(platforms);
235  free(devices);
236  if (status == DS_SUCCESS) {
237  *p = profile;
238  } else {
239  if (profile) {
240  free(profile->devices);
241  free(profile);
242  }
243  }
244  return status;
245 }
246 
247 static ds_status profileDevices(ds_profile *profile,
248  const ds_evaluation_type type,
249  ds_perf_evaluator evaluator,
250  void *evaluatorData, unsigned int *numUpdates) {
251  ds_status status = DS_SUCCESS;
252  unsigned int i;
253  unsigned int updates = 0;
254 
255  if (profile == nullptr) {
256  return DS_INVALID_PROFILE;
257  }
258  if (evaluator == nullptr) {
259  return DS_INVALID_PERF_EVALUATOR;
260  }
261 
262  for (i = 0; i < profile->numDevices; i++) {
263  ds_status evaluatorStatus;
264 
265  switch (type) {
266  case DS_EVALUATE_NEW_ONLY:
267  if (profile->devices[i].score != nullptr) break;
268  // else fall through
269  case DS_EVALUATE_ALL:
270  evaluatorStatus = evaluator(profile->devices + i, evaluatorData);
271  if (evaluatorStatus != DS_SUCCESS) {
272  status = evaluatorStatus;
273  return status;
274  }
275  updates++;
276  break;
277  default:
278  return DS_INVALID_PERF_EVALUATOR_TYPE;
279  break;
280  };
281  }
282  if (numUpdates) *numUpdates = updates;
283  return status;
284 }
285 
286 static const char *findString(const char *contentStart, const char *contentEnd,
287  const char *string) {
288  size_t stringLength;
289  const char *currentPosition;
290  const char *found = nullptr;
291  stringLength = strlen(string);
292  currentPosition = contentStart;
293  for (currentPosition = contentStart; currentPosition < contentEnd;
294  currentPosition++) {
295  if (*currentPosition == string[0]) {
296  if (currentPosition + stringLength < contentEnd) {
297  if (strncmp(currentPosition, string, stringLength) == 0) {
298  found = currentPosition;
299  break;
300  }
301  }
302  }
303  }
304  return found;
305 }
306 
307 static ds_status readProFile(const char *fileName, char **content,
308  size_t *contentSize) {
309  size_t size = 0;
310 
311  *contentSize = 0;
312  *content = nullptr;
313 
314  FILE *input = fopen(fileName, "rb");
315  if (input == nullptr) {
316  return DS_FILE_ERROR;
317  }
318 
319  fseek(input, 0L, SEEK_END);
320  size = ftell(input);
321  rewind(input);
322  char *binary = (char *)malloc(size);
323  if (binary == nullptr) {
324  fclose(input);
325  return DS_FILE_ERROR;
326  }
327  fread(binary, sizeof(char), size, input);
328  fclose(input);
329 
330  *contentSize = size;
331  *content = binary;
332  return DS_SUCCESS;
333 }
334 
335 typedef ds_status (*ds_score_deserializer)(ds_device *device,
336  const unsigned char *serializedScore,
337  unsigned int serializedScoreSize);
338 
339 static ds_status readProfileFromFile(ds_profile *profile,
340  ds_score_deserializer deserializer,
341  const char *file) {
342  ds_status status = DS_SUCCESS;
343  char *contentStart = nullptr;
344  const char *contentEnd = nullptr;
345  size_t contentSize;
346 
347  if (profile == nullptr) return DS_INVALID_PROFILE;
348 
349  status = readProFile(file, &contentStart, &contentSize);
350  if (status == DS_SUCCESS) {
351  const char *currentPosition;
352  const char *dataStart;
353  const char *dataEnd;
354  size_t versionStringLength;
355 
356  contentEnd = contentStart + contentSize;
357  currentPosition = contentStart;
358 
359  // parse the version string
360  dataStart = findString(currentPosition, contentEnd, DS_TAG_VERSION);
361  if (dataStart == nullptr) {
362  status = DS_PROFILE_FILE_ERROR;
363  goto cleanup;
364  }
365  dataStart += strlen(DS_TAG_VERSION);
366 
367  dataEnd = findString(dataStart, contentEnd, DS_TAG_VERSION_END);
368  if (dataEnd == nullptr) {
369  status = DS_PROFILE_FILE_ERROR;
370  goto cleanup;
371  }
372 
373  versionStringLength = strlen(profile->version);
374  if (versionStringLength != (dataEnd - dataStart) ||
375  strncmp(profile->version, dataStart, versionStringLength) != 0) {
376  // version mismatch
377  status = DS_PROFILE_FILE_ERROR;
378  goto cleanup;
379  }
380  currentPosition = dataEnd + strlen(DS_TAG_VERSION_END);
381 
382  // parse the device information
383  while (1) {
384  unsigned int i;
385 
386  const char *deviceTypeStart;
387  const char *deviceTypeEnd;
388  ds_device_type deviceType;
389 
390  const char *deviceNameStart;
391  const char *deviceNameEnd;
392 
393  const char *deviceScoreStart;
394  const char *deviceScoreEnd;
395 
396  const char *deviceDriverStart;
397  const char *deviceDriverEnd;
398 
399  dataStart = findString(currentPosition, contentEnd, DS_TAG_DEVICE);
400  if (dataStart == nullptr) {
401  // nothing useful remain, quit...
402  break;
403  }
404  dataStart += strlen(DS_TAG_DEVICE);
405  dataEnd = findString(dataStart, contentEnd, DS_TAG_DEVICE_END);
406  if (dataEnd == nullptr) {
407  status = DS_PROFILE_FILE_ERROR;
408  goto cleanup;
409  }
410 
411  // parse the device type
412  deviceTypeStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_TYPE);
413  if (deviceTypeStart == nullptr) {
414  status = DS_PROFILE_FILE_ERROR;
415  goto cleanup;
416  }
417  deviceTypeStart += strlen(DS_TAG_DEVICE_TYPE);
418  deviceTypeEnd =
419  findString(deviceTypeStart, contentEnd, DS_TAG_DEVICE_TYPE_END);
420  if (deviceTypeEnd == nullptr) {
421  status = DS_PROFILE_FILE_ERROR;
422  goto cleanup;
423  }
424  memcpy(&deviceType, deviceTypeStart, sizeof(ds_device_type));
425 
426  // parse the device name
427  if (deviceType == DS_DEVICE_OPENCL_DEVICE) {
428  deviceNameStart = findString(dataStart, contentEnd, DS_TAG_DEVICE_NAME);
429  if (deviceNameStart == nullptr) {
430  status = DS_PROFILE_FILE_ERROR;
431  goto cleanup;
432  }
433  deviceNameStart += strlen(DS_TAG_DEVICE_NAME);
434  deviceNameEnd =
435  findString(deviceNameStart, contentEnd, DS_TAG_DEVICE_NAME_END);
436  if (deviceNameEnd == nullptr) {
437  status = DS_PROFILE_FILE_ERROR;
438  goto cleanup;
439  }
440 
441  deviceDriverStart =
442  findString(dataStart, contentEnd, DS_TAG_DEVICE_DRIVER_VERSION);
443  if (deviceDriverStart == nullptr) {
444  status = DS_PROFILE_FILE_ERROR;
445  goto cleanup;
446  }
447  deviceDriverStart += strlen(DS_TAG_DEVICE_DRIVER_VERSION);
448  deviceDriverEnd = findString(deviceDriverStart, contentEnd,
449  DS_TAG_DEVICE_DRIVER_VERSION_END);
450  if (deviceDriverEnd == nullptr) {
451  status = DS_PROFILE_FILE_ERROR;
452  goto cleanup;
453  }
454 
455  // check if this device is on the system
456  for (i = 0; i < profile->numDevices; i++) {
457  if (profile->devices[i].type == DS_DEVICE_OPENCL_DEVICE) {
458  size_t actualDeviceNameLength;
459  size_t driverVersionLength;
460 
461  actualDeviceNameLength = strlen(profile->devices[i].oclDeviceName);
462  driverVersionLength = strlen(profile->devices[i].oclDriverVersion);
463  if (actualDeviceNameLength == (deviceNameEnd - deviceNameStart) &&
464  driverVersionLength == (deviceDriverEnd - deviceDriverStart) &&
465  strncmp(profile->devices[i].oclDeviceName, deviceNameStart,
466  actualDeviceNameLength) == 0 &&
467  strncmp(profile->devices[i].oclDriverVersion, deviceDriverStart,
468  driverVersionLength) == 0) {
469  deviceScoreStart =
470  findString(dataStart, contentEnd, DS_TAG_SCORE);
471  if (deviceNameStart == nullptr) {
472  status = DS_PROFILE_FILE_ERROR;
473  goto cleanup;
474  }
475  deviceScoreStart += strlen(DS_TAG_SCORE);
476  deviceScoreEnd =
477  findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
478  status = deserializer(profile->devices + i,
479  (const unsigned char *)deviceScoreStart,
480  deviceScoreEnd - deviceScoreStart);
481  if (status != DS_SUCCESS) {
482  goto cleanup;
483  }
484  }
485  }
486  }
487  } else if (deviceType == DS_DEVICE_NATIVE_CPU) {
488  for (i = 0; i < profile->numDevices; i++) {
489  if (profile->devices[i].type == DS_DEVICE_NATIVE_CPU) {
490  deviceScoreStart = findString(dataStart, contentEnd, DS_TAG_SCORE);
491  if (deviceScoreStart == nullptr) {
492  status = DS_PROFILE_FILE_ERROR;
493  goto cleanup;
494  }
495  deviceScoreStart += strlen(DS_TAG_SCORE);
496  deviceScoreEnd =
497  findString(deviceScoreStart, contentEnd, DS_TAG_SCORE_END);
498  status = deserializer(profile->devices + i,
499  (const unsigned char *)deviceScoreStart,
500  deviceScoreEnd - deviceScoreStart);
501  if (status != DS_SUCCESS) {
502  goto cleanup;
503  }
504  }
505  }
506  }
507 
508  // skip over the current one to find the next device
509  currentPosition = dataEnd + strlen(DS_TAG_DEVICE_END);
510  }
511  }
512 cleanup:
513  free(contentStart);
514  return status;
515 }
516 
517 typedef ds_status (*ds_score_serializer)(ds_device *device,
518  void **serializedScore,
519  unsigned int *serializedScoreSize);
520 static ds_status writeProfileToFile(ds_profile *profile,
521  ds_score_serializer serializer,
522  const char *file) {
523  ds_status status = DS_SUCCESS;
524 
525  if (profile == nullptr) return DS_INVALID_PROFILE;
526 
527  FILE *profileFile = fopen(file, "wb");
528  if (profileFile == nullptr) {
529  status = DS_FILE_ERROR;
530  } else {
531  unsigned int i;
532 
533  // write version string
534  fwrite(DS_TAG_VERSION, sizeof(char), strlen(DS_TAG_VERSION), profileFile);
535  fwrite(profile->version, sizeof(char), strlen(profile->version),
536  profileFile);
537  fwrite(DS_TAG_VERSION_END, sizeof(char), strlen(DS_TAG_VERSION_END),
538  profileFile);
539  fwrite("\n", sizeof(char), 1, profileFile);
540 
541  for (i = 0; i < profile->numDevices && status == DS_SUCCESS; i++) {
542  void *serializedScore;
543  unsigned int serializedScoreSize;
544 
545  fwrite(DS_TAG_DEVICE, sizeof(char), strlen(DS_TAG_DEVICE), profileFile);
546 
547  fwrite(DS_TAG_DEVICE_TYPE, sizeof(char), strlen(DS_TAG_DEVICE_TYPE),
548  profileFile);
549  fwrite(&profile->devices[i].type, sizeof(ds_device_type), 1, profileFile);
550  fwrite(DS_TAG_DEVICE_TYPE_END, sizeof(char),
551  strlen(DS_TAG_DEVICE_TYPE_END), profileFile);
552 
553  switch (profile->devices[i].type) {
554  case DS_DEVICE_NATIVE_CPU: {
555  // There's no need to emit a device name for the native CPU device.
556  /*
557  fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME),
558  profileFile);
559  fwrite(DS_DEVICE_NATIVE_CPU_STRING,sizeof(char),
560  strlen(DS_DEVICE_NATIVE_CPU_STRING), profileFile);
561  fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char),
562  strlen(DS_TAG_DEVICE_NAME_END), profileFile);
563  */
564  } break;
565  case DS_DEVICE_OPENCL_DEVICE: {
566  fwrite(DS_TAG_DEVICE_NAME, sizeof(char), strlen(DS_TAG_DEVICE_NAME),
567  profileFile);
568  fwrite(profile->devices[i].oclDeviceName, sizeof(char),
569  strlen(profile->devices[i].oclDeviceName), profileFile);
570  fwrite(DS_TAG_DEVICE_NAME_END, sizeof(char),
571  strlen(DS_TAG_DEVICE_NAME_END), profileFile);
572 
573  fwrite(DS_TAG_DEVICE_DRIVER_VERSION, sizeof(char),
574  strlen(DS_TAG_DEVICE_DRIVER_VERSION), profileFile);
575  fwrite(profile->devices[i].oclDriverVersion, sizeof(char),
576  strlen(profile->devices[i].oclDriverVersion), profileFile);
577  fwrite(DS_TAG_DEVICE_DRIVER_VERSION_END, sizeof(char),
578  strlen(DS_TAG_DEVICE_DRIVER_VERSION_END), profileFile);
579  } break;
580  default:
581  status = DS_UNKNOWN_DEVICE_TYPE;
582  break;
583  };
584 
585  fwrite(DS_TAG_SCORE, sizeof(char), strlen(DS_TAG_SCORE), profileFile);
586  status = serializer(profile->devices + i, &serializedScore,
587  &serializedScoreSize);
588  if (status == DS_SUCCESS && serializedScore != nullptr &&
589  serializedScoreSize > 0) {
590  fwrite(serializedScore, sizeof(char), serializedScoreSize, profileFile);
591  free(serializedScore);
592  }
593  fwrite(DS_TAG_SCORE_END, sizeof(char), strlen(DS_TAG_SCORE_END),
594  profileFile);
595  fwrite(DS_TAG_DEVICE_END, sizeof(char), strlen(DS_TAG_DEVICE_END),
596  profileFile);
597  fwrite("\n", sizeof(char), 1, profileFile);
598  }
599  fclose(profileFile);
600  }
601  return status;
602 }
603 
604 // substitute invalid characters in device name with _
605 static void legalizeFileName( char *fileName) {
606  //printf("fileName: %s\n", fileName);
607  const char *invalidChars =
608  "/\?:*\"><| "; // space is valid but can cause headaches
609  // for each invalid char
610  for (int i = 0; i < strlen(invalidChars); i++) {
611  char invalidStr[4];
612  invalidStr[0] = invalidChars[i];
613  invalidStr[1] = '\0';
614  //printf("eliminating %s\n", invalidStr);
615  //char *pos = strstr(fileName, invalidStr);
616  // initial ./ is valid for present directory
617  //if (*pos == '.') pos++;
618  //if (*pos == '/') pos++;
619  for (char *pos = strstr(fileName, invalidStr); pos != nullptr;
620  pos = strstr(pos + 1, invalidStr)) {
621  // printf("\tfound: %s, ", pos);
622  pos[0] = '_';
623  // printf("fileName: %s\n", fileName);
624  }
625  }
626 }
627 
628 static void populateGPUEnvFromDevice( GPUEnv *gpuInfo, cl_device_id device ) {
629  //printf("[DS] populateGPUEnvFromDevice\n");
630  size_t size;
631  gpuInfo->mnIsUserCreated = 1;
632  // device
633  gpuInfo->mpDevID = device;
634  gpuInfo->mpArryDevsID = new cl_device_id[1];
635  gpuInfo->mpArryDevsID[0] = gpuInfo->mpDevID;
636  clStatus =
637  clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_TYPE,
638  sizeof(cl_device_type), &gpuInfo->mDevType, &size);
639  CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(TYPE)");
640  // platform
641  clStatus =
642  clGetDeviceInfo(gpuInfo->mpDevID, CL_DEVICE_PLATFORM,
643  sizeof(cl_platform_id), &gpuInfo->mpPlatformID, &size);
644  CHECK_OPENCL( clStatus, "populateGPUEnv::getDeviceInfo(PLATFORM)");
645  // context
646  cl_context_properties props[3];
647  props[0] = CL_CONTEXT_PLATFORM;
648  props[1] = (cl_context_properties) gpuInfo->mpPlatformID;
649  props[2] = 0;
650  gpuInfo->mpContext = clCreateContext(props, 1, &gpuInfo->mpDevID, nullptr,
651  nullptr, &clStatus);
652  CHECK_OPENCL( clStatus, "populateGPUEnv::createContext");
653  // queue
654  cl_command_queue_properties queueProperties = 0;
655  gpuInfo->mpCmdQueue = clCreateCommandQueue( gpuInfo->mpContext, gpuInfo->mpDevID, queueProperties, &clStatus );
656  CHECK_OPENCL( clStatus, "populateGPUEnv::createCommandQueue");
657 }
658 
659 int OpenclDevice::LoadOpencl()
660 {
661 #ifdef WIN32
662  HINSTANCE HOpenclDll = nullptr;
663  void *OpenclDll = nullptr;
664  // fprintf(stderr, " LoadOpenclDllxx... \n");
665  OpenclDll = static_cast<HINSTANCE>(HOpenclDll);
666  OpenclDll = LoadLibrary("openCL.dll");
667  if (!static_cast<HINSTANCE>(OpenclDll)) {
668  fprintf(stderr, "[OD] Load opencl.dll failed!\n");
669  FreeLibrary(static_cast<HINSTANCE>(OpenclDll));
670  return 0;
671  }
672  fprintf(stderr, "[OD] Load opencl.dll successful!\n");
673 #endif
674  return 1;
675 }
676 int OpenclDevice::SetKernelEnv( KernelEnv *envInfo )
677 {
678  envInfo->mpkContext = gpuEnv.mpContext;
679  envInfo->mpkCmdQueue = gpuEnv.mpCmdQueue;
680  envInfo->mpkProgram = gpuEnv.mpArryPrograms[0];
681 
682  return 1;
683 }
684 
685 static cl_mem allocateZeroCopyBuffer(KernelEnv rEnv, l_uint32 *hostbuffer,
686  size_t nElements, cl_mem_flags flags,
687  cl_int *pStatus)
688 {
689  cl_mem membuffer = clCreateBuffer( rEnv.mpkContext, (cl_mem_flags) (flags),
690  nElements * sizeof(l_uint32), hostbuffer, pStatus);
691 
692  return membuffer;
693 }
694 
695 static
696 Pix *mapOutputCLBuffer(KernelEnv rEnv, cl_mem clbuffer, Pix *pixd, Pix *pixs,
697  int elements, cl_mem_flags flags, bool memcopy = false,
698  bool sync = true) {
699  PROCNAME("mapOutputCLBuffer");
700  if (!pixd) {
701  if (memcopy) {
702  if ((pixd = pixCreateTemplate(pixs)) == nullptr)
703  (Pix *)ERROR_PTR("pixd not made", procName, nullptr);
704  } else {
705  if ((pixd = pixCreateHeader(pixGetWidth(pixs), pixGetHeight(pixs),
706  pixGetDepth(pixs))) == nullptr)
707  (Pix *)ERROR_PTR("pixd not made", procName, nullptr);
708  }
709  }
710  l_uint32 *pValues = (l_uint32 *)clEnqueueMapBuffer(
711  rEnv.mpkCmdQueue, clbuffer, CL_TRUE, flags, 0,
712  elements * sizeof(l_uint32), 0, nullptr, nullptr, nullptr);
713 
714  if (memcopy) {
715  memcpy(pixGetData(pixd), pValues, elements * sizeof(l_uint32));
716  } else {
717  pixSetData(pixd, pValues);
718  }
719 
720  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, clbuffer, pValues, 0, nullptr,
721  nullptr);
722 
723  if (sync) {
724  clFinish(rEnv.mpkCmdQueue);
725  }
726 
727  return pixd;
728 }
729 
730 static cl_mem allocateIntBuffer(KernelEnv rEnv, const l_uint32 *_pValues,
731  size_t nElements, cl_int *pStatus,
732  bool sync = false)
733 {
734  cl_mem xValues =
735  clCreateBuffer(rEnv.mpkContext, (cl_mem_flags)(CL_MEM_READ_WRITE),
736  nElements * sizeof(l_int32), nullptr, pStatus);
737 
738  if (_pValues != nullptr) {
739  l_int32 *pValues = (l_int32 *)clEnqueueMapBuffer(
740  rEnv.mpkCmdQueue, xValues, CL_TRUE, CL_MAP_WRITE, 0,
741  nElements * sizeof(l_int32), 0, nullptr, nullptr, nullptr);
742 
743  memcpy(pValues, _pValues, nElements * sizeof(l_int32));
744 
745  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, xValues, pValues, 0, nullptr,
746  nullptr);
747 
748  if (sync) clFinish(rEnv.mpkCmdQueue);
749  }
750 
751  return xValues;
752 }
753 
754 
755 void OpenclDevice::releaseMorphCLBuffers()
756 {
757  if (pixdCLIntermediate != nullptr) clReleaseMemObject(pixdCLIntermediate);
758  if (pixsCLBuffer != nullptr) clReleaseMemObject(pixsCLBuffer);
759  if (pixdCLBuffer != nullptr) clReleaseMemObject(pixdCLBuffer);
760  if (pixThBuffer != nullptr) clReleaseMemObject(pixThBuffer);
761  pixdCLIntermediate = pixsCLBuffer = pixdCLBuffer = pixThBuffer = nullptr;
762 }
763 
764 int OpenclDevice::initMorphCLAllocations(l_int32 wpl, l_int32 h, Pix* pixs)
765 {
766  SetKernelEnv( &rEnv );
767 
768  if (pixThBuffer != nullptr) {
769  pixsCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h,
770  CL_MEM_ALLOC_HOST_PTR, &clStatus);
771 
772  // Get the output from ThresholdToPix operation
773  clStatus =
774  clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixThBuffer, pixsCLBuffer, 0, 0,
775  sizeof(l_uint32) * wpl * h, 0, nullptr, nullptr);
776  }
777  else
778  {
779  //Get data from the source image
780  l_uint32* srcdata = (l_uint32*) malloc(wpl*h*sizeof(l_uint32));
781  memcpy(srcdata, pixGetData(pixs), wpl*h*sizeof(l_uint32));
782 
783  pixsCLBuffer = allocateZeroCopyBuffer(rEnv, srcdata, wpl*h, CL_MEM_USE_HOST_PTR, &clStatus);
784  }
785 
786  pixdCLBuffer = allocateZeroCopyBuffer(rEnv, nullptr, wpl * h,
787  CL_MEM_ALLOC_HOST_PTR, &clStatus);
788 
789  pixdCLIntermediate = allocateZeroCopyBuffer(
790  rEnv, nullptr, wpl * h, CL_MEM_ALLOC_HOST_PTR, &clStatus);
791 
792  return (int)clStatus;
793 }
794 
795 int OpenclDevice::InitEnv()
796 {
797 //PERF_COUNT_START("OD::InitEnv")
798 // printf("[OD] OpenclDevice::InitEnv()\n");
799 #ifdef SAL_WIN32
800  while( 1 )
801  {
802  if( 1 == LoadOpencl() )
803  break;
804  }
805 PERF_COUNT_SUB("LoadOpencl")
806 #endif
807  // sets up environment, compiles programs
808 
809  InitOpenclRunEnv_DeviceSelection( 0 );
810 //PERF_COUNT_SUB("called InitOpenclRunEnv_DS")
811 //PERF_COUNT_END
812  return 1;
813 }
814 
815 int OpenclDevice::ReleaseOpenclRunEnv()
816 {
817  ReleaseOpenclEnv( &gpuEnv );
818 #ifdef SAL_WIN32
819  FreeOpenclDll();
820 #endif
821  return 1;
822 }
823 inline int OpenclDevice::AddKernelConfig( int kCount, const char *kName )
824 {
825  if ( kCount < 1 )
826  fprintf(stderr,"Error: ( KCount < 1 ) AddKernelConfig\n" );
827  strcpy( gpuEnv.mArrykernelNames[kCount-1], kName );
828  gpuEnv.mnKernelCount++;
829  return 0;
830 }
831 int OpenclDevice::RegistOpenclKernel()
832 {
833  if ( !gpuEnv.mnIsUserCreated )
834  memset( &gpuEnv, 0, sizeof(gpuEnv) );
835 
836  gpuEnv.mnFileCount = 0; //argc;
837  gpuEnv.mnKernelCount = 0UL;
838 
839  AddKernelConfig( 1, (const char*) "oclAverageSub1" );
840  return 0;
841 }
842 
843 int OpenclDevice::InitOpenclRunEnv_DeviceSelection( int argc ) {
844 //PERF_COUNT_START("InitOpenclRunEnv_DS")
845  if (!isInited) {
846  // after programs compiled, selects best device
847  ds_device bestDevice_DS = getDeviceSelection( );
848 //PERF_COUNT_SUB("called getDeviceSelection()")
849  cl_device_id bestDevice = bestDevice_DS.oclDeviceID;
850  // overwrite global static GPUEnv with new device
851  if (selectedDeviceIsOpenCL() ) {
852  //printf("[DS] InitOpenclRunEnv_DS::Calling populateGPUEnvFromDevice() for selected device\n");
853  populateGPUEnvFromDevice( &gpuEnv, bestDevice );
854  gpuEnv.mnFileCount = 0; //argc;
855  gpuEnv.mnKernelCount = 0UL;
856 //PERF_COUNT_SUB("populate gpuEnv")
857  CompileKernelFile(&gpuEnv, "");
858 //PERF_COUNT_SUB("CompileKernelFile")
859  } else {
860  //printf("[DS] InitOpenclRunEnv_DS::Skipping populateGPUEnvFromDevice() b/c native cpu selected\n");
861  }
862  isInited = 1;
863  }
864 //PERF_COUNT_END
865  return 0;
866 }
867 
868 
869 OpenclDevice::OpenclDevice()
870 {
871  //InitEnv();
872 }
873 
874 OpenclDevice::~OpenclDevice()
875 {
876  //ReleaseOpenclRunEnv();
877 }
878 
879 int OpenclDevice::ReleaseOpenclEnv( GPUEnv *gpuInfo )
880 {
881  int i = 0;
882  int clStatus = 0;
883 
884  if ( !isInited )
885  {
886  return 1;
887  }
888 
889  for ( i = 0; i < gpuEnv.mnFileCount; i++ )
890  {
891  if ( gpuEnv.mpArryPrograms[i] )
892  {
893  clStatus = clReleaseProgram( gpuEnv.mpArryPrograms[i] );
894  CHECK_OPENCL( clStatus, "clReleaseProgram" );
895  gpuEnv.mpArryPrograms[i] = nullptr;
896  }
897  }
898  if ( gpuEnv.mpCmdQueue )
899  {
900  clReleaseCommandQueue( gpuEnv.mpCmdQueue );
901  gpuEnv.mpCmdQueue = nullptr;
902  }
903  if ( gpuEnv.mpContext )
904  {
905  clReleaseContext( gpuEnv.mpContext );
906  gpuEnv.mpContext = nullptr;
907  }
908  isInited = 0;
909  gpuInfo->mnIsUserCreated = 0;
910  delete[] gpuInfo->mpArryDevsID;
911  return 1;
912 }
913 int OpenclDevice::BinaryGenerated( const char * clFileName, FILE ** fhandle )
914 {
915  unsigned int i = 0;
916  cl_int clStatus;
917  int status = 0;
918  char *str = nullptr;
919  FILE *fd = nullptr;
920  char fileName[256] = {0}, cl_name[128] = {0};
921  char deviceName[1024];
922  clStatus = clGetDeviceInfo(gpuEnv.mpArryDevsID[i], CL_DEVICE_NAME,
923  sizeof(deviceName), deviceName, nullptr);
924  CHECK_OPENCL(clStatus, "clGetDeviceInfo");
925  str = (char *)strstr(clFileName, (char *)".cl");
926  memcpy(cl_name, clFileName, str - clFileName);
927  cl_name[str - clFileName] = '\0';
928  sprintf(fileName, "%s-%s.bin", cl_name, deviceName);
929  legalizeFileName(fileName);
930  fd = fopen(fileName, "rb");
931  status = (fd != nullptr) ? 1 : 0;
932  if (fd != nullptr) {
933  *fhandle = fd;
934  }
935  return status;
936 
937 }
938 int OpenclDevice::CachedOfKernerPrg( const GPUEnv *gpuEnvCached, const char * clFileName )
939 {
940  int i;
941  for ( i = 0; i < gpuEnvCached->mnFileCount; i++ )
942  {
943  if ( strcasecmp( gpuEnvCached->mArryKnelSrcFile[i], clFileName ) == 0 )
944  {
945  if (gpuEnvCached->mpArryPrograms[i] != nullptr) {
946  return 1;
947  }
948  }
949  }
950 
951  return 0;
952 }
953 int OpenclDevice::WriteBinaryToFile( const char* fileName, const char* birary, size_t numBytes )
954 {
955  FILE *output = nullptr;
956  output = fopen(fileName, "wb");
957  if (output == nullptr) {
958  return 0;
959  }
960 
961  fwrite( birary, sizeof(char), numBytes, output );
962  fclose( output );
963 
964  return 1;
965 
966 }
967 int OpenclDevice::GeneratBinFromKernelSource( cl_program program, const char * clFileName )
968 {
969  unsigned int i = 0;
970  cl_int clStatus;
971  size_t *binarySizes;
972  cl_uint numDevices;
973  cl_device_id *mpArryDevsID;
974  char **binaries, *str = nullptr;
975 
976  clStatus = clGetProgramInfo(program, CL_PROGRAM_NUM_DEVICES,
977  sizeof(numDevices), &numDevices, nullptr);
978  CHECK_OPENCL( clStatus, "clGetProgramInfo" );
979 
980  mpArryDevsID = (cl_device_id*) malloc( sizeof(cl_device_id) * numDevices );
981  if (mpArryDevsID == nullptr) {
982  return 0;
983  }
984  /* grab the handles to all of the devices in the program. */
985  clStatus = clGetProgramInfo(program, CL_PROGRAM_DEVICES,
986  sizeof(cl_device_id) * numDevices, mpArryDevsID,
987  nullptr);
988  CHECK_OPENCL( clStatus, "clGetProgramInfo" );
989 
990  /* figure out the sizes of each of the binaries. */
991  binarySizes = (size_t*) malloc( sizeof(size_t) * numDevices );
992 
993  clStatus =
994  clGetProgramInfo(program, CL_PROGRAM_BINARY_SIZES,
995  sizeof(size_t) * numDevices, binarySizes, nullptr);
996  CHECK_OPENCL( clStatus, "clGetProgramInfo" );
997 
998  /* copy over all of the generated binaries. */
999  binaries = (char**) malloc( sizeof(char *) * numDevices );
1000  if (binaries == nullptr) {
1001  return 0;
1002  }
1003 
1004  for ( i = 0; i < numDevices; i++ )
1005  {
1006  if ( binarySizes[i] != 0 )
1007  {
1008  binaries[i] = (char*) malloc( sizeof(char) * binarySizes[i] );
1009  if (binaries[i] == nullptr) {
1010  return 0;
1011  }
1012  }
1013  else
1014  {
1015  binaries[i] = nullptr;
1016  }
1017  }
1018 
1019  clStatus = clGetProgramInfo(program, CL_PROGRAM_BINARIES,
1020  sizeof(char *) * numDevices, binaries, nullptr);
1021  CHECK_OPENCL(clStatus,"clGetProgramInfo");
1022 
1023  /* dump out each binary into its own separate file. */
1024  for ( i = 0; i < numDevices; i++ )
1025  {
1026  char fileName[256] = { 0 }, cl_name[128] = { 0 };
1027 
1028  if ( binarySizes[i] != 0 )
1029  {
1030  char deviceName[1024];
1031  clStatus = clGetDeviceInfo(mpArryDevsID[i], CL_DEVICE_NAME,
1032  sizeof(deviceName), deviceName, nullptr);
1033  CHECK_OPENCL( clStatus, "clGetDeviceInfo" );
1034 
1035  str = (char*) strstr( clFileName, (char*) ".cl" );
1036  memcpy( cl_name, clFileName, str - clFileName );
1037  cl_name[str - clFileName] = '\0';
1038  sprintf( fileName, "%s-%s.bin", cl_name, deviceName );
1039  legalizeFileName(fileName);
1040  if ( !WriteBinaryToFile( fileName, binaries[i], binarySizes[i] ) )
1041  {
1042  printf("[OD] write binary[%s] failed\n", fileName);
1043  return 0;
1044  } //else
1045  printf("[OD] write binary[%s] successfully\n", fileName);
1046  }
1047  }
1048 
1049  // Release all resouces and memory
1050  for ( i = 0; i < numDevices; i++ )
1051  {
1052  free(binaries[i]);
1053  binaries[i] = nullptr;
1054  }
1055 
1056  free(binaries);
1057  binaries = nullptr;
1058 
1059  free(binarySizes);
1060  binarySizes = nullptr;
1061 
1062  free(mpArryDevsID);
1063  mpArryDevsID = nullptr;
1064 
1065  return 1;
1066 }
1067 
1068 int OpenclDevice::CompileKernelFile( GPUEnv *gpuInfo, const char *buildOption )
1069 {
1070 //PERF_COUNT_START("CompileKernelFile")
1071  cl_int clStatus = 0;
1072  size_t length;
1073  char *buildLog = nullptr, *binary;
1074  const char *source;
1075  size_t source_size[1];
1076  int b_error, binary_status, binaryExisted, idx;
1077  cl_uint numDevices;
1078  cl_device_id *mpArryDevsID;
1079  FILE *fd, *fd1;
1080  const char* filename = "kernel.cl";
1081  //fprintf(stderr, "[OD] CompileKernelFile ... \n");
1082  if ( CachedOfKernerPrg(gpuInfo, filename) == 1 )
1083  {
1084  return 1;
1085  }
1086 
1087  idx = gpuInfo->mnFileCount;
1088 
1089  source = kernel_src;
1090 
1091  source_size[0] = strlen( source );
1092  binaryExisted = 0;
1093  binaryExisted = BinaryGenerated( filename, &fd ); // don't check for binary during microbenchmark
1094 //PERF_COUNT_SUB("BinaryGenerated")
1095  if ( binaryExisted == 1 )
1096  {
1097  clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_NUM_DEVICES,
1098  sizeof(numDevices), &numDevices, nullptr);
1099  CHECK_OPENCL(clStatus, "clGetContextInfo");
1100 
1101  mpArryDevsID = (cl_device_id *)malloc(sizeof(cl_device_id) * numDevices);
1102  if (mpArryDevsID == nullptr) {
1103  return 0;
1104  }
1105 //PERF_COUNT_SUB("get numDevices")
1106  b_error = 0;
1107  length = 0;
1108  b_error |= fseek( fd, 0, SEEK_END ) < 0;
1109  b_error |= ( length = ftell(fd) ) <= 0;
1110  b_error |= fseek( fd, 0, SEEK_SET ) < 0;
1111  if ( b_error )
1112  {
1113  return 0;
1114  }
1115 
1116  binary = (char*) malloc( length + 2 );
1117  if ( !binary )
1118  {
1119  return 0;
1120  }
1121 
1122  memset( binary, 0, length + 2 );
1123  b_error |= fread( binary, 1, length, fd ) != length;
1124 
1125 
1126  fclose( fd );
1127 //PERF_COUNT_SUB("read file")
1128  fd = nullptr;
1129  // grab the handles to all of the devices in the context.
1130  clStatus = clGetContextInfo(gpuInfo->mpContext, CL_CONTEXT_DEVICES,
1131  sizeof(cl_device_id) * numDevices,
1132  mpArryDevsID, nullptr);
1133  CHECK_OPENCL( clStatus, "clGetContextInfo" );
1134 //PERF_COUNT_SUB("get devices")
1135  //fprintf(stderr, "[OD] Create kernel from binary\n");
1136  gpuInfo->mpArryPrograms[idx] = clCreateProgramWithBinary( gpuInfo->mpContext,numDevices,
1137  mpArryDevsID, &length, (const unsigned char**) &binary,
1138  &binary_status, &clStatus );
1139  CHECK_OPENCL( clStatus, "clCreateProgramWithBinary" );
1140 //PERF_COUNT_SUB("clCreateProgramWithBinary")
1141  free( binary );
1142  free( mpArryDevsID );
1143  mpArryDevsID = nullptr;
1144  // PERF_COUNT_SUB("binaryExisted")
1145  }
1146  else
1147  {
1148  // create a CL program using the kernel source
1149  //fprintf(stderr, "[OD] Create kernel from source\n");
1150  gpuInfo->mpArryPrograms[idx] = clCreateProgramWithSource( gpuInfo->mpContext, 1, &source,
1151  source_size, &clStatus);
1152  CHECK_OPENCL( clStatus, "clCreateProgramWithSource" );
1153 //PERF_COUNT_SUB("!binaryExisted")
1154  }
1155 
1156  if (gpuInfo->mpArryPrograms[idx] == (cl_program) nullptr) {
1157  return 0;
1158  }
1159 
1160  //char options[512];
1161  // create a cl program executable for all the devices specified
1162  //printf("[OD] BuildProgram.\n");
1163 PERF_COUNT_START("OD::CompileKernel::clBuildProgram")
1164  if (!gpuInfo->mnIsUserCreated)
1165  {
1166  clStatus =
1167  clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, gpuInfo->mpArryDevsID,
1168  buildOption, nullptr, nullptr);
1169  // PERF_COUNT_SUB("clBuildProgram notUserCreated")
1170  }
1171  else
1172  {
1173  clStatus =
1174  clBuildProgram(gpuInfo->mpArryPrograms[idx], 1, &(gpuInfo->mpDevID),
1175  buildOption, nullptr, nullptr);
1176  // PERF_COUNT_SUB("clBuildProgram isUserCreated")
1177  }
1179  if ( clStatus != CL_SUCCESS )
1180  {
1181  printf ("BuildProgram error!\n");
1182  if ( !gpuInfo->mnIsUserCreated )
1183  {
1184  clStatus = clGetProgramBuildInfo(
1185  gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1186  CL_PROGRAM_BUILD_LOG, 0, nullptr, &length);
1187  }
1188  else
1189  {
1190  clStatus = clGetProgramBuildInfo(
1191  gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1192  CL_PROGRAM_BUILD_LOG, 0, nullptr, &length);
1193  }
1194  if ( clStatus != CL_SUCCESS )
1195  {
1196  printf("opencl create build log fail\n");
1197  return 0;
1198  }
1199  buildLog = (char*) malloc( length );
1200  if (buildLog == (char *)nullptr) {
1201  return 0;
1202  }
1203  if ( !gpuInfo->mnIsUserCreated )
1204  {
1205  clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpArryDevsID[0],
1206  CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
1207  }
1208  else
1209  {
1210  clStatus = clGetProgramBuildInfo( gpuInfo->mpArryPrograms[idx], gpuInfo->mpDevID,
1211  CL_PROGRAM_BUILD_LOG, length, buildLog, &length );
1212  }
1213  if ( clStatus != CL_SUCCESS )
1214  {
1215  printf("opencl program build info fail\n");
1216  return 0;
1217  }
1218 
1219  fd1 = fopen( "kernel-build.log", "w+" );
1220  if (fd1 != nullptr) {
1221  fwrite(buildLog, sizeof(char), length, fd1);
1222  fclose(fd1);
1223  }
1224 
1225  free( buildLog );
1226 //PERF_COUNT_SUB("build error log")
1227  return 0;
1228  }
1229 
1230  strcpy( gpuInfo->mArryKnelSrcFile[idx], filename );
1231 //PERF_COUNT_SUB("strcpy")
1232  if ( binaryExisted == 0 ) {
1233  GeneratBinFromKernelSource( gpuInfo->mpArryPrograms[idx], filename );
1234  PERF_COUNT_SUB("GenerateBinFromKernelSource")
1235  }
1236 
1237  gpuInfo->mnFileCount += 1;
1238 //PERF_COUNT_END
1239  return 1;
1240 }
1241 
1242 l_uint32* OpenclDevice::pixReadFromTiffKernel(l_uint32 *tiffdata,l_int32 w,l_int32 h,l_int32 wpl,l_uint32 *line)
1243 {
1244 PERF_COUNT_START("pixReadFromTiffKernel")
1245  cl_int clStatus;
1246  KernelEnv rEnv;
1247  size_t globalThreads[2];
1248  size_t localThreads[2];
1249  int gsize;
1250  cl_mem valuesCl;
1251  cl_mem outputCl;
1252 
1253  //global and local work dimensions for Horizontal pass
1254  gsize = (w + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1255  globalThreads[0] = gsize;
1256  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1257  globalThreads[1] = gsize;
1258  localThreads[0] = GROUPSIZE_X;
1259  localThreads[1] = GROUPSIZE_Y;
1260 
1261  SetKernelEnv( &rEnv );
1262 
1263  l_uint32 *pResult = (l_uint32 *)malloc(w*h * sizeof(l_uint32));
1264  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "composeRGBPixel", &clStatus );
1265  CHECK_OPENCL(clStatus, "clCreateKernel composeRGBPixel");
1266 
1267  //Allocate input and output OCL buffers
1268  valuesCl = allocateZeroCopyBuffer(rEnv, tiffdata, w*h, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1269  outputCl = allocateZeroCopyBuffer(rEnv, pResult, w*h, CL_MEM_WRITE_ONLY | CL_MEM_USE_HOST_PTR, &clStatus);
1270 
1271  //Kernel arguments
1272  clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &valuesCl);
1273  CHECK_OPENCL( clStatus, "clSetKernelArg");
1274  clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(w), &w);
1275  CHECK_OPENCL( clStatus, "clSetKernelArg" );
1276  clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(h), &h);
1277  CHECK_OPENCL( clStatus, "clSetKernelArg" );
1278  clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1279  CHECK_OPENCL( clStatus, "clSetKernelArg" );
1280  clStatus = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outputCl);
1281  CHECK_OPENCL( clStatus, "clSetKernelArg");
1282 
1283  //Kernel enqueue
1284 PERF_COUNT_SUB("before")
1285 clStatus =
1286  clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1287  globalThreads, localThreads, 0, nullptr, nullptr);
1288 CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel");
1289 
1290 /* map results back from gpu */
1291 void *ptr = clEnqueueMapBuffer(rEnv.mpkCmdQueue, outputCl, CL_TRUE, CL_MAP_READ,
1292  0, w * h * sizeof(l_uint32), 0, nullptr, nullptr,
1293  &clStatus);
1294 CHECK_OPENCL(clStatus, "clEnqueueMapBuffer outputCl");
1295 clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, outputCl, ptr, 0, nullptr, nullptr);
1296 
1297 // Sync
1298 clFinish(rEnv.mpkCmdQueue);
1299 PERF_COUNT_SUB("kernel & map")
1301  return pResult;
1302 }
1303 
1304 //Morphology Dilate operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
1305 static cl_int pixDilateCL_55(l_int32 wpl, l_int32 h)
1306 {
1307  size_t globalThreads[2];
1308  cl_mem pixtemp;
1309  cl_int status;
1310  int gsize;
1311  size_t localThreads[2];
1312 
1313  //Horizontal pass
1314  gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1315  globalThreads[0] = gsize;
1316  globalThreads[1] = GROUPSIZE_HMORY;
1317  localThreads[0] = GROUPSIZE_HMORX;
1318  localThreads[1] = GROUPSIZE_HMORY;
1319 
1320  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateHor_5x5", &status );
1321  CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_5x5");
1322 
1323  status = clSetKernelArg(rEnv.mpkKernel,
1324  0,
1325  sizeof(cl_mem),
1326  &pixsCLBuffer);
1327  status = clSetKernelArg(rEnv.mpkKernel,
1328  1,
1329  sizeof(cl_mem),
1330  &pixdCLBuffer);
1331  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1332  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1333 
1334  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1335  nullptr, globalThreads, localThreads, 0,
1336  nullptr, nullptr);
1337 
1338  //Swap source and dest buffers
1339  pixtemp = pixsCLBuffer;
1340  pixsCLBuffer = pixdCLBuffer;
1341  pixdCLBuffer = pixtemp;
1342 
1343  //Vertical
1344  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1345  globalThreads[0] = gsize;
1346  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1347  globalThreads[1] = gsize;
1348  localThreads[0] = GROUPSIZE_X;
1349  localThreads[1] = GROUPSIZE_Y;
1350 
1351  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer_5x5", &status );
1352  CHECK_OPENCL(status, "clCreateKernel morphoDilateVer_5x5");
1353 
1354  status = clSetKernelArg(rEnv.mpkKernel,
1355  0,
1356  sizeof(cl_mem),
1357  &pixsCLBuffer);
1358  status = clSetKernelArg(rEnv.mpkKernel,
1359  1,
1360  sizeof(cl_mem),
1361  &pixdCLBuffer);
1362  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1363  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1364  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1365  nullptr, globalThreads, localThreads, 0,
1366  nullptr, nullptr);
1367 
1368  return status;
1369 }
1370 
1371 //Morphology Erode operation for 5x5 structuring element. Invokes the relevant OpenCL kernels
1372 static cl_int pixErodeCL_55(l_int32 wpl, l_int32 h)
1373 {
1374  size_t globalThreads[2];
1375  cl_mem pixtemp;
1376  cl_int status;
1377  int gsize;
1378  l_uint32 fwmask, lwmask;
1379  size_t localThreads[2];
1380 
1381  lwmask = lmask32[31 - 2];
1382  fwmask = rmask32[31 - 2];
1383 
1384  //Horizontal pass
1385  gsize = (wpl*h + GROUPSIZE_HMORX - 1)/ GROUPSIZE_HMORX * GROUPSIZE_HMORX;
1386  globalThreads[0] = gsize;
1387  globalThreads[1] = GROUPSIZE_HMORY;
1388  localThreads[0] = GROUPSIZE_HMORX;
1389  localThreads[1] = GROUPSIZE_HMORY;
1390 
1391  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeHor_5x5", &status );
1392  CHECK_OPENCL(status, "clCreateKernel morphoErodeHor_5x5");
1393 
1394  status = clSetKernelArg(rEnv.mpkKernel,
1395  0,
1396  sizeof(cl_mem),
1397  &pixsCLBuffer);
1398  status = clSetKernelArg(rEnv.mpkKernel,
1399  1,
1400  sizeof(cl_mem),
1401  &pixdCLBuffer);
1402  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1403  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1404 
1405  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1406  nullptr, globalThreads, localThreads, 0,
1407  nullptr, nullptr);
1408 
1409  //Swap source and dest buffers
1410  pixtemp = pixsCLBuffer;
1411  pixsCLBuffer = pixdCLBuffer;
1412  pixdCLBuffer = pixtemp;
1413 
1414  //Vertical
1415  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1416  globalThreads[0] = gsize;
1417  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1418  globalThreads[1] = gsize;
1419  localThreads[0] = GROUPSIZE_X;
1420  localThreads[1] = GROUPSIZE_Y;
1421 
1422  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoErodeVer_5x5", &status );
1423  CHECK_OPENCL(status, "clCreateKernel morphoErodeVer_5x5");
1424 
1425  status = clSetKernelArg(rEnv.mpkKernel,
1426  0,
1427  sizeof(cl_mem),
1428  &pixsCLBuffer);
1429  status = clSetKernelArg(rEnv.mpkKernel,
1430  1,
1431  sizeof(cl_mem),
1432  &pixdCLBuffer);
1433  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1434  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1435  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(fwmask), &fwmask);
1436  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(lwmask), &lwmask);
1437  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1438  nullptr, globalThreads, localThreads, 0,
1439  nullptr, nullptr);
1440 
1441  return status;
1442 }
1443 
1444 //Morphology Dilate operation. Invokes the relevant OpenCL kernels
1445 static cl_int
1446 pixDilateCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
1447 {
1448  l_int32 xp, yp, xn, yn;
1449  SEL* sel;
1450  size_t globalThreads[2];
1451  cl_mem pixtemp;
1452  cl_int status;
1453  int gsize;
1454  size_t localThreads[2];
1455  char isEven;
1456 
1457  OpenclDevice::SetKernelEnv( &rEnv );
1458 
1459  if (hsize == 5 && vsize == 5)
1460  {
1461  //Specific case for 5x5
1462  status = pixDilateCL_55(wpl, h);
1463  return status;
1464  }
1465 
1466  sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1467 
1468  selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1469  selDestroy(&sel);
1470  //global and local work dimensions for Horizontal pass
1471  gsize = (wpl + GROUPSIZE_X - 1)/ GROUPSIZE_X * GROUPSIZE_X;
1472  globalThreads[0] = gsize;
1473  gsize = (h + GROUPSIZE_Y - 1)/ GROUPSIZE_Y * GROUPSIZE_Y;
1474  globalThreads[1] = gsize;
1475  localThreads[0] = GROUPSIZE_X;
1476  localThreads[1] = GROUPSIZE_Y;
1477 
1478  if (xp > 31 || xn > 31)
1479  {
1480  // Generic case.
1481  rEnv.mpkKernel =
1482  clCreateKernel(rEnv.mpkProgram, "morphoDilateHor", &status);
1483  CHECK_OPENCL(status, "clCreateKernel morphoDilateHor");
1484 
1485  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1486  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1487  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1488  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1489  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1490  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1491  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1492  nullptr, globalThreads, localThreads, 0,
1493  nullptr, nullptr);
1494 
1495  if (yp > 0 || yn > 0) {
1496  pixtemp = pixsCLBuffer;
1497  pixsCLBuffer = pixdCLBuffer;
1498  pixdCLBuffer = pixtemp;
1499  }
1500  }
1501  else if (xp > 0 || xn > 0 )
1502  {
1503  // Specific Horizontal pass kernel for half width < 32
1504  rEnv.mpkKernel =
1505  clCreateKernel(rEnv.mpkProgram, "morphoDilateHor_32word", &status);
1506  CHECK_OPENCL(status, "clCreateKernel morphoDilateHor_32word");
1507  isEven = (xp != xn);
1508 
1509  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1510  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1511  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1512  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1513  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1514  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isEven), &isEven);
1515  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1516  nullptr, globalThreads, localThreads, 0,
1517  nullptr, nullptr);
1518 
1519  if (yp > 0 || yn > 0) {
1520  pixtemp = pixsCLBuffer;
1521  pixsCLBuffer = pixdCLBuffer;
1522  pixdCLBuffer = pixtemp;
1523  }
1524  }
1525 
1526  if (yp > 0 || yn > 0)
1527  {
1528  rEnv.mpkKernel = clCreateKernel( rEnv.mpkProgram, "morphoDilateVer", &status );
1529  CHECK_OPENCL(status, "clCreateKernel morphoDilateVer");
1530 
1531  status = clSetKernelArg(rEnv.mpkKernel,
1532  0,
1533  sizeof(cl_mem),
1534  &pixsCLBuffer);
1535  status = clSetKernelArg(rEnv.mpkKernel,
1536  1,
1537  sizeof(cl_mem),
1538  &pixdCLBuffer);
1539  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1540  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1541  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1542  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(yn), &yn);
1543  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1544  nullptr, globalThreads, localThreads, 0,
1545  nullptr, nullptr);
1546  }
1547 
1548  return status;
1549 }
1550 
1551 //Morphology Erode operation. Invokes the relevant OpenCL kernels
1552 static cl_int pixErodeCL(l_int32 hsize, l_int32 vsize, l_uint32 wpl, l_uint32 h) {
1553  l_int32 xp, yp, xn, yn;
1554  SEL *sel;
1555  size_t globalThreads[2];
1556  size_t localThreads[2];
1557  cl_mem pixtemp;
1558  cl_int status;
1559  int gsize;
1560  char isAsymmetric = (MORPH_BC == ASYMMETRIC_MORPH_BC);
1561  l_uint32 rwmask, lwmask;
1562  char isEven;
1563 
1564  sel = selCreateBrick(vsize, hsize, vsize / 2, hsize / 2, SEL_HIT);
1565 
1566  selFindMaxTranslations(sel, &xp, &yp, &xn, &yn);
1567  selDestroy(&sel);
1568  OpenclDevice::SetKernelEnv(&rEnv);
1569 
1570  if (hsize == 5 && vsize == 5 && isAsymmetric) {
1571  // Specific kernel for 5x5
1572  status = pixErodeCL_55(wpl, h);
1573  return status;
1574  }
1575 
1576  lwmask = lmask32[31 - (xn & 31)];
1577  rwmask = rmask32[31 - (xp & 31)];
1578 
1579  // global and local work dimensions for Horizontal pass
1580  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1581  globalThreads[0] = gsize;
1582  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1583  globalThreads[1] = gsize;
1584  localThreads[0] = GROUPSIZE_X;
1585  localThreads[1] = GROUPSIZE_Y;
1586 
1587  // Horizontal Pass
1588  if (xp > 31 || xn > 31) {
1589  // Generic case.
1590  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeHor", &status);
1591 
1592  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1593  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1594  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1595  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(xn), &xn);
1596  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(wpl), &wpl);
1597  status = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(h), &h);
1598  status =
1599  clSetKernelArg(rEnv.mpkKernel, 6, sizeof(isAsymmetric), &isAsymmetric);
1600  status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(rwmask), &rwmask);
1601  status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(lwmask), &lwmask);
1602  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1603  nullptr, globalThreads, localThreads, 0,
1604  nullptr, nullptr);
1605 
1606  if (yp > 0 || yn > 0) {
1607  pixtemp = pixsCLBuffer;
1608  pixsCLBuffer = pixdCLBuffer;
1609  pixdCLBuffer = pixtemp;
1610  }
1611  } else if (xp > 0 || xn > 0) {
1612  rEnv.mpkKernel =
1613  clCreateKernel(rEnv.mpkProgram, "morphoErodeHor_32word", &status);
1614  isEven = (xp != xn);
1615 
1616  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1617  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1618  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(xp), &xp);
1619  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1620  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1621  status =
1622  clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1623  status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(rwmask), &rwmask);
1624  status = clSetKernelArg(rEnv.mpkKernel, 7, sizeof(lwmask), &lwmask);
1625  status = clSetKernelArg(rEnv.mpkKernel, 8, sizeof(isEven), &isEven);
1626  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1627  nullptr, globalThreads, localThreads, 0,
1628  nullptr, nullptr);
1629 
1630  if (yp > 0 || yn > 0) {
1631  pixtemp = pixsCLBuffer;
1632  pixsCLBuffer = pixdCLBuffer;
1633  pixdCLBuffer = pixtemp;
1634  }
1635  }
1636 
1637  // Vertical Pass
1638  if (yp > 0 || yn > 0) {
1639  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "morphoErodeVer", &status);
1640  CHECK_OPENCL(status, "clCreateKernel morphoErodeVer");
1641 
1642  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &pixsCLBuffer);
1643  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &pixdCLBuffer);
1644  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(yp), &yp);
1645  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(wpl), &wpl);
1646  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(h), &h);
1647  status =
1648  clSetKernelArg(rEnv.mpkKernel, 5, sizeof(isAsymmetric), &isAsymmetric);
1649  status = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(yn), &yn);
1650  status = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2,
1651  nullptr, globalThreads, localThreads, 0,
1652  nullptr, nullptr);
1653  }
1654 
1655  return status;
1656 }
1657 
1658 //Morphology Open operation. Invokes the relevant OpenCL kernels
1659 static cl_int pixOpenCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
1660 {
1661  cl_int status;
1662  cl_mem pixtemp;
1663 
1664  //Erode followed by Dilate
1665  status = pixErodeCL(hsize, vsize, wpl, h);
1666 
1667  pixtemp = pixsCLBuffer;
1668  pixsCLBuffer = pixdCLBuffer;
1669  pixdCLBuffer = pixtemp;
1670 
1671  status = pixDilateCL(hsize, vsize, wpl, h);
1672 
1673  return status;
1674 }
1675 
1676 //Morphology Close operation. Invokes the relevant OpenCL kernels
1677 static cl_int pixCloseCL(l_int32 hsize, l_int32 vsize, l_int32 wpl, l_int32 h)
1678 {
1679  cl_int status;
1680  cl_mem pixtemp;
1681 
1682  //Dilate followed by Erode
1683  status = pixDilateCL(hsize, vsize, wpl, h);
1684 
1685  pixtemp = pixsCLBuffer;
1686  pixsCLBuffer = pixdCLBuffer;
1687  pixdCLBuffer = pixtemp;
1688 
1689  status = pixErodeCL(hsize, vsize, wpl, h);
1690 
1691  return status;
1692 }
1693 
1694 //output = buffer1 & ~(buffer2)
1695 static
1696 cl_int pixSubtractCL_work(l_uint32 wpl, l_uint32 h, cl_mem buffer1,
1697  cl_mem buffer2, cl_mem outBuffer = nullptr) {
1698  cl_int status;
1699  size_t globalThreads[2];
1700  int gsize;
1701  size_t localThreads[] = {GROUPSIZE_X, GROUPSIZE_Y};
1702 
1703  gsize = (wpl + GROUPSIZE_X - 1) / GROUPSIZE_X * GROUPSIZE_X;
1704  globalThreads[0] = gsize;
1705  gsize = (h + GROUPSIZE_Y - 1) / GROUPSIZE_Y * GROUPSIZE_Y;
1706  globalThreads[1] = gsize;
1707 
1708  if (outBuffer != nullptr) {
1709  rEnv.mpkKernel = clCreateKernel(rEnv.mpkProgram, "pixSubtract", &status);
1710  CHECK_OPENCL(status, "clCreateKernel pixSubtract");
1711  } else {
1712  rEnv.mpkKernel =
1713  clCreateKernel(rEnv.mpkProgram, "pixSubtract_inplace", &status);
1714  CHECK_OPENCL(status, "clCreateKernel pixSubtract_inplace");
1715  }
1716 
1717  // Enqueue a kernel run call.
1718  status = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &buffer1);
1719  status = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(cl_mem), &buffer2);
1720  status = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(wpl), &wpl);
1721  status = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(h), &h);
1722  if (outBuffer != nullptr) {
1723  status = clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &outBuffer);
1724  }
1725  status =
1726  clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 2, nullptr,
1727  globalThreads, localThreads, 0, nullptr, nullptr);
1728 
1729  return status;
1730 }
1731 
1732 // OpenCL implementation of Get Lines from pix function
1733 //Note: Assumes the source and dest opencl buffer are initialized. No check done
1734 void OpenclDevice::pixGetLinesCL(Pix *pixd, Pix *pixs, Pix **pix_vline,
1735  Pix **pix_hline, Pix **pixClosed,
1736  bool getpixClosed, l_int32 close_hsize,
1737  l_int32 close_vsize, l_int32 open_hsize,
1738  l_int32 open_vsize, l_int32 line_hsize,
1739  l_int32 line_vsize) {
1740  l_uint32 wpl, h;
1741  cl_mem pixtemp;
1742 
1743  wpl = pixGetWpl(pixs);
1744  h = pixGetHeight(pixs);
1745 
1746  // First step : Close Morph operation: Dilate followed by Erode
1747  clStatus = pixCloseCL(close_hsize, close_vsize, wpl, h);
1748 
1749  // Copy the Close output to CPU buffer
1750  if (getpixClosed) {
1751  *pixClosed = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pixClosed, pixs,
1752  wpl * h, CL_MAP_READ, true, false);
1753  }
1754 
1755  // Store the output of close operation in an intermediate buffer
1756  // this will be later used for pixsubtract
1757  clStatus =
1758  clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1759  0, sizeof(int) * wpl * h, 0, nullptr, nullptr);
1760 
1761  // Second step: Open Operation - Erode followed by Dilate
1762  pixtemp = pixsCLBuffer;
1763  pixsCLBuffer = pixdCLBuffer;
1764  pixdCLBuffer = pixtemp;
1765 
1766  clStatus = pixOpenCL(open_hsize, open_vsize, wpl, h);
1767 
1768  // Third step: Subtract : (Close - Open)
1769  pixtemp = pixsCLBuffer;
1770  pixsCLBuffer = pixdCLBuffer;
1771  pixdCLBuffer = pixdCLIntermediate;
1772  pixdCLIntermediate = pixtemp;
1773 
1774  clStatus = pixSubtractCL_work(wpl, h, pixdCLBuffer, pixsCLBuffer);
1775 
1776  // Store the output of Hollow operation in an intermediate buffer
1777  // this will be later used
1778  clStatus =
1779  clEnqueueCopyBuffer(rEnv.mpkCmdQueue, pixdCLBuffer, pixdCLIntermediate, 0,
1780  0, sizeof(int) * wpl * h, 0, nullptr, nullptr);
1781 
1782  pixtemp = pixsCLBuffer;
1783  pixsCLBuffer = pixdCLBuffer;
1784  pixdCLBuffer = pixtemp;
1785 
1786  // Fourth step: Get vertical line
1787  // pixOpenBrick(nullptr, pix_hollow, 1, min_line_length);
1788  clStatus = pixOpenCL(1, line_vsize, wpl, h);
1789 
1790  // Copy the vertical line output to CPU buffer
1791  *pix_vline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_vline, pixs, wpl * h,
1792  CL_MAP_READ, true, false);
1793 
1794  pixtemp = pixsCLBuffer;
1795  pixsCLBuffer = pixdCLIntermediate;
1796  pixdCLIntermediate = pixtemp;
1797 
1798  // Fifth step: Get horizontal line
1799  // pixOpenBrick(nullptr, pix_hollow, min_line_length, 1);
1800  clStatus = pixOpenCL(line_hsize, 1, wpl, h);
1801 
1802  // Copy the horizontal line output to CPU buffer
1803  *pix_hline = mapOutputCLBuffer(rEnv, pixdCLBuffer, *pix_hline, pixs, wpl * h,
1804  CL_MAP_READ, true, true);
1805 
1806  return;
1807 }
1808 
1809 /*************************************************************************
1810  * HistogramRect
1811  * Otsu Thresholding Operations
1812  * histogramAllChannels is laid out as all channel 0, then all channel 1...
1813  * only supports 1 or 4 channels (bytes_per_pixel)
1814  ************************************************************************/
1815 int OpenclDevice::HistogramRectOCL(unsigned char *imageData,
1816  int bytes_per_pixel, int bytes_per_line,
1817  int left, // always 0
1818  int top, // always 0
1819  int width, int height, int kHistogramSize,
1820  int *histogramAllChannels) {
1821  PERF_COUNT_START("HistogramRectOCL")
1822  cl_int clStatus;
1823  int retVal = 0;
1824  KernelEnv histKern;
1825  SetKernelEnv(&histKern);
1826  KernelEnv histRedKern;
1827  SetKernelEnv(&histRedKern);
1828  /* map imagedata to device as read only */
1829  // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
1830  // coherent which we don't need.
1831  // faster option would be to allocate initial image buffer
1832  // using a garlic bus memory type
1833  cl_mem imageBuffer = clCreateBuffer(
1834  histKern.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
1835  width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
1836  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1837 
1838  /* setup work group size parameters */
1839  int block_size = 256;
1840  cl_uint numCUs;
1841  clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
1842  sizeof(numCUs), &numCUs, nullptr);
1843  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
1844 
1845  int requestedOccupancy = 10;
1846  int numWorkGroups = numCUs * requestedOccupancy;
1847  int numThreads = block_size * numWorkGroups;
1848  size_t local_work_size[] = {static_cast<size_t>(block_size)};
1849  size_t global_work_size[] = {static_cast<size_t>(numThreads)};
1850  size_t red_global_work_size[] = {
1851  static_cast<size_t>(block_size * kHistogramSize * bytes_per_pixel)};
1852 
1853  /* map histogramAllChannels as write only */
1854  int numBins = kHistogramSize * bytes_per_pixel * numWorkGroups;
1855 
1856  cl_mem histogramBuffer = clCreateBuffer(
1857  histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
1858  kHistogramSize * bytes_per_pixel * sizeof(int), histogramAllChannels,
1859  &clStatus);
1860  CHECK_OPENCL(clStatus, "clCreateBuffer histogramBuffer");
1861 
1862  /* intermediate histogram buffer */
1863  int histRed = 256;
1864  int tmpHistogramBins = kHistogramSize * bytes_per_pixel * histRed;
1865 
1866  cl_mem tmpHistogramBuffer =
1867  clCreateBuffer(histKern.mpkContext, CL_MEM_READ_WRITE,
1868  tmpHistogramBins * sizeof(cl_uint), nullptr, &clStatus);
1869  CHECK_OPENCL(clStatus, "clCreateBuffer tmpHistogramBuffer");
1870 
1871  /* atomic sync buffer */
1872  int *zeroBuffer = new int[1];
1873  zeroBuffer[0] = 0;
1874  cl_mem atomicSyncBuffer = clCreateBuffer(
1875  histKern.mpkContext, CL_MEM_READ_WRITE | CL_MEM_COPY_HOST_PTR,
1876  sizeof(cl_int), zeroBuffer, &clStatus);
1877  CHECK_OPENCL(clStatus, "clCreateBuffer atomicSyncBuffer");
1878  delete[] zeroBuffer;
1879  // Create kernel objects based on bytes_per_pixel
1880  if (bytes_per_pixel == 1) {
1881  histKern.mpkKernel = clCreateKernel(
1882  histKern.mpkProgram, "kernel_HistogramRectOneChannel", &clStatus);
1883  CHECK_OPENCL(clStatus, "clCreateKernel kernel_HistogramRectOneChannel");
1884 
1885  histRedKern.mpkKernel =
1886  clCreateKernel(histRedKern.mpkProgram,
1887  "kernel_HistogramRectOneChannelReduction", &clStatus);
1888  CHECK_OPENCL(clStatus,
1889  "clCreateKernel kernel_HistogramRectOneChannelReduction");
1890  } else {
1891  histKern.mpkKernel = clCreateKernel( histKern.mpkProgram, "kernel_HistogramRectAllChannels", &clStatus );
1892  CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannels");
1893 
1894  histRedKern.mpkKernel = clCreateKernel( histRedKern.mpkProgram, "kernel_HistogramRectAllChannelsReduction", &clStatus );
1895  CHECK_OPENCL( clStatus, "clCreateKernel kernel_HistogramRectAllChannelsReduction");
1896  }
1897 
1898  void *ptr;
1899 
1900  //Initialize tmpHistogramBuffer buffer
1901  ptr = clEnqueueMapBuffer(
1902  histKern.mpkCmdQueue, tmpHistogramBuffer, CL_TRUE, CL_MAP_WRITE, 0,
1903  tmpHistogramBins * sizeof(cl_uint), 0, nullptr, nullptr, &clStatus);
1904  CHECK_OPENCL( clStatus, "clEnqueueMapBuffer tmpHistogramBuffer");
1905 
1906  memset(ptr, 0, tmpHistogramBins*sizeof(cl_uint));
1907  clEnqueueUnmapMemObject(histKern.mpkCmdQueue, tmpHistogramBuffer, ptr, 0,
1908  nullptr, nullptr);
1909 
1910  /* set kernel 1 arguments */
1911  clStatus =
1912  clSetKernelArg(histKern.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
1913  CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
1914  cl_uint numPixels = width*height;
1915  clStatus =
1916  clSetKernelArg(histKern.mpkKernel, 1, sizeof(cl_uint), &numPixels);
1917  CHECK_OPENCL( clStatus, "clSetKernelArg numPixels" );
1918  clStatus = clSetKernelArg(histKern.mpkKernel, 2, sizeof(cl_mem),
1919  &tmpHistogramBuffer);
1920  CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
1921 
1922  /* set kernel 2 arguments */
1923  int n = numThreads/bytes_per_pixel;
1924  clStatus = clSetKernelArg(histRedKern.mpkKernel, 0, sizeof(cl_int), &n);
1925  CHECK_OPENCL( clStatus, "clSetKernelArg imageBuffer");
1926  clStatus = clSetKernelArg(histRedKern.mpkKernel, 1, sizeof(cl_mem),
1927  &tmpHistogramBuffer);
1928  CHECK_OPENCL( clStatus, "clSetKernelArg tmpHistogramBuffer");
1929  clStatus = clSetKernelArg(histRedKern.mpkKernel, 2, sizeof(cl_mem),
1930  &histogramBuffer);
1931  CHECK_OPENCL( clStatus, "clSetKernelArg histogramBuffer");
1932 
1933  /* launch histogram */
1934 PERF_COUNT_SUB("before")
1935 clStatus = clEnqueueNDRangeKernel(histKern.mpkCmdQueue, histKern.mpkKernel, 1,
1936  nullptr, global_work_size, local_work_size, 0,
1937  nullptr, nullptr);
1938 CHECK_OPENCL(clStatus,
1939  "clEnqueueNDRangeKernel kernel_HistogramRectAllChannels");
1940 clFinish(histKern.mpkCmdQueue);
1941 if (clStatus != 0) {
1942  retVal = -1;
1943  }
1944  /* launch histogram */
1945  clStatus = clEnqueueNDRangeKernel(
1946  histRedKern.mpkCmdQueue, histRedKern.mpkKernel, 1, nullptr,
1947  red_global_work_size, local_work_size, 0, nullptr, nullptr);
1948  CHECK_OPENCL( clStatus, "clEnqueueNDRangeKernel kernel_HistogramRectAllChannelsReduction" );
1949  clFinish( histRedKern.mpkCmdQueue );
1950  if (clStatus != 0) {
1951  retVal = -1;
1952  }
1953  PERF_COUNT_SUB("redKernel")
1954 
1955  /* map results back from gpu */
1956  ptr = clEnqueueMapBuffer(histRedKern.mpkCmdQueue, histogramBuffer, CL_TRUE,
1957  CL_MAP_READ, 0,
1958  kHistogramSize * bytes_per_pixel * sizeof(int), 0,
1959  nullptr, nullptr, &clStatus);
1960  CHECK_OPENCL( clStatus, "clEnqueueMapBuffer histogramBuffer");
1961  if (clStatus != 0) {
1962  retVal = -1;
1963  }
1964  clEnqueueUnmapMemObject(histRedKern.mpkCmdQueue, histogramBuffer, ptr, 0,
1965  nullptr, nullptr);
1966 
1967  clReleaseMemObject(histogramBuffer);
1968  clReleaseMemObject(imageBuffer);
1969 PERF_COUNT_SUB("after")
1971 return retVal;
1972 }
1973 
1974 /*************************************************************************
1975  * Threshold the rectangle, taking everything except the image buffer pointer
1976  * from the class, using thresholds/hi_values to the output IMAGE.
1977  * only supports 1 or 4 channels
1978  ************************************************************************/
1979 int OpenclDevice::ThresholdRectToPixOCL(unsigned char *imageData,
1980  int bytes_per_pixel, int bytes_per_line,
1981  int *thresholds, int *hi_values,
1982  Pix **pix, int height, int width,
1983  int top, int left) {
1984  PERF_COUNT_START("ThresholdRectToPixOCL")
1985  int retVal = 0;
1986  /* create pix result buffer */
1987  *pix = pixCreate(width, height, 1);
1988  uint32_t *pixData = pixGetData(*pix);
1989  int wpl = pixGetWpl(*pix);
1990  int pixSize = wpl * height * sizeof(uint32_t); // number of pixels
1991 
1992  cl_int clStatus;
1993  KernelEnv rEnv;
1994  SetKernelEnv(&rEnv);
1995 
1996  /* setup work group size parameters */
1997  int block_size = 256;
1998  cl_uint numCUs = 6;
1999  clStatus = clGetDeviceInfo(gpuEnv.mpDevID, CL_DEVICE_MAX_COMPUTE_UNITS,
2000  sizeof(numCUs), &numCUs, nullptr);
2001  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
2002 
2003  int requestedOccupancy = 10;
2004  int numWorkGroups = numCUs * requestedOccupancy;
2005  int numThreads = block_size * numWorkGroups;
2006  size_t local_work_size[] = {(size_t)block_size};
2007  size_t global_work_size[] = {(size_t)numThreads};
2008 
2009  /* map imagedata to device as read only */
2010  // USE_HOST_PTR uses onion+ bus which is slowest option; also happens to be
2011  // coherent which we don't need.
2012  // faster option would be to allocate initial image buffer
2013  // using a garlic bus memory type
2014  cl_mem imageBuffer = clCreateBuffer(
2015  rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
2016  width * height * bytes_per_pixel * sizeof(char), imageData, &clStatus);
2017  CHECK_OPENCL(clStatus, "clCreateBuffer imageBuffer");
2018 
2019  /* map pix as write only */
2020  pixThBuffer =
2021  clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_WRITE | CL_MEM_USE_HOST_PTR,
2022  pixSize, pixData, &clStatus);
2023  CHECK_OPENCL(clStatus, "clCreateBuffer pix");
2024 
2025  /* map thresholds and hi_values */
2026  cl_mem thresholdsBuffer =
2027  clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
2028  bytes_per_pixel * sizeof(int), thresholds, &clStatus);
2029  CHECK_OPENCL(clStatus, "clCreateBuffer thresholdBuffer");
2030  cl_mem hiValuesBuffer =
2031  clCreateBuffer(rEnv.mpkContext, CL_MEM_READ_ONLY | CL_MEM_USE_HOST_PTR,
2032  bytes_per_pixel * sizeof(int), hi_values, &clStatus);
2033  CHECK_OPENCL(clStatus, "clCreateBuffer hiValuesBuffer");
2034 
2035  /* compile kernel */
2036  if (bytes_per_pixel == 4) {
2037  rEnv.mpkKernel =
2038  clCreateKernel(rEnv.mpkProgram, "kernel_ThresholdRectToPix", &clStatus);
2039  CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix");
2040  } else {
2041  rEnv.mpkKernel = clCreateKernel(
2042  rEnv.mpkProgram, "kernel_ThresholdRectToPix_OneChan", &clStatus);
2043  CHECK_OPENCL(clStatus, "clCreateKernel kernel_ThresholdRectToPix_OneChan");
2044  }
2045 
2046  /* set kernel arguments */
2047  clStatus = clSetKernelArg(rEnv.mpkKernel, 0, sizeof(cl_mem), &imageBuffer);
2048  CHECK_OPENCL(clStatus, "clSetKernelArg imageBuffer");
2049  cl_uint numPixels = width * height;
2050  clStatus = clSetKernelArg(rEnv.mpkKernel, 1, sizeof(int), &height);
2051  CHECK_OPENCL(clStatus, "clSetKernelArg height");
2052  clStatus = clSetKernelArg(rEnv.mpkKernel, 2, sizeof(int), &width);
2053  CHECK_OPENCL(clStatus, "clSetKernelArg width");
2054  clStatus = clSetKernelArg(rEnv.mpkKernel, 3, sizeof(int), &wpl);
2055  CHECK_OPENCL(clStatus, "clSetKernelArg wpl");
2056  clStatus =
2057  clSetKernelArg(rEnv.mpkKernel, 4, sizeof(cl_mem), &thresholdsBuffer);
2058  CHECK_OPENCL(clStatus, "clSetKernelArg thresholdsBuffer");
2059  clStatus = clSetKernelArg(rEnv.mpkKernel, 5, sizeof(cl_mem), &hiValuesBuffer);
2060  CHECK_OPENCL(clStatus, "clSetKernelArg hiValuesBuffer");
2061  clStatus = clSetKernelArg(rEnv.mpkKernel, 6, sizeof(cl_mem), &pixThBuffer);
2062  CHECK_OPENCL(clStatus, "clSetKernelArg pixThBuffer");
2063 
2064  /* launch kernel & wait */
2065  PERF_COUNT_SUB("before")
2066  clStatus = clEnqueueNDRangeKernel(rEnv.mpkCmdQueue, rEnv.mpkKernel, 1,
2067  nullptr, global_work_size, local_work_size,
2068  0, nullptr, nullptr);
2069  CHECK_OPENCL(clStatus, "clEnqueueNDRangeKernel kernel_ThresholdRectToPix");
2070  clFinish(rEnv.mpkCmdQueue);
2071  PERF_COUNT_SUB("kernel")
2072  if (clStatus != 0) {
2073  printf("Setting return value to -1\n");
2074  retVal = -1;
2075  }
2076  /* map results back from gpu */
2077  void *ptr =
2078  clEnqueueMapBuffer(rEnv.mpkCmdQueue, pixThBuffer, CL_TRUE, CL_MAP_READ, 0,
2079  pixSize, 0, nullptr, nullptr, &clStatus);
2080  CHECK_OPENCL(clStatus, "clEnqueueMapBuffer histogramBuffer");
2081  clEnqueueUnmapMemObject(rEnv.mpkCmdQueue, pixThBuffer, ptr, 0, nullptr,
2082  nullptr);
2083 
2084  clReleaseMemObject(imageBuffer);
2085  clReleaseMemObject(thresholdsBuffer);
2086  clReleaseMemObject(hiValuesBuffer);
2087 
2088  PERF_COUNT_SUB("after")
2090  return retVal;
2091 }
2092 
2093 
2094 
2095 /******************************************************************************
2096  * Data Types for Device Selection
2097  *****************************************************************************/
2098 
2099 typedef struct _TessScoreEvaluationInputData {
2100  int height;
2101  int width;
2102  int numChannels;
2103  unsigned char *imageData;
2104  Pix *pix;
2105 } TessScoreEvaluationInputData;
2106 
2107 static void populateTessScoreEvaluationInputData(TessScoreEvaluationInputData *input) {
2108  srand(1);
2109  // 8.5x11 inches @ 300dpi rounded to clean multiples
2110  int height = 3328; // %256
2111  int width = 2560; // %512
2112  int numChannels = 4;
2113  input->height = height;
2114  input->width = width;
2115  input->numChannels = numChannels;
2116  unsigned char (*imageData4)[4] = (unsigned char (*)[4]) malloc(height*width*numChannels*sizeof(unsigned char)); // new unsigned char[4][height*width];
2117  input->imageData = (unsigned char *) &imageData4[0];
2118 
2119  // zero out image
2120  unsigned char pixelWhite[4] = { 0, 0, 0, 255};
2121  unsigned char pixelBlack[4] = {255, 255, 255, 255};
2122  for (int p = 0; p < height*width; p++) {
2123  //unsigned char tmp[4] = imageData4[0];
2124  imageData4[p][0] = pixelWhite[0];
2125  imageData4[p][1] = pixelWhite[1];
2126  imageData4[p][2] = pixelWhite[2];
2127  imageData4[p][3] = pixelWhite[3];
2128  }
2129  // random lines to be eliminated
2130  int maxLineWidth = 64; // pixels wide
2131  int numLines = 10;
2132  // vertical lines
2133  for (int i = 0; i < numLines; i++) {
2134  int lineWidth = rand()%maxLineWidth;
2135  int vertLinePos = lineWidth + rand()%(width-2*lineWidth);
2136  //printf("[PI] VerticalLine @ %i (w=%i)\n", vertLinePos, lineWidth);
2137  for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
2138  for (int col = 0; col < height; col++) {
2139  //imageData4[row*width+col] = pixelBlack;
2140  imageData4[row*width+col][0] = pixelBlack[0];
2141  imageData4[row*width+col][1] = pixelBlack[1];
2142  imageData4[row*width+col][2] = pixelBlack[2];
2143  imageData4[row*width+col][3] = pixelBlack[3];
2144  }
2145  }
2146  }
2147  // horizontal lines
2148  for (int i = 0; i < numLines; i++) {
2149  int lineWidth = rand()%maxLineWidth;
2150  int horLinePos = lineWidth + rand()%(height-2*lineWidth);
2151  //printf("[PI] HorizontalLine @ %i (w=%i)\n", horLinePos, lineWidth);
2152  for (int row = 0; row < width; row++) {
2153  for (int col = horLinePos-lineWidth/2; col < horLinePos+lineWidth/2; col++) { // for (int row = vertLinePos-lineWidth/2; row < vertLinePos+lineWidth/2; row++) {
2154  //printf("[PI] HoizLine pix @ (%3i, %3i)\n", row, col);
2155  //imageData4[row*width+col] = pixelBlack;
2156  imageData4[row*width+col][0] = pixelBlack[0];
2157  imageData4[row*width+col][1] = pixelBlack[1];
2158  imageData4[row*width+col][2] = pixelBlack[2];
2159  imageData4[row*width+col][3] = pixelBlack[3];
2160  }
2161  }
2162  }
2163  // spots (noise, squares)
2164  float fractionBlack = 0.1; // how much of the image should be blackened
2165  int numSpots = (height*width)*fractionBlack/(maxLineWidth*maxLineWidth/2/2);
2166  for (int i = 0; i < numSpots; i++) {
2167  int lineWidth = rand()%maxLineWidth;
2168  int col = lineWidth + rand()%(width-2*lineWidth);
2169  int row = lineWidth + rand()%(height-2*lineWidth);
2170  //printf("[PI] Spot[%i/%i] @ (%3i, %3i)\n", i, numSpots, row, col );
2171  for (int r = row-lineWidth/2; r < row+lineWidth/2; r++) {
2172  for (int c = col-lineWidth/2; c < col+lineWidth/2; c++) {
2173  //printf("[PI] \tSpot[%i/%i] @ (%3i, %3i)\n", i, numSpots, r, c );
2174  //imageData4[row*width+col] = pixelBlack;
2175  imageData4[r*width+c][0] = pixelBlack[0];
2176  imageData4[r*width+c][1] = pixelBlack[1];
2177  imageData4[r*width+c][2] = pixelBlack[2];
2178  imageData4[r*width+c][3] = pixelBlack[3];
2179  }
2180  }
2181  }
2182 
2183  input->pix = pixCreate(input->width, input->height, 1);
2184 }
2185 
2186 typedef struct _TessDeviceScore {
2187  float time; // small time means faster device
2188  bool clError; // were there any opencl errors
2189  bool valid; // was the correct response generated
2190 } TessDeviceScore;
2191 
2192 /******************************************************************************
2193  * Micro Benchmarks for Device Selection
2194  *****************************************************************************/
2195 
2196 static double composeRGBPixelMicroBench(GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type) {
2197  double time = 0;
2198 #if ON_WINDOWS
2199  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2200  QueryPerformanceFrequency(&freq);
2201 #elif ON_APPLE
2202  mach_timebase_info_data_t info = {0, 0};
2203  mach_timebase_info(&info);
2204  long long start, stop;
2205 #else
2206  timespec time_funct_start, time_funct_end;
2207 #endif
2208  // input data
2209  l_uint32 *tiffdata = (l_uint32 *)input.imageData;// same size and random data; data doesn't change workload
2210 
2211  // function call
2212  if (type == DS_DEVICE_OPENCL_DEVICE) {
2213 #if ON_WINDOWS
2214  QueryPerformanceCounter(&time_funct_start);
2215 #elif ON_APPLE
2216  start = mach_absolute_time();
2217 #else
2218  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2219 #endif
2220 
2221  OpenclDevice::gpuEnv = *env;
2222  int wpl = pixGetWpl(input.pix);
2223  OpenclDevice::pixReadFromTiffKernel(tiffdata, input.width, input.height,
2224  wpl, nullptr);
2225 #if ON_WINDOWS
2226  QueryPerformanceCounter(&time_funct_end);
2227  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2228 #elif ON_APPLE
2229  stop = mach_absolute_time();
2230  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2231 #else
2232  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2233  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2234 #endif
2235 
2236  } else {
2237 #if ON_WINDOWS
2238  QueryPerformanceCounter(&time_funct_start);
2239 #elif ON_APPLE
2240  start = mach_absolute_time();
2241 #else
2242  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2243 #endif
2244  Pix *pix = pixCreate(input.width, input.height, 32);
2245  l_uint32 *pixData = pixGetData(pix);
2246  int wpl = pixGetWpl(pix);
2247  //l_uint32* output_gpu=pixReadFromTiffKernel(tiffdata,w,h,wpl,line);
2248  //pixSetData(pix, output_gpu);
2249  int i, j;
2250  int idx = 0;
2251  for (i = 0; i < input.height ; i++) {
2252  for (j = 0; j < input.width; j++) {
2253  l_uint32 tiffword = tiffdata[i * input.width + j];
2254  l_int32 rval = ((tiffword) & 0xff);
2255  l_int32 gval = (((tiffword) >> 8) & 0xff);
2256  l_int32 bval = (((tiffword) >> 16) & 0xff);
2257  l_uint32 value = (rval << 24) | (gval << 16) | (bval << 8);
2258  pixData[idx] = value;
2259  idx++;
2260  }
2261  }
2262 #if ON_WINDOWS
2263  QueryPerformanceCounter(&time_funct_end);
2264  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2265 #elif ON_APPLE
2266  stop = mach_absolute_time();
2267  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2268 #else
2269  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2270  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2271 #endif
2272  pixDestroy(&pix);
2273  }
2274 
2275 
2276  // cleanup
2277 
2278  return time;
2279 }
2280 
2281 static double histogramRectMicroBench( GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type ) {
2282  double time;
2283 #if ON_WINDOWS
2284  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2285  QueryPerformanceFrequency(&freq);
2286 #elif ON_APPLE
2287  mach_timebase_info_data_t info = {0, 0};
2288  mach_timebase_info(&info);
2289  long long start, stop;
2290 #else
2291  timespec time_funct_start, time_funct_end;
2292 #endif
2293 
2294  unsigned char pixelHi = (unsigned char)255;
2295 
2296  int left = 0;
2297  int top = 0;
2298  int kHistogramSize = 256;
2299  int bytes_per_line = input.width*input.numChannels;
2300  int *histogramAllChannels = new int[kHistogramSize*input.numChannels];
2301  int retVal = 0;
2302  // function call
2303  if (type == DS_DEVICE_OPENCL_DEVICE) {
2304 #if ON_WINDOWS
2305  QueryPerformanceCounter(&time_funct_start);
2306 #elif ON_APPLE
2307  start = mach_absolute_time();
2308 #else
2309  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2310 #endif
2311 
2312  OpenclDevice::gpuEnv = *env;
2313  int wpl = pixGetWpl(input.pix);
2314  retVal = OpenclDevice::HistogramRectOCL(
2315  input.imageData, input.numChannels, bytes_per_line, top, left,
2316  input.width, input.height, kHistogramSize, histogramAllChannels);
2317 
2318 #if ON_WINDOWS
2319  QueryPerformanceCounter(&time_funct_end);
2320  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2321 #elif ON_APPLE
2322  stop = mach_absolute_time();
2323  if (retVal == 0) {
2324  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2325  } else {
2326  time = FLT_MAX;
2327  }
2328 #else
2329  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2330  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2331 #endif
2332  } else {
2333  int *histogram = new int[kHistogramSize];
2334 #if ON_WINDOWS
2335  QueryPerformanceCounter(&time_funct_start);
2336 #elif ON_APPLE
2337  start = mach_absolute_time();
2338 #else
2339  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2340 #endif
2341  for (int ch = 0; ch < input.numChannels; ++ch) {
2342  tesseract::HistogramRect(input.pix, input.numChannels, left, top,
2343  input.width, input.height, histogram);
2344  }
2345 #if ON_WINDOWS
2346  QueryPerformanceCounter(&time_funct_end);
2347  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2348 #elif ON_APPLE
2349  stop = mach_absolute_time();
2350  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2351 #else
2352  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2353  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2354 #endif
2355  delete[] histogram;
2356  }
2357 
2358  // cleanup
2359  delete[] histogramAllChannels;
2360  return time;
2361 }
2362 
2363 //Reproducing the ThresholdRectToPix native version
2364 static void ThresholdRectToPix_Native(const unsigned char* imagedata,
2365  int bytes_per_pixel,
2366  int bytes_per_line,
2367  const int* thresholds,
2368  const int* hi_values,
2369  Pix** pix) {
2370  int top = 0;
2371  int left = 0;
2372  int width = pixGetWidth(*pix);
2373  int height = pixGetHeight(*pix);
2374 
2375  *pix = pixCreate(width, height, 1);
2376  uint32_t *pixdata = pixGetData(*pix);
2377  int wpl = pixGetWpl(*pix);
2378  const unsigned char* srcdata = imagedata + top * bytes_per_line +
2379  left * bytes_per_pixel;
2380  for (int y = 0; y < height; ++y) {
2381  const uint8_t *linedata = srcdata;
2382  uint32_t *pixline = pixdata + y * wpl;
2383  for (int x = 0; x < width; ++x, linedata += bytes_per_pixel) {
2384  bool white_result = true;
2385  for (int ch = 0; ch < bytes_per_pixel; ++ch) {
2386  if (hi_values[ch] >= 0 &&
2387  (linedata[ch] > thresholds[ch]) == (hi_values[ch] == 0)) {
2388  white_result = false;
2389  break;
2390  }
2391  }
2392  if (white_result)
2393  CLEAR_DATA_BIT(pixline, x);
2394  else
2395  SET_DATA_BIT(pixline, x);
2396  }
2397  srcdata += bytes_per_line;
2398  }
2399 }
2400 
2401 static double thresholdRectToPixMicroBench(GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type) {
2402  double time;
2403  int retVal = 0;
2404 #if ON_WINDOWS
2405  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2406  QueryPerformanceFrequency(&freq);
2407 #elif ON_APPLE
2408  mach_timebase_info_data_t info = {0, 0};
2409  mach_timebase_info(&info);
2410  long long start, stop;
2411 #else
2412  timespec time_funct_start, time_funct_end;
2413 #endif
2414 
2415  // input data
2416  unsigned char pixelHi = (unsigned char)255;
2417  int* thresholds = new int[4];
2418  thresholds[0] = pixelHi/2;
2419  thresholds[1] = pixelHi/2;
2420  thresholds[2] = pixelHi/2;
2421  thresholds[3] = pixelHi/2;
2422  int *hi_values = new int[4];
2423  thresholds[0] = pixelHi;
2424  thresholds[1] = pixelHi;
2425  thresholds[2] = pixelHi;
2426  thresholds[3] = pixelHi;
2427  //Pix* pix = pixCreate(width, height, 1);
2428  int top = 0;
2429  int left = 0;
2430  int bytes_per_line = input.width*input.numChannels;
2431 
2432  // function call
2433  if (type == DS_DEVICE_OPENCL_DEVICE) {
2434 #if ON_WINDOWS
2435  QueryPerformanceCounter(&time_funct_start);
2436 #elif ON_APPLE
2437  start = mach_absolute_time();
2438 #else
2439  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2440 #endif
2441 
2442  OpenclDevice::gpuEnv = *env;
2443  int wpl = pixGetWpl(input.pix);
2444  retVal = OpenclDevice::ThresholdRectToPixOCL(
2445  input.imageData, input.numChannels, bytes_per_line, thresholds,
2446  hi_values, &input.pix, input.height, input.width, top, left);
2447 
2448 #if ON_WINDOWS
2449  QueryPerformanceCounter(&time_funct_end);
2450  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2451 #elif ON_APPLE
2452  stop = mach_absolute_time();
2453  if (retVal == 0) {
2454  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2455  ;
2456  } else {
2457  time = FLT_MAX;
2458  }
2459 
2460 #else
2461  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2462  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2463 #endif
2464  } else {
2465 
2466 
2467  tesseract::ImageThresholder thresholder;
2468  thresholder.SetImage( input.pix );
2469 #if ON_WINDOWS
2470  QueryPerformanceCounter(&time_funct_start);
2471 #elif ON_APPLE
2472  start = mach_absolute_time();
2473 #else
2474  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2475 #endif
2476  ThresholdRectToPix_Native( input.imageData, input.numChannels, bytes_per_line,
2477  thresholds, hi_values, &input.pix );
2478 
2479 #if ON_WINDOWS
2480  QueryPerformanceCounter(&time_funct_end);
2481  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2482 #elif ON_APPLE
2483  stop = mach_absolute_time();
2484  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2485 #else
2486  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2487  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2488 #endif
2489  }
2490 
2491  // cleanup
2492  delete[] thresholds;
2493  delete[] hi_values;
2494  return time;
2495 }
2496 
2497 static double getLineMasksMorphMicroBench(GPUEnv *env, TessScoreEvaluationInputData input, ds_device_type type) {
2498 
2499  double time = 0;
2500 #if ON_WINDOWS
2501  LARGE_INTEGER freq, time_funct_start, time_funct_end;
2502  QueryPerformanceFrequency(&freq);
2503 #elif ON_APPLE
2504  mach_timebase_info_data_t info = {0, 0};
2505  mach_timebase_info(&info);
2506  long long start, stop;
2507 #else
2508  timespec time_funct_start, time_funct_end;
2509 #endif
2510 
2511  // input data
2512  int resolution = 300;
2513  int wpl = pixGetWpl(input.pix);
2514  int kThinLineFraction = 20; // tess constant
2515  int kMinLineLengthFraction = 4; // tess constant
2516  int max_line_width = resolution / kThinLineFraction;
2517  int min_line_length = resolution / kMinLineLengthFraction;
2518  int closing_brick = max_line_width / 3;
2519 
2520  // function call
2521  if (type == DS_DEVICE_OPENCL_DEVICE) {
2522 #if ON_WINDOWS
2523  QueryPerformanceCounter(&time_funct_start);
2524 #elif ON_APPLE
2525  start = mach_absolute_time();
2526 #else
2527  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2528 #endif
2529  Pix *src_pix = input.pix;
2530  OpenclDevice::gpuEnv = *env;
2531  OpenclDevice::initMorphCLAllocations(wpl, input.height, input.pix);
2532  Pix *pix_vline = nullptr, *pix_hline = nullptr, *pix_closed = nullptr;
2533  OpenclDevice::pixGetLinesCL(
2534  nullptr, input.pix, &pix_vline, &pix_hline, &pix_closed, true,
2535  closing_brick, closing_brick, max_line_width, max_line_width,
2536  min_line_length, min_line_length);
2537 
2538  OpenclDevice::releaseMorphCLBuffers();
2539 
2540 #if ON_WINDOWS
2541  QueryPerformanceCounter(&time_funct_end);
2542  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2543 #elif ON_APPLE
2544  stop = mach_absolute_time();
2545  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2546 #else
2547  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2548  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2549 #endif
2550  } else {
2551 #if ON_WINDOWS
2552  QueryPerformanceCounter(&time_funct_start);
2553 #elif ON_APPLE
2554  start = mach_absolute_time();
2555 #else
2556  clock_gettime( CLOCK_MONOTONIC, &time_funct_start );
2557 #endif
2558 
2559  // native serial code
2560  Pix *src_pix = input.pix;
2561  Pix *pix_closed =
2562  pixCloseBrick(nullptr, src_pix, closing_brick, closing_brick);
2563  Pix *pix_solid =
2564  pixOpenBrick(nullptr, pix_closed, max_line_width, max_line_width);
2565  Pix *pix_hollow = pixSubtract(nullptr, pix_closed, pix_solid);
2566  pixDestroy(&pix_solid);
2567  Pix *pix_vline = pixOpenBrick(nullptr, pix_hollow, 1, min_line_length);
2568  Pix *pix_hline = pixOpenBrick(nullptr, pix_hollow, min_line_length, 1);
2569  pixDestroy(&pix_hollow);
2570 
2571 #if ON_WINDOWS
2572  QueryPerformanceCounter(&time_funct_end);
2573  time = (time_funct_end.QuadPart-time_funct_start.QuadPart)/(double)(freq.QuadPart);
2574 #elif ON_APPLE
2575  stop = mach_absolute_time();
2576  time = ((stop - start) * (double)info.numer / info.denom) / 1.0E9;
2577 #else
2578  clock_gettime( CLOCK_MONOTONIC, &time_funct_end );
2579  time = (time_funct_end.tv_sec - time_funct_start.tv_sec)*1.0 + (time_funct_end.tv_nsec - time_funct_start.tv_nsec)/1000000000.0;
2580 #endif
2581  }
2582 
2583  return time;
2584 }
2585 
2586 
2587 
2588 /******************************************************************************
2589  * Device Selection
2590  *****************************************************************************/
2591 
2592 #include "stdlib.h"
2593 
2594 // encode score object as byte string
2595 static ds_status serializeScore( ds_device* device, void **serializedScore, unsigned int* serializedScoreSize ) {
2596  *serializedScoreSize = sizeof(TessDeviceScore);
2597  *serializedScore = new unsigned char[*serializedScoreSize];
2598  memcpy(*serializedScore, device->score, *serializedScoreSize);
2599  return DS_SUCCESS;
2600 }
2601 
2602 // parses byte string and stores in score object
2603 static ds_status deserializeScore( ds_device* device, const unsigned char* serializedScore, unsigned int serializedScoreSize ) {
2604  // check that serializedScoreSize == sizeof(TessDeviceScore);
2605  device->score = new TessDeviceScore;
2606  memcpy(device->score, serializedScore, serializedScoreSize);
2607  return DS_SUCCESS;
2608 }
2609 
2610 static ds_status releaseScore(void *score) {
2611  delete (TessDeviceScore *)score;
2612  return DS_SUCCESS;
2613 }
2614 
2615 // evaluate devices
2616 static ds_status evaluateScoreForDevice( ds_device *device, void *inputData) {
2617  // overwrite statuc gpuEnv w/ current device
2618  // so native opencl calls can be used; they use static gpuEnv
2619  printf("\n[DS] Device: \"%s\" (%s) evaluation...\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" );
2620  GPUEnv *env = nullptr;
2621  if (device->type == DS_DEVICE_OPENCL_DEVICE) {
2622  env = new GPUEnv;
2623  //printf("[DS] populating tmp GPUEnv from device\n");
2624  populateGPUEnvFromDevice( env, device->oclDeviceID);
2625  env->mnFileCount = 0; //argc;
2626  env->mnKernelCount = 0UL;
2627  //printf("[DS] compiling kernels for tmp GPUEnv\n");
2628  OpenclDevice::gpuEnv = *env;
2629  OpenclDevice::CompileKernelFile(env, "");
2630  }
2631 
2632  TessScoreEvaluationInputData *input = (TessScoreEvaluationInputData *)inputData;
2633 
2634  // pixReadTiff
2635  double composeRGBPixelTime = composeRGBPixelMicroBench( env, *input, device->type );
2636 
2637  // HistogramRect
2638  double histogramRectTime = histogramRectMicroBench( env, *input, device->type );
2639 
2640  // ThresholdRectToPix
2641  double thresholdRectToPixTime = thresholdRectToPixMicroBench( env, *input, device->type );
2642 
2643  // getLineMasks
2644  double getLineMasksMorphTime = getLineMasksMorphMicroBench( env, *input, device->type );
2645 
2646 
2647  // weigh times (% of cpu time)
2648  // these weights should be the % execution time that the native cpu code took
2649  float composeRGBPixelWeight = 1.2f;
2650  float histogramRectWeight = 2.4f;
2651  float thresholdRectToPixWeight = 4.5f;
2652  float getLineMasksMorphWeight = 5.0f;
2653 
2654  float weightedTime = composeRGBPixelWeight * composeRGBPixelTime +
2655  histogramRectWeight * histogramRectTime +
2656  thresholdRectToPixWeight * thresholdRectToPixTime +
2657  getLineMasksMorphWeight * getLineMasksMorphTime;
2658  device->score = new TessDeviceScore;
2659  ((TessDeviceScore *)device->score)->time = weightedTime;
2660 
2661  printf("[DS] Device: \"%s\" (%s) evaluated\n", device->oclDeviceName, device->type==DS_DEVICE_OPENCL_DEVICE ? "OpenCL" : "Native" );
2662  printf("[DS]%25s: %f (w=%.1f)\n", "composeRGBPixel", composeRGBPixelTime, composeRGBPixelWeight );
2663  printf("[DS]%25s: %f (w=%.1f)\n", "HistogramRect", histogramRectTime, histogramRectWeight );
2664  printf("[DS]%25s: %f (w=%.1f)\n", "ThresholdRectToPix", thresholdRectToPixTime, thresholdRectToPixWeight );
2665  printf("[DS]%25s: %f (w=%.1f)\n", "getLineMasksMorph", getLineMasksMorphTime, getLineMasksMorphWeight );
2666  printf("[DS]%25s: %f\n", "Score", ((TessDeviceScore *)device->score)->time );
2667  return DS_SUCCESS;
2668 }
2669 
2670 // initial call to select device
2671 ds_device OpenclDevice::getDeviceSelection( ) {
2672  if (!deviceIsSelected) {
2673  PERF_COUNT_START("getDeviceSelection")
2674  // check if opencl is available at runtime
2675  if (1 == LoadOpencl()) {
2676  // opencl is available
2677  // PERF_COUNT_SUB("LoadOpencl")
2678  // setup devices
2679  ds_status status;
2680  ds_profile *profile;
2681  status = initDSProfile(&profile, "v0.1");
2682  PERF_COUNT_SUB("initDSProfile")
2683  // try reading scores from file
2684  const char *fileName = "tesseract_opencl_profile_devices.dat";
2685  status = readProfileFromFile(profile, deserializeScore, fileName);
2686  if (status != DS_SUCCESS) {
2687  // need to run evaluation
2688  printf("[DS] Profile file not available (%s); performing profiling.\n",
2689  fileName);
2690 
2691  // create input data
2692  TessScoreEvaluationInputData input;
2693  populateTessScoreEvaluationInputData(&input);
2694  // PERF_COUNT_SUB("populateTessScoreEvaluationInputData")
2695  // perform evaluations
2696  unsigned int numUpdates;
2697  status = profileDevices(profile, DS_EVALUATE_ALL,
2698  evaluateScoreForDevice, &input, &numUpdates);
2699  PERF_COUNT_SUB("profileDevices")
2700  // write scores to file
2701  if (status == DS_SUCCESS) {
2702  status = writeProfileToFile(profile, serializeScore, fileName);
2703  PERF_COUNT_SUB("writeProfileToFile")
2704  if (status == DS_SUCCESS) {
2705  printf("[DS] Scores written to file (%s).\n", fileName);
2706  } else {
2707  printf(
2708  "[DS] Error saving scores to file (%s); scores not written to "
2709  "file.\n",
2710  fileName);
2711  }
2712  } else {
2713  printf(
2714  "[DS] Unable to evaluate performance; scores not written to "
2715  "file.\n");
2716  }
2717  } else {
2718  PERF_COUNT_SUB("readProfileFromFile")
2719  printf("[DS] Profile read from file (%s).\n", fileName);
2720  }
2721 
2722  // we now have device scores either from file or evaluation
2723  // select fastest using custom Tesseract selection algorithm
2724  float bestTime = FLT_MAX; // begin search with worst possible time
2725  int bestDeviceIdx = -1;
2726  for (int d = 0; d < profile->numDevices; d++) {
2727  ds_device device = profile->devices[d];
2728  TessDeviceScore score = *(TessDeviceScore *)device.score;
2729 
2730  float time = score.time;
2731  printf("[DS] Device[%i] %i:%s score is %f\n", d + 1, device.type,
2732  device.oclDeviceName, time);
2733  if (time < bestTime) {
2734  bestTime = time;
2735  bestDeviceIdx = d;
2736  }
2737  }
2738  printf("[DS] Selected Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2739  profile->devices[bestDeviceIdx].oclDeviceName,
2740  profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2741  ? "OpenCL"
2742  : "Native");
2743  // cleanup
2744  // TODO: call destructor for profile object?
2745 
2746  bool overridden = false;
2747  char *overrideDeviceStr = getenv("TESSERACT_OPENCL_DEVICE");
2748  if (overrideDeviceStr != nullptr) {
2749  int overrideDeviceIdx = atoi(overrideDeviceStr);
2750  if (overrideDeviceIdx > 0 && overrideDeviceIdx <= profile->numDevices) {
2751  printf(
2752  "[DS] Overriding Device Selection (TESSERACT_OPENCL_DEVICE=%s, "
2753  "%i)\n",
2754  overrideDeviceStr, overrideDeviceIdx);
2755  bestDeviceIdx = overrideDeviceIdx - 1;
2756  overridden = true;
2757  } else {
2758  printf(
2759  "[DS] Ignoring invalid TESSERACT_OPENCL_DEVICE=%s ([1,%i] are "
2760  "valid devices).\n",
2761  overrideDeviceStr, profile->numDevices);
2762  }
2763  }
2764 
2765  if (overridden) {
2766  printf("[DS] Overridden Device[%i]: \"%s\" (%s)\n", bestDeviceIdx + 1,
2767  profile->devices[bestDeviceIdx].oclDeviceName,
2768  profile->devices[bestDeviceIdx].type == DS_DEVICE_OPENCL_DEVICE
2769  ? "OpenCL"
2770  : "Native");
2771  }
2772  selectedDevice = profile->devices[bestDeviceIdx];
2773  // cleanup
2774  releaseDSProfile(profile, releaseScore);
2775  } else {
2776  // opencl isn't available at runtime, select native cpu device
2777  printf("[DS] OpenCL runtime not available.\n");
2778  selectedDevice.type = DS_DEVICE_NATIVE_CPU;
2779  selectedDevice.oclDeviceName = "(null)";
2780  selectedDevice.score = nullptr;
2781  selectedDevice.oclDeviceID = nullptr;
2782  selectedDevice.oclDriverVersion = nullptr;
2783  }
2784  deviceIsSelected = true;
2785  PERF_COUNT_SUB("select from Profile")
2787  }
2788  // PERF_COUNT_END
2789  return selectedDevice;
2790 }
2791 
2792 
2793 bool OpenclDevice::selectedDeviceIsOpenCL() {
2794  ds_device device = getDeviceSelection();
2795  return (device.type == DS_DEVICE_OPENCL_DEVICE);
2796 }
2797 
2798 #endif
const int kHistogramSize
Definition: otsuthr.h:27
#define PERF_COUNT_START(FUNCT_NAME)
voidpf void uLong size
Definition: ioapi.h:39
#define SEEK_SET
Definition: ioapi.c:29
const int kMinLineLengthFraction
Denominator of resolution makes min pixels to demand line lengths to be.
Definition: linefind.cpp:43
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:62
const char * kernel_src
Definition: oclkernels.h:21
void HistogramRect(Pix *src_pix, int channel, int left, int top, int width, int height, int *histogram)
Definition: otsuthr.cpp:151
#define PERF_COUNT_END
const char * filename
Definition: ioapi.h:38
typedef int(ZCALLBACK *close_file_func) OF((voidpf opaque
#define PERF_COUNT_SUB(SUB)
#define SEEK_END
Definition: ioapi.c:25
const int kThinLineFraction
Denominator of resolution makes max pixel width to allow thin lines.
Definition: linefind.cpp:41