tesseract  4.00.00dev
baseapi.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: baseapi.cpp
3  * Description: Simple API for calling tesseract.
4  * Author: Ray Smith
5  * Created: Fri Oct 06 15:35:01 PDT 2006
6  *
7  * (C) Copyright 2006, Google Inc.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 // Include automatically generated configuration file if running autoconf.
21 #ifdef HAVE_CONFIG_H
22 #include "config_auto.h"
23 #endif
24 
25 #ifdef __linux__
26 #include <signal.h>
27 #endif
28 
29 #if defined(_WIN32)
30 #ifdef _MSC_VER
31 #include "vcsversion.h"
32 #include "mathfix.h"
33 #elif MINGW
34 // workaround for stdlib.h with -std=c++11 for _splitpath and _MAX_FNAME
35 #undef __STRICT_ANSI__
36 #endif // _MSC_VER
37 #include <fcntl.h>
38 #include <io.h>
39 #else
40 #include <dirent.h>
41 #include <libgen.h>
42 #include <string.h>
43 #endif // _WIN32
44 
45 #include <iostream>
46 #include <string>
47 #include <iterator>
48 #include <fstream>
49 #include <memory> // std::unique_ptr
50 
51 #include "allheaders.h"
52 
53 #include "baseapi.h"
54 #include "blobclass.h"
55 #include "resultiterator.h"
56 #include "mutableiterator.h"
57 #include "thresholder.h"
58 #include "tesseractclass.h"
59 #include "pageres.h"
60 #include "paragraphs.h"
61 #include "tessvars.h"
62 #include "control.h"
63 #include "dict.h"
64 #include "pgedit.h"
65 #include "paramsd.h"
66 #include "output.h"
67 #include "globaloc.h"
68 #include "globals.h"
69 #include "edgblob.h"
70 #include "equationdetect.h"
71 #include "tessbox.h"
72 #include "makerow.h"
73 #include "otsuthr.h"
74 #include "osdetect.h"
75 #include "params.h"
76 #include "renderer.h"
77 #include "strngs.h"
78 #include "openclwrapper.h"
79 
80 BOOL_VAR(stream_filelist, FALSE, "Stream a filelist from stdin");
81 
82 namespace tesseract {
83 
// Smallest width or height accepted by TesseractRect(); a rectangle with
// either side below this is rejected without running recognition.
85 const int kMinRectSize = 10;
// NOTE(review): presumably the marker character for a rejected result in
// Tesseract-native text output; not referenced in this part of the file.
87 const char kTesseractReject = '~';
// NOTE(review): presumably the reject marker for UNLV-format output; not
// referenced in this part of the file -- confirm against the UNLV writer.
89 const char kUNLVReject = '~';
// NOTE(review): presumably the suspect-character marker for UNLV-format
// output; not referenced in this part of the file -- confirm.
91 const char kUNLVSuspect = '^';
// NOTE(review): looks like a placeholder input file name; its use is not
// visible in this part of the file -- confirm.
96 const char* kInputFile = "noname.tif";
// File that ProcessPage() writes the current variables to before applying a
// retry config, and reads back afterwards to restore them.
100 const char* kOldVarsFile = "failed_vars.txt";
// Size of the char buffer used to format a page number as decimal text in
// ProcessPagesMultipageTiff().
102 const int kMaxIntSize = 22;
// NOTE(review): presumably the lower bound (in ppi) below which a reported
// image resolution is not trusted; use not visible here -- confirm.
107 const int kMinCredibleResolution = 70;
// NOTE(review): presumably the matching upper bound for a trusted image
// resolution; use not visible here -- confirm.
109 const int kMaxCredibleResolution = 2400;
110 
112  : tesseract_(nullptr),
113  osd_tesseract_(nullptr),
114  equ_detect_(nullptr),
115  reader_(nullptr),
116  // Thresholder is initialized to NULL here, but will be set before use by:
117  // A constructor of a derived API, SetThresholder(), or
118  // created implicitly when used in InternalSetImage.
119  thresholder_(nullptr),
120  paragraph_models_(nullptr),
121  block_list_(nullptr),
122  page_res_(nullptr),
123  input_file_(nullptr),
124  output_file_(nullptr),
125  datapath_(nullptr),
126  language_(nullptr),
127  last_oem_requested_(OEM_DEFAULT),
128  recognition_done_(false),
129  truth_cb_(NULL),
130  rect_left_(0),
131  rect_top_(0),
132  rect_width_(0),
133  rect_height_(0),
134  image_width_(0),
135  image_height_(0) {}
136 
138  End();
139 }
140 
144 const char* TessBaseAPI::Version() {
145 #if defined(GIT_REV) && (defined(DEBUG) || defined(_DEBUG))
146  return GIT_REV;
147 #else
148  return TESSERACT_VERSION_STR;
149 #endif
150 }
151 
159 #ifdef USE_OPENCL
160 #if USE_DEVICE_SELECTION
161 #include "opencl_device_selection.h"
162 #endif
163 #endif
164 size_t TessBaseAPI::getOpenCLDevice(void **data) {
165 #ifdef USE_OPENCL
166 #if USE_DEVICE_SELECTION
167  ds_device device = OpenclDevice::getDeviceSelection();
168  if (device.type == DS_DEVICE_OPENCL_DEVICE) {
169  *data = new cl_device_id;
170  memcpy(*data, &device.oclDeviceID, sizeof(cl_device_id));
171  return sizeof(cl_device_id);
172  }
173 #endif
174 #endif
175 
176  *data = NULL;
177  return 0;
178 }
179 
185 #ifdef __linux__
186  struct sigaction action;
187  memset(&action, 0, sizeof(action));
188  action.sa_handler = &signal_exit;
189  action.sa_flags = SA_RESETHAND;
190  sigaction(SIGSEGV, &action, NULL);
191  sigaction(SIGFPE, &action, NULL);
192  sigaction(SIGBUS, &action, NULL);
193 #else
194  // Warn API users that an implementation is needed.
195  tprintf("CatchSignals has no non-linux implementation!\n");
196 #endif
197 }
198 
203 void TessBaseAPI::SetInputName(const char* name) {
204  if (input_file_ == NULL)
205  input_file_ = new STRING(name);
206  else
207  *input_file_ = name;
208 }
209 
211 void TessBaseAPI::SetOutputName(const char* name) {
212  if (output_file_ == NULL)
213  output_file_ = new STRING(name);
214  else
215  *output_file_ = name;
216 }
217 
218 bool TessBaseAPI::SetVariable(const char* name, const char* value) {
219  if (tesseract_ == NULL) tesseract_ = new Tesseract;
221  tesseract_->params());
222 }
223 
224 bool TessBaseAPI::SetDebugVariable(const char* name, const char* value) {
225  if (tesseract_ == NULL) tesseract_ = new Tesseract;
227  tesseract_->params());
228 }
229 
230 bool TessBaseAPI::GetIntVariable(const char *name, int *value) const {
231  IntParam *p = ParamUtils::FindParam<IntParam>(
233  if (p == NULL) return false;
234  *value = (inT32)(*p);
235  return true;
236 }
237 
238 bool TessBaseAPI::GetBoolVariable(const char *name, bool *value) const {
239  BoolParam *p = ParamUtils::FindParam<BoolParam>(
241  if (p == NULL) return false;
242  *value = (BOOL8)(*p);
243  return true;
244 }
245 
246 const char *TessBaseAPI::GetStringVariable(const char *name) const {
247  StringParam *p = ParamUtils::FindParam<StringParam>(
249  return (p != NULL) ? p->string() : NULL;
250 }
251 
252 bool TessBaseAPI::GetDoubleVariable(const char *name, double *value) const {
253  DoubleParam *p = ParamUtils::FindParam<DoubleParam>(
255  if (p == NULL) return false;
256  *value = (double)(*p);
257  return true;
258 }
259 
261 bool TessBaseAPI::GetVariableAsString(const char *name, STRING *val) {
262  return ParamUtils::GetParamAsString(name, tesseract_->params(), val);
263 }
264 
266 void TessBaseAPI::PrintVariables(FILE *fp) const {
268 }
269 
278 int TessBaseAPI::Init(const char* datapath, const char* language,
279  OcrEngineMode oem, char **configs, int configs_size,
280  const GenericVector<STRING> *vars_vec,
281  const GenericVector<STRING> *vars_values,
282  bool set_only_non_debug_params) {
283  return Init(datapath, 0, language, oem, configs, configs_size, vars_vec,
284  vars_values, set_only_non_debug_params, nullptr);
285 }
286 
287 // In-memory version reads the traineddata file directly from the given
288 // data[data_size] array. Also implements the version with a datapath in data,
289 // flagged by data_size = 0.
290 int TessBaseAPI::Init(const char* data, int data_size, const char* language,
291  OcrEngineMode oem, char** configs, int configs_size,
292  const GenericVector<STRING>* vars_vec,
293  const GenericVector<STRING>* vars_values,
294  bool set_only_non_debug_params, FileReader reader) {
295  PERF_COUNT_START("TessBaseAPI::Init")
296  // Default language is "eng".
297  if (language == nullptr) language = "eng";
298  STRING datapath = data_size == 0 ? data : language;
299  // If the datapath, OcrEngineMode or the language have changed - start again.
300  // Note that the language_ field stores the last requested language that was
301  // initialized successfully, while tesseract_->lang stores the language
302  // actually used. They differ only if the requested language was NULL, in
303  // which case tesseract_->lang is set to the Tesseract default ("eng").
304  if (tesseract_ != nullptr &&
305  (datapath_ == nullptr || language_ == nullptr || *datapath_ != datapath ||
306  last_oem_requested_ != oem ||
307  (*language_ != language && tesseract_->lang != language))) {
308  delete tesseract_;
309  tesseract_ = nullptr;
310  }
311  // PERF_COUNT_SUB("delete tesseract_")
312 #ifdef USE_OPENCL
313  OpenclDevice od;
314  od.InitEnv();
315 #endif
316  PERF_COUNT_SUB("OD::InitEnv()")
317  bool reset_classifier = true;
318  if (tesseract_ == nullptr) {
319  reset_classifier = false;
320  tesseract_ = new Tesseract;
321  if (reader != nullptr) reader_ = reader;
323  if (data_size != 0) {
324  mgr.LoadMemBuffer(language, data, data_size);
325  }
327  datapath.string(),
328  output_file_ != nullptr ? output_file_->string() : nullptr,
329  language, oem, configs, configs_size, vars_vec, vars_values,
330  set_only_non_debug_params, &mgr) != 0) {
331  return -1;
332  }
333  }
334  PERF_COUNT_SUB("update tesseract_")
335  // Update datapath and language requested for the last valid initialization.
336  if (datapath_ == nullptr)
337  datapath_ = new STRING(datapath);
338  else
339  *datapath_ = datapath;
340  if ((strcmp(datapath_->string(), "") == 0) &&
341  (strcmp(tesseract_->datadir.string(), "") != 0))
343 
344  if (language_ == nullptr)
345  language_ = new STRING(language);
346  else
347  *language_ = language;
349  // PERF_COUNT_SUB("update last_oem_requested_")
350  // For same language and datapath, just reset the adaptive classifier.
351  if (reset_classifier) {
353  PERF_COUNT_SUB("tesseract_->ResetAdaptiveClassifier()")
354  }
356  return 0;
357 }
358 
368  return (language_ == NULL || language_->string() == NULL) ?
369  "" : language_->string();
370 }
371 
378  GenericVector<STRING>* langs) const {
379  langs->clear();
380  if (tesseract_ != NULL) {
381  langs->push_back(tesseract_->lang);
382  int num_subs = tesseract_->num_sub_langs();
383  for (int i = 0; i < num_subs; ++i)
384  langs->push_back(tesseract_->get_sub_lang(i)->lang);
385  }
386 }
387 
392  GenericVector<STRING>* langs) const {
393  langs->clear();
394  if (tesseract_ != NULL) {
395 #ifdef _WIN32
396  STRING pattern = tesseract_->datadir + "/*." + kTrainedDataSuffix;
397  char fname[_MAX_FNAME];
398  WIN32_FIND_DATA data;
399  BOOL result = TRUE;
400  HANDLE handle = FindFirstFile(pattern.string(), &data);
401  if (handle != INVALID_HANDLE_VALUE) {
402  for (; result; result = FindNextFile(handle, &data)) {
403  _splitpath(data.cFileName, NULL, NULL, fname, NULL);
404  langs->push_back(STRING(fname));
405  }
406  FindClose(handle);
407  }
408 #else // _WIN32
409  DIR *dir;
410  struct dirent *dirent;
411  char *dot;
412 
413  STRING extension = STRING(".") + kTrainedDataSuffix;
414 
415  dir = opendir(tesseract_->datadir.string());
416  if (dir != NULL) {
417  while ((dirent = readdir(dir))) {
418  // Skip '.', '..', and hidden files
419  if (dirent->d_name[0] != '.') {
420  if (strstr(dirent->d_name, extension.string()) != NULL) {
421  dot = strrchr(dirent->d_name, '.');
422  // This ensures that .traineddata is at the end of the file name
423  if (strncmp(dot, extension.string(),
424  strlen(extension.string())) == 0) {
425  *dot = '\0';
426  langs->push_back(STRING(dirent->d_name));
427  }
428  }
429  }
430  }
431  closedir(dir);
432  }
433 #endif
434  }
435 }
436 
443 int TessBaseAPI::InitLangMod(const char* datapath, const char* language) {
444  if (tesseract_ == NULL)
445  tesseract_ = new Tesseract;
446  else
448  TessdataManager mgr;
449  return tesseract_->init_tesseract_lm(datapath, NULL, language, &mgr);
450 }
451 
457  if (tesseract_ == NULL) {
458  tesseract_ = new Tesseract;
460  }
461 }
462 
470 }
471 
475 }
476 
483  if (tesseract_ == NULL)
484  tesseract_ = new Tesseract;
485  tesseract_->tessedit_pageseg_mode.set_value(mode);
486 }
487 
490  if (tesseract_ == NULL)
491  return PSM_SINGLE_BLOCK;
492  return static_cast<PageSegMode>(
493  static_cast<int>(tesseract_->tessedit_pageseg_mode));
494 }
495 
509 char* TessBaseAPI::TesseractRect(const unsigned char* imagedata,
510  int bytes_per_pixel,
511  int bytes_per_line,
512  int left, int top,
513  int width, int height) {
514  if (tesseract_ == NULL || width < kMinRectSize || height < kMinRectSize)
515  return NULL; // Nothing worth doing.
516 
517  // Since this original api didn't give the exact size of the image,
518  // we have to invent a reasonable value.
519  int bits_per_pixel = bytes_per_pixel == 0 ? 1 : bytes_per_pixel * 8;
520  SetImage(imagedata, bytes_per_line * 8 / bits_per_pixel, height + top,
521  bytes_per_pixel, bytes_per_line);
522  SetRectangle(left, top, width, height);
523 
524  return GetUTF8Text();
525 }
526 
532  if (tesseract_ == NULL)
533  return;
536 }
537 
545 void TessBaseAPI::SetImage(const unsigned char* imagedata,
546  int width, int height,
547  int bytes_per_pixel, int bytes_per_line) {
548  if (InternalSetImage()) {
549  thresholder_->SetImage(imagedata, width, height,
550  bytes_per_pixel, bytes_per_line);
552  }
553 }
554 
556  if (thresholder_)
558  else
559  tprintf("Please call SetImage before SetSourceResolution.\n");
560 }
561 
570 void TessBaseAPI::SetImage(Pix* pix) {
571  if (InternalSetImage()) {
572  thresholder_->SetImage(pix);
574  }
575 }
576 
582 void TessBaseAPI::SetRectangle(int left, int top, int width, int height) {
583  if (thresholder_ == NULL)
584  return;
585  thresholder_->SetRectangle(left, top, width, height);
586  ClearResults();
587 }
588 
594  if (tesseract_ == nullptr || thresholder_ == nullptr) return nullptr;
595  if (tesseract_->pix_binary() == nullptr &&
597  return nullptr;
598  }
599  return pixClone(tesseract_->pix_binary());
600 }
601 
607 Boxa* TessBaseAPI::GetRegions(Pixa** pixa) {
608  return GetComponentImages(RIL_BLOCK, false, pixa, NULL);
609 }
610 
619 Boxa* TessBaseAPI::GetTextlines(const bool raw_image, const int raw_padding,
620  Pixa** pixa, int** blockids, int** paraids) {
621  return GetComponentImages(RIL_TEXTLINE, true, raw_image, raw_padding,
622  pixa, blockids, paraids);
623 }
624 
633 Boxa* TessBaseAPI::GetStrips(Pixa** pixa, int** blockids) {
634  return GetComponentImages(RIL_TEXTLINE, false, pixa, blockids);
635 }
636 
642 Boxa* TessBaseAPI::GetWords(Pixa** pixa) {
643  return GetComponentImages(RIL_WORD, true, pixa, NULL);
644 }
645 
653  return GetComponentImages(RIL_SYMBOL, true, pixa, NULL);
654 }
655 
665  bool text_only, bool raw_image,
666  const int raw_padding,
667  Pixa** pixa, int** blockids,
668  int** paraids) {
669  PageIterator* page_it = GetIterator();
670  if (page_it == NULL)
671  page_it = AnalyseLayout();
672  if (page_it == NULL)
673  return NULL; // Failed.
674 
675  // Count the components to get a size for the arrays.
676  int component_count = 0;
677  int left, top, right, bottom;
678 
679  TessResultCallback<bool>* get_bbox = NULL;
680  if (raw_image) {
681  // Get bounding box in original raw image with padding.
683  level, raw_padding,
684  &left, &top, &right, &bottom);
685  } else {
686  // Get bounding box from binarized imaged. Note that this could be
687  // differently scaled from the original image.
688  get_bbox = NewPermanentTessCallback(page_it,
690  level, &left, &top, &right, &bottom);
691  }
692  do {
693  if (get_bbox->Run() &&
694  (!text_only || PTIsTextType(page_it->BlockType())))
695  ++component_count;
696  } while (page_it->Next(level));
697 
698  Boxa* boxa = boxaCreate(component_count);
699  if (pixa != NULL)
700  *pixa = pixaCreate(component_count);
701  if (blockids != NULL)
702  *blockids = new int[component_count];
703  if (paraids != NULL)
704  *paraids = new int[component_count];
705 
706  int blockid = 0;
707  int paraid = 0;
708  int component_index = 0;
709  page_it->Begin();
710  do {
711  if (get_bbox->Run() &&
712  (!text_only || PTIsTextType(page_it->BlockType()))) {
713  Box* lbox = boxCreate(left, top, right - left, bottom - top);
714  boxaAddBox(boxa, lbox, L_INSERT);
715  if (pixa != NULL) {
716  Pix* pix = NULL;
717  if (raw_image) {
718  pix = page_it->GetImage(level, raw_padding, GetInputImage(), &left,
719  &top);
720  } else {
721  pix = page_it->GetBinaryImage(level);
722  }
723  pixaAddPix(*pixa, pix, L_INSERT);
724  pixaAddBox(*pixa, lbox, L_CLONE);
725  }
726  if (paraids != NULL) {
727  (*paraids)[component_index] = paraid;
728  if (page_it->IsAtFinalElement(RIL_PARA, level))
729  ++paraid;
730  }
731  if (blockids != NULL) {
732  (*blockids)[component_index] = blockid;
733  if (page_it->IsAtFinalElement(RIL_BLOCK, level)) {
734  ++blockid;
735  paraid = 0;
736  }
737  }
738  ++component_index;
739  }
740  } while (page_it->Next(level));
741  delete page_it;
742  delete get_bbox;
743  return boxa;
744 }
745 
747  if (thresholder_ == NULL) {
748  return 0;
749  }
750  return thresholder_->GetScaleFactor();
751 }
752 
754 void TessBaseAPI::DumpPGM(const char* filename) {
755  if (tesseract_ == NULL)
756  return;
757  FILE *fp = fopen(filename, "wb");
758  Pix* pix = tesseract_->pix_binary();
759  int width = pixGetWidth(pix);
760  int height = pixGetHeight(pix);
761  l_uint32* data = pixGetData(pix);
762  fprintf(fp, "P5 %d %d 255\n", width, height);
763  for (int y = 0; y < height; ++y, data += pixGetWpl(pix)) {
764  for (int x = 0; x < width; ++x) {
765  uint8_t b = GET_DATA_BIT(data, x) ? 0 : 255;
766  fwrite(&b, 1, 1, fp);
767  }
768  }
769  fclose(fp);
770 }
771 
788 
789 PageIterator* TessBaseAPI::AnalyseLayout(bool merge_similar_words) {
790  if (FindLines() == 0) {
791  if (block_list_->empty())
792  return NULL; // The page was empty.
793  page_res_ = new PAGE_RES(merge_similar_words, block_list_, NULL);
794  DetectParagraphs(false);
795  return new PageIterator(
799  }
800  return NULL;
801 }
802 
808  if (tesseract_ == NULL)
809  return -1;
810  if (FindLines() != 0)
811  return -1;
812  delete page_res_;
813  if (block_list_->empty()) {
814  page_res_ = new PAGE_RES(false, block_list_,
816  return 0; // Empty page.
817  }
818 
820  recognition_done_ = true;
825  } else {
828  }
829  if (page_res_ == NULL) {
830  return -1;
831  }
835  return 0;
836  }
839  return 0;
840  }
841 
842  if (truth_cb_ != NULL) {
843  tesseract_->wordrec_run_blamer.set_value(true);
844  PageIterator *page_it = new PageIterator(
849  image_height_, page_it, this->tesseract()->pix_grey());
850  delete page_it;
851  }
852 
853  int result = 0;
855  #ifndef GRAPHICS_DISABLED
857  #endif // GRAPHICS_DISABLED
858  // The page_res is invalid after an interactive session, so cleanup
859  // in a way that lets us continue to the next page without crashing.
860  delete page_res_;
861  page_res_ = NULL;
862  return -1;
864  STRING fontname;
865  ExtractFontName(*output_file_, &fontname);
867  } else if (tesseract_->tessedit_ambigs_training) {
868  FILE *training_output_file = tesseract_->init_recog_training(*input_file_);
869  // OCR the page segmented into words by tesseract.
871  *input_file_, page_res_, monitor, training_output_file);
872  fclose(training_output_file);
873  } else {
874  // Now run the main recognition.
875  bool wait_for_text = true;
876  GetBoolVariable("paragraph_text_based", &wait_for_text);
877  if (!wait_for_text) DetectParagraphs(false);
878  if (tesseract_->recog_all_words(page_res_, monitor, NULL, NULL, 0)) {
879  if (wait_for_text) DetectParagraphs(true);
880  } else {
881  result = -1;
882  }
883  }
884  return result;
885 }
886 
889  if (tesseract_ == NULL)
890  return -1;
891  if (thresholder_ == NULL || thresholder_->IsEmpty()) {
892  tprintf("Please call SetImage before attempting recognition.");
893  return -1;
894  }
895  if (page_res_ != NULL)
896  ClearResults();
897  if (FindLines() != 0)
898  return -1;
899  // Additional conditions under which chopper test cannot be run
900  if (tesseract_->interactive_display_mode) return -1;
901 
902  recognition_done_ = true;
903 
904  page_res_ = new PAGE_RES(false, block_list_,
906 
907  PAGE_RES_IT page_res_it(page_res_);
908 
909  while (page_res_it.word() != NULL) {
910  WERD_RES *word_res = page_res_it.word();
911  GenericVector<TBOX> boxes;
912  tesseract_->MaximallyChopWord(boxes, page_res_it.block()->block,
913  page_res_it.row()->row, word_res);
914  page_res_it.forward();
915  }
916  return 0;
917 }
918 
919 // Takes ownership of the input pix.
921 
923 
925  if (input_file_)
926  return input_file_->c_str();
927  return NULL;
928 }
929 
930 const char * TessBaseAPI::GetDatapath() {
931  return tesseract_->datadir.c_str();
932 }
933 
936 }
937 
938 // If flist exists, get data from there. Otherwise get data from buf.
939 // Seems convoluted, but is the easiest way I know of to meet multiple
940 // goals. Support streaming from stdin, and also work on platforms
941 // lacking fmemopen.
942 bool TessBaseAPI::ProcessPagesFileList(FILE *flist,
943  STRING *buf,
944  const char* retry_config,
945  int timeout_millisec,
946  TessResultRenderer* renderer,
947  int tessedit_page_number) {
948  if (!flist && !buf) return false;
949  int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
950  char pagename[MAX_PATH];
951 
952  GenericVector<STRING> lines;
953  if (!flist) {
954  buf->split('\n', &lines);
955  if (lines.empty()) return false;
956  }
957 
958  // Skip to the requested page number.
959  for (int i = 0; i < page; i++) {
960  if (flist) {
961  if (fgets(pagename, sizeof(pagename), flist) == NULL) break;
962  }
963  }
964 
965  // Begin producing output
966  if (renderer && !renderer->BeginDocument(unknown_title_)) {
967  return false;
968  }
969 
970  // Loop over all pages - or just the requested one
971  while (true) {
972  if (flist) {
973  if (fgets(pagename, sizeof(pagename), flist) == NULL) break;
974  } else {
975  if (page >= lines.size()) break;
976  snprintf(pagename, sizeof(pagename), "%s", lines[page].c_str());
977  }
978  chomp_string(pagename);
979  Pix *pix = pixRead(pagename);
980  if (pix == NULL) {
981  tprintf("Image file %s cannot be read!\n", pagename);
982  return false;
983  }
984  tprintf("Page %d : %s\n", page, pagename);
985  bool r = ProcessPage(pix, page, pagename, retry_config,
986  timeout_millisec, renderer);
987  pixDestroy(&pix);
988  if (!r) return false;
989  if (tessedit_page_number >= 0) break;
990  ++page;
991  }
992 
993  // Finish producing output
994  if (renderer && !renderer->EndDocument()) {
995  return false;
996  }
997  return true;
998 }
999 
1000 bool TessBaseAPI::ProcessPagesMultipageTiff(const l_uint8 *data,
1001  size_t size,
1002  const char* filename,
1003  const char* retry_config,
1004  int timeout_millisec,
1005  TessResultRenderer* renderer,
1006  int tessedit_page_number) {
1007 #ifndef ANDROID_BUILD
1008  Pix *pix = NULL;
1009  int page = (tessedit_page_number >= 0) ? tessedit_page_number : 0;
1010  size_t offset = 0;
1011  for (; ; ++page) {
1012  if (tessedit_page_number >= 0)
1013  page = tessedit_page_number;
1014  pix = (data) ? pixReadMemFromMultipageTiff(data, size, &offset)
1015  : pixReadFromMultipageTiff(filename, &offset);
1016  if (pix == NULL) break;
1017  tprintf("Page %d\n", page + 1);
1018  char page_str[kMaxIntSize];
1019  snprintf(page_str, kMaxIntSize - 1, "%d", page);
1020  SetVariable("applybox_page", page_str);
1021  bool r = ProcessPage(pix, page, filename, retry_config,
1022  timeout_millisec, renderer);
1023  pixDestroy(&pix);
1024  if (!r) return false;
1025  if (tessedit_page_number >= 0) break;
1026  if (!offset) break;
1027  }
1028  return true;
1029 #else
1030  return false;
1031 #endif
1032 }
1033 
1034 // Master ProcessPages calls ProcessPagesInternal and then does any post-
1035 // processing required due to being in a training mode.
1036 bool TessBaseAPI::ProcessPages(const char* filename, const char* retry_config,
1037  int timeout_millisec,
1038  TessResultRenderer* renderer) {
1039  bool result =
1040  ProcessPagesInternal(filename, retry_config, timeout_millisec, renderer);
1041  if (result) {
1044  tprintf("Write of TR file failed: %s\n", output_file_->string());
1045  return false;
1046  }
1047  }
1048  return result;
1049 }
1050 
1051 // In the ideal scenario, Tesseract will start working on data as soon
1052 // as it can. For example, if you stream a filelist through stdin, we
1053 // should start the OCR process as soon as the first filename is
1054 // available. This is particularly useful when hooking Tesseract up to
1055 // slow hardware such as a book scanning machine.
1056 //
1057 // Unfortunately there are tradeoffs. You can't seek on stdin. That
1058 // makes automatic detection of datatype (TIFF? filelist? PNG?)
1059 // impractical. So we support a command line flag to explicitly
1060 // identify the scenario that really matters: filelists on
1061 // stdin. We'll still do our best if the user likes pipes.
1062 bool TessBaseAPI::ProcessPagesInternal(const char* filename,
1063  const char* retry_config,
1064  int timeout_millisec,
1065  TessResultRenderer* renderer) {
1066  PERF_COUNT_START("ProcessPages")
1067  bool stdInput = !strcmp(filename, "stdin") || !strcmp(filename, "-");
1068  if (stdInput) {
1069 #ifdef WIN32
1070  if (_setmode(_fileno(stdin), _O_BINARY) == -1)
1071  tprintf("ERROR: cin to binary: %s", strerror(errno));
1072 #endif // WIN32
1073  }
1074 
1075  if (stream_filelist) {
1076  return ProcessPagesFileList(stdin, NULL, retry_config,
1077  timeout_millisec, renderer,
1079  }
1080 
1081  // At this point we are officially in autodetection territory.
1082  // That means any data in stdin must be buffered, to make it
1083  // seekable.
1084  std::string buf;
1085  const l_uint8 *data = NULL;
1086  if (stdInput) {
1087  buf.assign((std::istreambuf_iterator<char>(std::cin)),
1088  (std::istreambuf_iterator<char>()));
1089  data = reinterpret_cast<const l_uint8 *>(buf.data());
1090  }
1091 
1092  // Here is our autodetection
1093  int format;
1094  int r = (stdInput) ?
1095  findFileFormatBuffer(data, &format) :
1096  findFileFormat(filename, &format);
1097 
1098  // Maybe we have a filelist
1099  if (r != 0 || format == IFF_UNKNOWN) {
1100  STRING s;
1101  if (stdInput) {
1102  s = buf.c_str();
1103  } else {
1104  std::ifstream t(filename);
1105  std::string u((std::istreambuf_iterator<char>(t)),
1106  std::istreambuf_iterator<char>());
1107  s = u.c_str();
1108  }
1109  return ProcessPagesFileList(NULL, &s, retry_config,
1110  timeout_millisec, renderer,
1112  }
1113 
1114  // Maybe we have a TIFF which is potentially multipage
1115  bool tiff = (format == IFF_TIFF || format == IFF_TIFF_PACKBITS ||
1116  format == IFF_TIFF_RLE || format == IFF_TIFF_G3 ||
1117  format == IFF_TIFF_G4 || format == IFF_TIFF_LZW ||
1118  format == IFF_TIFF_ZIP);
1119 
1120  // Fail early if we can, before producing any output
1121  Pix *pix = NULL;
1122  if (!tiff) {
1123  pix = (stdInput) ? pixReadMem(data, buf.size()) : pixRead(filename);
1124  if (pix == NULL) {
1125  return false;
1126  }
1127  }
1128 
1129  // Begin the output
1130  if (renderer && !renderer->BeginDocument(unknown_title_)) {
1131  pixDestroy(&pix);
1132  return false;
1133  }
1134 
1135  // Produce output
1136  r = (tiff) ?
1137  ProcessPagesMultipageTiff(data, buf.size(), filename, retry_config,
1138  timeout_millisec, renderer,
1140  ProcessPage(pix, 0, filename, retry_config,
1141  timeout_millisec, renderer);
1142 
1143  // Clean up memory as needed
1144  pixDestroy(&pix);
1145 
1146  // End the output
1147  if (!r || (renderer && !renderer->EndDocument())) {
1148  return false;
1149  }
1151  return true;
1152 }
1153 
1154 bool TessBaseAPI::ProcessPage(Pix* pix, int page_index, const char* filename,
1155  const char* retry_config, int timeout_millisec,
1156  TessResultRenderer* renderer) {
1157  PERF_COUNT_START("ProcessPage")
1158  SetInputName(filename);
1159  SetImage(pix);
1160  bool failed = false;
1161 
1163  // Disabled character recognition
1164  PageIterator* it = AnalyseLayout();
1165 
1166  if (it == NULL) {
1167  failed = true;
1168  } else {
1169  delete it;
1170  }
1172  failed = FindLines() != 0;
1173  } else if (timeout_millisec > 0) {
1174  // Running with a timeout.
1175  ETEXT_DESC monitor;
1176  monitor.cancel = NULL;
1177  monitor.cancel_this = NULL;
1178  monitor.set_deadline_msecs(timeout_millisec);
1179 
1180  // Now run the main recognition.
1181  failed = Recognize(&monitor) < 0;
1182  } else {
1183  // Normal layout and character recognition with no timeout.
1184  failed = Recognize(NULL) < 0;
1185  }
1186 
1188 #ifndef ANDROID_BUILD
1189  Pix* page_pix = GetThresholdedImage();
1190  pixWrite("tessinput.tif", page_pix, IFF_TIFF_G4);
1191 #endif // ANDROID_BUILD
1192  }
1193 
1194  if (failed && retry_config != NULL && retry_config[0] != '\0') {
1195  // Save current config variables before switching modes.
1196  FILE* fp = fopen(kOldVarsFile, "wb");
1197  PrintVariables(fp);
1198  fclose(fp);
1199  // Switch to alternate mode for retry.
1200  ReadConfigFile(retry_config);
1201  SetImage(pix);
1202  Recognize(NULL);
1203  // Restore saved config variables.
1204  ReadConfigFile(kOldVarsFile);
1205  }
1206 
1207  if (renderer && !failed) {
1208  failed = !renderer->AddImage(this);
1209  }
1210 
1212  return !failed;
1213 }
1214 
1220  if (tesseract_ == NULL || page_res_ == NULL)
1221  return NULL;
1222  return new LTRResultIterator(
1226 }
1227 
1237  if (tesseract_ == NULL || page_res_ == NULL)
1238  return NULL;
1243 }
1244 
1254  if (tesseract_ == NULL || page_res_ == NULL)
1255  return NULL;
1256  return new MutableIterator(page_res_, tesseract_,
1260 }
1261 
1264  if (tesseract_ == NULL ||
1265  (!recognition_done_ && Recognize(NULL) < 0))
1266  return NULL;
1267  STRING text("");
1268  ResultIterator *it = GetIterator();
1269  do {
1270  if (it->Empty(RIL_PARA)) continue;
1271  const std::unique_ptr<const char[]> para_text(it->GetUTF8Text(RIL_PARA));
1272  text += para_text.get();
1273  } while (it->Next(RIL_PARA));
1274  char* result = new char[text.length() + 1];
1275  strncpy(result, text.string(), text.length() + 1);
1276  delete it;
1277  return result;
1278 }
1279 
1283 static tesseract::Orientation GetBlockTextOrientation(const PageIterator *it) {
1284  tesseract::Orientation orientation;
1285  tesseract::WritingDirection writing_direction;
1286  tesseract::TextlineOrder textline_order;
1287  float deskew_angle;
1288  it->Orientation(&orientation, &writing_direction, &textline_order,
1289  &deskew_angle);
1290  return orientation;
1291 }
1292 
1301 static void AddBaselineCoordsTohOCR(const PageIterator *it,
1302  PageIteratorLevel level,
1303  STRING* hocr_str) {
1304  tesseract::Orientation orientation = GetBlockTextOrientation(it);
1305  if (orientation != ORIENTATION_PAGE_UP) {
1306  hocr_str->add_str_int("; textangle ", 360 - orientation * 90);
1307  return;
1308  }
1309 
1310  int left, top, right, bottom;
1311  it->BoundingBox(level, &left, &top, &right, &bottom);
1312 
1313  // Try to get the baseline coordinates at this level.
1314  int x1, y1, x2, y2;
1315  if (!it->Baseline(level, &x1, &y1, &x2, &y2))
1316  return;
1317  // Following the description of this field of the hOCR spec, we convert the
1318  // baseline coordinates so that "the bottom left of the bounding box is the
1319  // origin".
1320  x1 -= left;
1321  x2 -= left;
1322  y1 -= bottom;
1323  y2 -= bottom;
1324 
1325  // Now fit a line through the points so we can extract coefficients for the
1326  // equation: y = p1 x + p0
1327  double p1 = 0;
1328  double p0 = 0;
1329  if (x1 == x2) {
1330  // Problem computing the polynomial coefficients.
1331  return;
1332  }
1333  p1 = (y2 - y1) / static_cast<double>(x2 - x1);
1334  p0 = y1 - static_cast<double>(p1 * x1);
1335 
1336  hocr_str->add_str_double("; baseline ", round(p1 * 1000.0) / 1000.0);
1337  hocr_str->add_str_double(" ", round(p0 * 1000.0) / 1000.0);
1338 }
1339 
1340 static void AddIdTohOCR(STRING* hocr_str, const std::string base, int num1,
1341  int num2) {
1342  const size_t BUFSIZE = 64;
1343  char id_buffer[BUFSIZE];
1344  if (num2 >= 0) {
1345  snprintf(id_buffer, BUFSIZE - 1, "%s_%d_%d", base.c_str(), num1, num2);
1346  } else {
1347  snprintf(id_buffer, BUFSIZE - 1, "%s_%d", base.c_str(), num1);
1348  }
1349  id_buffer[BUFSIZE - 1] = '\0';
1350  *hocr_str += " id='";
1351  *hocr_str += id_buffer;
1352  *hocr_str += "'";
1353 }
1354 
1355 static void AddBoxTohOCR(const ResultIterator* it, PageIteratorLevel level,
1356  STRING* hocr_str) {
1357  int left, top, right, bottom;
1358  it->BoundingBox(level, &left, &top, &right, &bottom);
1359  // This is the only place we use double quotes instead of single quotes,
1360  // but it may too late to change for consistency
1361  hocr_str->add_str_int(" title=\"bbox ", left);
1362  hocr_str->add_str_int(" ", top);
1363  hocr_str->add_str_int(" ", right);
1364  hocr_str->add_str_int(" ", bottom);
1365  // Add baseline coordinates & heights for textlines only.
1366  if (level == RIL_TEXTLINE) {
1367  AddBaselineCoordsTohOCR(it, level, hocr_str);
1368  // add custom height measures
1369  float row_height, descenders, ascenders; // row attributes
1370  it->RowAttributes(&row_height, &descenders, &ascenders);
1371  // TODO(rays): Do we want to limit these to a single decimal place?
1372  hocr_str->add_str_double("; x_size ", row_height);
1373  hocr_str->add_str_double("; x_descenders ", descenders * -1);
1374  hocr_str->add_str_double("; x_ascenders ", ascenders);
1375  }
1376  *hocr_str += "\">";
1377 }
1378 
1379 static void AddBoxToTSV(const PageIterator* it, PageIteratorLevel level,
1380  STRING* hocr_str) {
1381  int left, top, right, bottom;
1382  it->BoundingBox(level, &left, &top, &right, &bottom);
1383  hocr_str->add_str_int("\t", left);
1384  hocr_str->add_str_int("\t", top);
1385  hocr_str->add_str_int("\t", right - left);
1386  hocr_str->add_str_int("\t", bottom - top);
1387 }
1388 
1398 char* TessBaseAPI::GetHOCRText(int page_number) {
1399  return GetHOCRText(NULL, page_number);
1400 }
1401 
1411 char* TessBaseAPI::GetHOCRText(ETEXT_DESC* monitor, int page_number) {
1412  if (tesseract_ == NULL || (page_res_ == NULL && Recognize(monitor) < 0))
1413  return NULL;
1414 
1415  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
1416  int page_id = page_number + 1; // hOCR uses 1-based page numbers.
1417  bool para_is_ltr = true; // Default direction is LTR
1418  const char* paragraph_lang = NULL;
1419  bool font_info = false;
1420  GetBoolVariable("hocr_font_info", &font_info);
1421 
1422  STRING hocr_str("");
1423 
1424  if (input_file_ == NULL)
1425  SetInputName(NULL);
1426 
1427 #ifdef _WIN32
1428  // convert input name from ANSI encoding to utf-8
1429  int str16_len =
1430  MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1, NULL, 0);
1431  wchar_t *uni16_str = new WCHAR[str16_len];
1432  str16_len = MultiByteToWideChar(CP_ACP, 0, input_file_->string(), -1,
1433  uni16_str, str16_len);
1434  int utf8_len = WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, NULL, 0,
1435  NULL, NULL);
1436  char *utf8_str = new char[utf8_len];
1437  WideCharToMultiByte(CP_UTF8, 0, uni16_str, str16_len, utf8_str,
1438  utf8_len, NULL, NULL);
1439  *input_file_ = utf8_str;
1440  delete[] uni16_str;
1441  delete[] utf8_str;
1442 #endif
1443 
1444  hocr_str += " <div class='ocr_page'";
1445  AddIdTohOCR(&hocr_str, "page", page_id, -1);
1446  hocr_str += " title='image \"";
1447  if (input_file_) {
1448  hocr_str += HOcrEscape(input_file_->string());
1449  } else {
1450  hocr_str += "unknown";
1451  }
1452  hocr_str.add_str_int("\"; bbox ", rect_left_);
1453  hocr_str.add_str_int(" ", rect_top_);
1454  hocr_str.add_str_int(" ", rect_width_);
1455  hocr_str.add_str_int(" ", rect_height_);
1456  hocr_str.add_str_int("; ppageno ", page_number);
1457  hocr_str += "'>\n";
1458 
1459  ResultIterator *res_it = GetIterator();
1460  while (!res_it->Empty(RIL_BLOCK)) {
1461  if (res_it->Empty(RIL_WORD)) {
1462  res_it->Next(RIL_WORD);
1463  continue;
1464  }
1465 
1466  // Open any new block/paragraph/textline.
1467  if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
1468  para_is_ltr = true; // reset to default direction
1469  hocr_str += " <div class='ocr_carea'";
1470  AddIdTohOCR(&hocr_str, "block", page_id, bcnt);
1471  AddBoxTohOCR(res_it, RIL_BLOCK, &hocr_str);
1472  }
1473  if (res_it->IsAtBeginningOf(RIL_PARA)) {
1474  hocr_str += "\n <p class='ocr_par'";
1475  para_is_ltr = res_it->ParagraphIsLtr();
1476  if (!para_is_ltr) {
1477  hocr_str += " dir='rtl'";
1478  }
1479  AddIdTohOCR(&hocr_str, "par", page_id, pcnt);
1480  paragraph_lang = res_it->WordRecognitionLanguage();
1481  if (paragraph_lang) {
1482  hocr_str += " lang='";
1483  hocr_str += paragraph_lang;
1484  hocr_str += "'";
1485  }
1486  AddBoxTohOCR(res_it, RIL_PARA, &hocr_str);
1487  }
1488  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
1489  hocr_str += "\n <span class='ocr_line'";
1490  AddIdTohOCR(&hocr_str, "line", page_id, lcnt);
1491  AddBoxTohOCR(res_it, RIL_TEXTLINE, &hocr_str);
1492  }
1493 
1494  // Now, process the word...
1495  hocr_str += "<span class='ocrx_word'";
1496  AddIdTohOCR(&hocr_str, "word", page_id, wcnt);
1497  int left, top, right, bottom;
1498  bool bold, italic, underlined, monospace, serif, smallcaps;
1499  int pointsize, font_id;
1500  const char *font_name;
1501  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
1502  font_name = res_it->WordFontAttributes(&bold, &italic, &underlined,
1503  &monospace, &serif, &smallcaps,
1504  &pointsize, &font_id);
1505  hocr_str.add_str_int(" title='bbox ", left);
1506  hocr_str.add_str_int(" ", top);
1507  hocr_str.add_str_int(" ", right);
1508  hocr_str.add_str_int(" ", bottom);
1509  hocr_str.add_str_int("; x_wconf ", res_it->Confidence(RIL_WORD));
1510  if (font_info) {
1511  if (font_name) {
1512  hocr_str += "; x_font ";
1513  hocr_str += HOcrEscape(font_name);
1514  }
1515  hocr_str.add_str_int("; x_fsize ", pointsize);
1516  }
1517  hocr_str += "'";
1518  const char* lang = res_it->WordRecognitionLanguage();
1519  if (lang && (!paragraph_lang || strcmp(lang, paragraph_lang))) {
1520  hocr_str += " lang='";
1521  hocr_str += lang;
1522  hocr_str += "'";
1523  }
1524  switch (res_it->WordDirection()) {
1525  // Only emit direction if different from current paragraph direction
1526  case DIR_LEFT_TO_RIGHT:
1527  if (!para_is_ltr) hocr_str += " dir='ltr'";
1528  break;
1529  case DIR_RIGHT_TO_LEFT:
1530  if (para_is_ltr) hocr_str += " dir='rtl'";
1531  break;
1532  case DIR_MIX:
1533  case DIR_NEUTRAL:
1534  default: // Do nothing.
1535  break;
1536  }
1537  hocr_str += ">";
1538  bool last_word_in_line = res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD);
1539  bool last_word_in_para = res_it->IsAtFinalElement(RIL_PARA, RIL_WORD);
1540  bool last_word_in_block = res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD);
1541  if (bold) hocr_str += "<strong>";
1542  if (italic) hocr_str += "<em>";
1543  do {
1544  const std::unique_ptr<const char[]> grapheme(res_it->GetUTF8Text(RIL_SYMBOL));
1545  if (grapheme && grapheme[0] != 0) {
1546  hocr_str += HOcrEscape(grapheme.get());
1547  }
1548  res_it->Next(RIL_SYMBOL);
1549  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
1550  if (italic) hocr_str += "</em>";
1551  if (bold) hocr_str += "</strong>";
1552  hocr_str += "</span> ";
1553  wcnt++;
1554  // Close any ending block/paragraph/textline.
1555  if (last_word_in_line) {
1556  hocr_str += "\n </span>";
1557  lcnt++;
1558  }
1559  if (last_word_in_para) {
1560  hocr_str += "\n </p>\n";
1561  pcnt++;
1562  para_is_ltr = true; // back to default direction
1563  }
1564  if (last_word_in_block) {
1565  hocr_str += " </div>\n";
1566  bcnt++;
1567  }
1568  }
1569  hocr_str += " </div>\n";
1570 
1571  char *ret = new char[hocr_str.length() + 1];
1572  strcpy(ret, hocr_str.string());
1573  delete res_it;
1574  return ret;
1575 }
1576 
1582 char* TessBaseAPI::GetTSVText(int page_number) {
1583  if (tesseract_ == NULL || (page_res_ == NULL && Recognize(NULL) < 0))
1584  return NULL;
1585 
1586  int lcnt = 1, bcnt = 1, pcnt = 1, wcnt = 1;
1587  int page_id = page_number + 1; // we use 1-based page numbers.
1588 
1589  STRING tsv_str("");
1590 
1591  int page_num = page_id, block_num = 0, par_num = 0, line_num = 0,
1592  word_num = 0;
1593 
1594  tsv_str.add_str_int("1\t", page_num); // level 1 - page
1595  tsv_str.add_str_int("\t", block_num);
1596  tsv_str.add_str_int("\t", par_num);
1597  tsv_str.add_str_int("\t", line_num);
1598  tsv_str.add_str_int("\t", word_num);
1599  tsv_str.add_str_int("\t", rect_left_);
1600  tsv_str.add_str_int("\t", rect_top_);
1601  tsv_str.add_str_int("\t", rect_width_);
1602  tsv_str.add_str_int("\t", rect_height_);
1603  tsv_str += "\t-1\t\n";
1604 
1605  ResultIterator* res_it = GetIterator();
1606  while (!res_it->Empty(RIL_BLOCK)) {
1607  if (res_it->Empty(RIL_WORD)) {
1608  res_it->Next(RIL_WORD);
1609  continue;
1610  }
1611 
1612  // Add rows for any new block/paragraph/textline.
1613  if (res_it->IsAtBeginningOf(RIL_BLOCK)) {
1614  block_num++, par_num = 0, line_num = 0, word_num = 0;
1615  tsv_str.add_str_int("2\t", page_num); // level 2 - block
1616  tsv_str.add_str_int("\t", block_num);
1617  tsv_str.add_str_int("\t", par_num);
1618  tsv_str.add_str_int("\t", line_num);
1619  tsv_str.add_str_int("\t", word_num);
1620  AddBoxToTSV(res_it, RIL_BLOCK, &tsv_str);
1621  tsv_str += "\t-1\t\n"; // end of row for block
1622  }
1623  if (res_it->IsAtBeginningOf(RIL_PARA)) {
1624  par_num++, line_num = 0, word_num = 0;
1625  tsv_str.add_str_int("3\t", page_num); // level 3 - paragraph
1626  tsv_str.add_str_int("\t", block_num);
1627  tsv_str.add_str_int("\t", par_num);
1628  tsv_str.add_str_int("\t", line_num);
1629  tsv_str.add_str_int("\t", word_num);
1630  AddBoxToTSV(res_it, RIL_PARA, &tsv_str);
1631  tsv_str += "\t-1\t\n"; // end of row for para
1632  }
1633  if (res_it->IsAtBeginningOf(RIL_TEXTLINE)) {
1634  line_num++, word_num = 0;
1635  tsv_str.add_str_int("4\t", page_num); // level 4 - line
1636  tsv_str.add_str_int("\t", block_num);
1637  tsv_str.add_str_int("\t", par_num);
1638  tsv_str.add_str_int("\t", line_num);
1639  tsv_str.add_str_int("\t", word_num);
1640  AddBoxToTSV(res_it, RIL_TEXTLINE, &tsv_str);
1641  tsv_str += "\t-1\t\n"; // end of row for line
1642  }
1643 
1644  // Now, process the word...
1645  int left, top, right, bottom;
1646  res_it->BoundingBox(RIL_WORD, &left, &top, &right, &bottom);
1647  word_num++;
1648  tsv_str.add_str_int("5\t", page_num); // level 5 - word
1649  tsv_str.add_str_int("\t", block_num);
1650  tsv_str.add_str_int("\t", par_num);
1651  tsv_str.add_str_int("\t", line_num);
1652  tsv_str.add_str_int("\t", word_num);
1653  tsv_str.add_str_int("\t", left);
1654  tsv_str.add_str_int("\t", top);
1655  tsv_str.add_str_int("\t", right - left);
1656  tsv_str.add_str_int("\t", bottom - top);
1657  tsv_str.add_str_int("\t", res_it->Confidence(RIL_WORD));
1658  tsv_str += "\t";
1659 
1660  // Increment counts if at end of block/paragraph/textline.
1661  if (res_it->IsAtFinalElement(RIL_TEXTLINE, RIL_WORD)) lcnt++;
1662  if (res_it->IsAtFinalElement(RIL_PARA, RIL_WORD)) pcnt++;
1663  if (res_it->IsAtFinalElement(RIL_BLOCK, RIL_WORD)) bcnt++;
1664 
1665  do {
1666  tsv_str += std::unique_ptr<const char[]>(res_it->GetUTF8Text(RIL_SYMBOL)).get();
1667  res_it->Next(RIL_SYMBOL);
1668  } while (!res_it->Empty(RIL_BLOCK) && !res_it->IsAtBeginningOf(RIL_WORD));
1669  tsv_str += "\n"; // end of row
1670  wcnt++;
1671  }
1672 
1673  char* ret = new char[tsv_str.length() + 1];
1674  strcpy(ret, tsv_str.string());
1675  delete res_it;
1676  return ret;
1677 }
1678 
1680 const int kNumbersPerBlob = 5;
1685 const int kBytesPerNumber = 5;
1691 const int kBytesPerBoxFileLine = (kBytesPerNumber + 1) * kNumbersPerBlob + 1;
1693 const int kBytesPer64BitNumber = 20;
1700 const int kMaxBytesPerLine = kNumbersPerBlob * (kBytesPer64BitNumber + 1) + 1 +
1701  UNICHAR_LEN;
1702 
1709 char* TessBaseAPI::GetBoxText(int page_number) {
1710  if (tesseract_ == NULL ||
1711  (!recognition_done_ && Recognize(NULL) < 0))
1712  return NULL;
1713  int blob_count;
1714  int utf8_length = TextLength(&blob_count);
1715  int total_length = blob_count * kBytesPerBoxFileLine + utf8_length +
1717  char* result = new char[total_length];
1718  result[0] = '\0';
1719  int output_length = 0;
1721  do {
1722  int left, top, right, bottom;
1723  if (it->BoundingBox(RIL_SYMBOL, &left, &top, &right, &bottom)) {
1724  const std::unique_ptr</*non-const*/ char[]> text(it->GetUTF8Text(RIL_SYMBOL));
1725  // Tesseract uses space for recognition failure. Fix to a reject
1726  // character, kTesseractReject so we don't create illegal box files.
1727  for (int i = 0; text[i] != '\0'; ++i) {
1728  if (text[i] == ' ')
1729  text[i] = kTesseractReject;
1730  }
1731  snprintf(result + output_length, total_length - output_length,
1732  "%s %d %d %d %d %d\n",
1733  text.get(), left, image_height_ - bottom,
1734  right, image_height_ - top, page_number);
1735  output_length += strlen(result + output_length);
1736  // Just in case...
1737  if (output_length + kMaxBytesPerLine > total_length)
1738  break;
1739  }
1740  } while (it->Next(RIL_SYMBOL));
1741  delete it;
1742  return result;
1743 }
1744 
// Zero-terminated table of Unicode code points that have a stand-in in
// kLatinChs: a code point found at index j here is replaced by kLatinChs[j].
const int kUniChs[] = {
    0x20ac,  // euro sign
    0x201c,  // left double quotation mark
    0x201d,  // right double quotation mark
    0x2018,  // left single quotation mark
    0x2019,  // right single quotation mark
    0x2022,  // bullet
    0x2014,  // em dash
    0        // terminator
};
// Replacements for the entries of kUniChs, index for index; zero-terminated.
const int kLatinChs[] = {
    0x00a2,  // cent sign
    0x0022,  // quotation mark
    0x0022,  // quotation mark
    0x0027,  // apostrophe
    0x0027,  // apostrophe
    0x00b7,  // middle dot
    0x002d,  // hyphen-minus
    0        // terminator
};
1757 
1764  if (tesseract_ == NULL ||
1765  (!recognition_done_ && Recognize(NULL) < 0))
1766  return NULL;
1767  bool tilde_crunch_written = false;
1768  bool last_char_was_newline = true;
1769  bool last_char_was_tilde = false;
1770 
1771  int total_length = TextLength(NULL);
1772  PAGE_RES_IT page_res_it(page_res_);
1773  char* result = new char[total_length];
1774  char* ptr = result;
1775  for (page_res_it.restart_page(); page_res_it.word () != NULL;
1776  page_res_it.forward()) {
1777  WERD_RES *word = page_res_it.word();
1778  // Process the current word.
1779  if (word->unlv_crunch_mode != CR_NONE) {
1780  if (word->unlv_crunch_mode != CR_DELETE &&
1781  (!tilde_crunch_written ||
1782  (word->unlv_crunch_mode == CR_KEEP_SPACE &&
1783  word->word->space() > 0 &&
1784  !word->word->flag(W_FUZZY_NON) &&
1785  !word->word->flag(W_FUZZY_SP)))) {
1786  if (!word->word->flag(W_BOL) &&
1787  word->word->space() > 0 &&
1788  !word->word->flag(W_FUZZY_NON) &&
1789  !word->word->flag(W_FUZZY_SP)) {
1790  /* Write a space to separate from preceding good text */
1791  *ptr++ = ' ';
1792  last_char_was_tilde = false;
1793  }
1794  if (!last_char_was_tilde) {
1795  // Write a reject char.
1796  last_char_was_tilde = true;
1797  *ptr++ = kUNLVReject;
1798  tilde_crunch_written = true;
1799  last_char_was_newline = false;
1800  }
1801  }
1802  } else {
1803  // NORMAL PROCESSING of non tilde crunched words.
1804  tilde_crunch_written = false;
1806  const char* wordstr = word->best_choice->unichar_string().string();
1807  const STRING& lengths = word->best_choice->unichar_lengths();
1808  int length = lengths.length();
1809  int i = 0;
1810  int offset = 0;
1811 
1812  if (last_char_was_tilde &&
1813  word->word->space() == 0 && wordstr[offset] == ' ') {
1814  // Prevent adjacent tilde across words - we know that adjacent tildes
1815  // within words have been removed.
1816  // Skip the first character.
1817  offset = lengths[i++];
1818  }
1819  if (i < length && wordstr[offset] != 0) {
1820  if (!last_char_was_newline)
1821  *ptr++ = ' ';
1822  else
1823  last_char_was_newline = false;
1824  for (; i < length; offset += lengths[i++]) {
1825  if (wordstr[offset] == ' ' ||
1826  wordstr[offset] == kTesseractReject) {
1827  *ptr++ = kUNLVReject;
1828  last_char_was_tilde = true;
1829  } else {
1830  if (word->reject_map[i].rejected())
1831  *ptr++ = kUNLVSuspect;
1832  UNICHAR ch(wordstr + offset, lengths[i]);
1833  int uni_ch = ch.first_uni();
1834  for (int j = 0; kUniChs[j] != 0; ++j) {
1835  if (kUniChs[j] == uni_ch) {
1836  uni_ch = kLatinChs[j];
1837  break;
1838  }
1839  }
1840  if (uni_ch <= 0xff) {
1841  *ptr++ = static_cast<char>(uni_ch);
1842  last_char_was_tilde = false;
1843  } else {
1844  *ptr++ = kUNLVReject;
1845  last_char_was_tilde = true;
1846  }
1847  }
1848  }
1849  }
1850  }
1851  if (word->word->flag(W_EOL) && !last_char_was_newline) {
1852  /* Add a new line output */
1853  *ptr++ = '\n';
1854  tilde_crunch_written = false;
1855  last_char_was_newline = true;
1856  last_char_was_tilde = false;
1857  }
1858  }
1859  *ptr++ = '\n';
1860  *ptr = '\0';
1861  return result;
1862 }
1863 
1873 bool TessBaseAPI::DetectOrientationScript(int* orient_deg, float* orient_conf,
1874  const char** script_name,
1875  float* script_conf) {
1876  OSResults osr;
1877 
1878  bool osd = DetectOS(&osr);
1879  if (!osd) {
1880  return false;
1881  }
1882 
1883  int orient_id = osr.best_result.orientation_id;
1884  int script_id = osr.get_best_script(orient_id);
1885  if (orient_conf) *orient_conf = osr.best_result.oconfidence;
1886  if (orient_deg) *orient_deg = orient_id * 90; // convert quadrant to degrees
1887 
1888  if (script_name) {
1889  const char* script = osr.unicharset->get_script_from_script_id(script_id);
1890 
1891  *script_name = script;
1892  }
1893 
1894  if (script_conf) *script_conf = osr.best_result.sconfidence;
1895 
1896  return true;
1897 }
1898 
1904 char* TessBaseAPI::GetOsdText(int page_number) {
1905  int orient_deg;
1906  float orient_conf;
1907  const char* script_name;
1908  float script_conf;
1909 
1910  if (!DetectOrientationScript(&orient_deg, &orient_conf, &script_name,
1911  &script_conf))
1912  return NULL;
1913 
1914  // clockwise rotation needed to make the page upright
1915  int rotate = OrientationIdToValue(orient_deg / 90);
1916 
1917  const int kOsdBufsize = 255;
1918  char* osd_buf = new char[kOsdBufsize];
1919  snprintf(osd_buf, kOsdBufsize,
1920  "Page number: %d\n"
1921  "Orientation in degrees: %d\n"
1922  "Rotate: %d\n"
1923  "Orientation confidence: %.2f\n"
1924  "Script: %s\n"
1925  "Script confidence: %.2f\n",
1926  page_number, orient_deg, rotate, orient_conf, script_name,
1927  script_conf);
1928 
1929  return osd_buf;
1930 }
1931 
1934  int* conf = AllWordConfidences();
1935  if (!conf) return 0;
1936  int sum = 0;
1937  int *pt = conf;
1938  while (*pt >= 0) sum += *pt++;
1939  if (pt != conf) sum /= pt - conf;
1940  delete [] conf;
1941  return sum;
1942 }
1943 
1946  if (tesseract_ == NULL ||
1947  (!recognition_done_ && Recognize(NULL) < 0))
1948  return NULL;
1949  int n_word = 0;
1950  PAGE_RES_IT res_it(page_res_);
1951  for (res_it.restart_page(); res_it.word() != NULL; res_it.forward())
1952  n_word++;
1953 
1954  int* conf = new int[n_word+1];
1955  n_word = 0;
1956  for (res_it.restart_page(); res_it.word() != NULL; res_it.forward()) {
1957  WERD_RES *word = res_it.word();
1958  WERD_CHOICE* choice = word->best_choice;
1959  int w_conf = static_cast<int>(100 + 5 * choice->certainty());
1960  // This is the eq for converting Tesseract confidence to 1..100
1961  if (w_conf < 0) w_conf = 0;
1962  if (w_conf > 100) w_conf = 100;
1963  conf[n_word++] = w_conf;
1964  }
1965  conf[n_word] = -1;
1966  return conf;
1967 }
1968 
1979 bool TessBaseAPI::AdaptToWordStr(PageSegMode mode, const char* wordstr) {
1980  int debug = 0;
1981  GetIntVariable("applybox_debug", &debug);
1982  bool success = true;
1983  PageSegMode current_psm = GetPageSegMode();
1984  SetPageSegMode(mode);
1985  SetVariable("classify_enable_learning", "0");
1986  const std::unique_ptr<const char[]> text(GetUTF8Text());
1987  if (debug) {
1988  tprintf("Trying to adapt \"%s\" to \"%s\"\n", text.get(), wordstr);
1989  }
1990  if (text != NULL) {
1991  PAGE_RES_IT it(page_res_);
1992  WERD_RES* word_res = it.word();
1993  if (word_res != NULL) {
1994  word_res->word->set_text(wordstr);
1995  } else {
1996  success = false;
1997  }
1998  // Check to see if text matches wordstr.
1999  int w = 0;
2000  int t = 0;
2001  for (t = 0; text[t] != '\0'; ++t) {
2002  if (text[t] == '\n' || text[t] == ' ')
2003  continue;
2004  while (wordstr[w] == ' ') ++w;
2005  if (text[t] != wordstr[w])
2006  break;
2007  ++w;
2008  }
2009  if (text[t] != '\0' || wordstr[w] != '\0') {
2010  // No match.
2011  delete page_res_;
2012  GenericVector<TBOX> boxes;
2016  PAGE_RES_IT pr_it(page_res_);
2017  if (pr_it.word() == NULL)
2018  success = false;
2019  else
2020  word_res = pr_it.word();
2021  } else {
2022  word_res->BestChoiceToCorrectText();
2023  }
2024  if (success) {
2025  tesseract_->EnableLearning = true;
2026  tesseract_->LearnWord(NULL, word_res);
2027  }
2028  } else {
2029  success = false;
2030  }
2031  SetPageSegMode(current_psm);
2032  return success;
2033 }
2034 
2042  if (thresholder_ != NULL)
2043  thresholder_->Clear();
2044  ClearResults();
2045  if (tesseract_ != NULL) SetInputImage(NULL);
2046 }
2047 
2055  Clear();
2056  delete thresholder_;
2057  thresholder_ = NULL;
2058  delete page_res_;
2059  page_res_ = NULL;
2060  delete block_list_;
2061  block_list_ = NULL;
2062  if (paragraph_models_ != NULL) {
2064  delete paragraph_models_;
2065  paragraph_models_ = NULL;
2066  }
2067  if (osd_tesseract_ == tesseract_)
2068  osd_tesseract_ = nullptr;
2069  delete tesseract_;
2070  tesseract_ = nullptr;
2071  delete osd_tesseract_;
2072  osd_tesseract_ = NULL;
2073  delete equ_detect_;
2074  equ_detect_ = NULL;
2075  delete input_file_;
2076  input_file_ = NULL;
2077  delete output_file_;
2078  output_file_ = NULL;
2079  delete datapath_;
2080  datapath_ = NULL;
2081  delete language_;
2082  language_ = NULL;
2083 }
2084 
2085 // Clear any library-level memory caches.
2086 // There are a variety of expensive-to-load constant data structures (mostly
2087 // language dictionaries) that are cached globally -- surviving the Init()
2088 // and End() of individual TessBaseAPI's. This function allows the clearing
2089 // of these caches.
2092 }
2093 
2098 int TessBaseAPI::IsValidWord(const char *word) {
2099  return tesseract_->getDict().valid_word(word);
2100 }
2101 // Returns true if utf8_character is defined in the UniCharset.
2102 bool TessBaseAPI::IsValidCharacter(const char *utf8_character) {
2103  return tesseract_->unicharset.contains_unichar(utf8_character);
2104 }
2105 
2106 
2107 // TODO(rays) Obsolete this function and replace with a more aptly named
2108 // function that returns image coordinates rather than tesseract coordinates.
2109 bool TessBaseAPI::GetTextDirection(int* out_offset, float* out_slope) {
2110  PageIterator* it = AnalyseLayout();
2111  if (it == NULL) {
2112  return false;
2113  }
2114  int x1, x2, y1, y2;
2115  it->Baseline(RIL_TEXTLINE, &x1, &y1, &x2, &y2);
2116  // Calculate offset and slope (NOTE: Kind of ugly)
2117  if (x2 <= x1) x2 = x1 + 1;
2118  // Convert the point pair to slope/offset of the baseline (in image coords.)
2119  *out_slope = static_cast<float>(y2 - y1) / (x2 - x1);
2120  *out_offset = static_cast<int>(y1 - *out_slope * x1);
2121  // Get the y-coord of the baseline at the left and right edges of the
2122  // textline's bounding box.
2123  int left, top, right, bottom;
2124  if (!it->BoundingBox(RIL_TEXTLINE, &left, &top, &right, &bottom)) {
2125  delete it;
2126  return false;
2127  }
2128  int left_y = IntCastRounded(*out_slope * left + *out_offset);
2129  int right_y = IntCastRounded(*out_slope * right + *out_offset);
2130  // Shift the baseline down so it passes through the nearest bottom-corner
2131  // of the textline's bounding box. This is the difference between the y
2132  // at the lowest (max) edge of the box and the actual box bottom.
2133  *out_offset += bottom - MAX(left_y, right_y);
2134  // Switch back to bottom-up tesseract coordinates. Requires negation of
2135  // the slope and height - offset for the offset.
2136  *out_slope = -*out_slope;
2137  *out_offset = rect_height_ - *out_offset;
2138  delete it;
2139 
2140  return true;
2141 }
2142 
2145  if (tesseract_ != NULL) {
2147  }
2148 }
2149 
2159  if (tesseract_ != NULL) {
2161  // Set it for the sublangs too.
2162  int num_subs = tesseract_->num_sub_langs();
2163  for (int i = 0; i < num_subs; ++i) {
2165  }
2166  }
2167 }
2168 
2171  if (tesseract_ != NULL) tesseract_->fill_lattice_ = f;
2172 }
2173 
2176  if (tesseract_ == NULL) {
2177  tprintf("Please call Init before attempting to set an image.");
2178  return false;
2179  }
2180  if (thresholder_ == NULL)
2182  ClearResults();
2183  return true;
2184 }
2185 
2192 bool TessBaseAPI::Threshold(Pix** pix) {
2193  ASSERT_HOST(pix != NULL);
2194  if (*pix != NULL)
2195  pixDestroy(pix);
2196  // Zero resolution messes up the algorithms, so make sure it is credible.
2197  int y_res = thresholder_->GetScaledYResolution();
2198  if (y_res < kMinCredibleResolution || y_res > kMaxCredibleResolution) {
2199  // Use the minimum default resolution, as it is safer to under-estimate
2200  // than over-estimate resolution.
2201  tprintf("Warning. Invalid resolution %d dpi. Using %d instead.\n", y_res,
2202  kMinCredibleResolution);
2203  thresholder_->SetSourceYResolution(kMinCredibleResolution);
2204  }
2205  PageSegMode pageseg_mode =
2206  static_cast<PageSegMode>(
2207  static_cast<int>(tesseract_->tessedit_pageseg_mode));
2208  if (!thresholder_->ThresholdToPix(pageseg_mode, pix)) return false;
2212  if (!thresholder_->IsBinary()) {
2215  } else {
2217  tesseract_->set_pix_grey(NULL);
2218  }
2219  // Set the internal resolution that is used for layout parameters from the
2220  // estimated resolution, rather than the image resolution, which may be
2221  // fabricated, but we will use the image resolution, if there is one, to
2222  // report output point sizes.
2223  int estimated_res = ClipToRange(thresholder_->GetScaledEstimatedResolution(),
2226  if (estimated_res != thresholder_->GetScaledEstimatedResolution()) {
2227  tprintf("Estimated resolution %d out of range! Corrected to %d\n",
2228  thresholder_->GetScaledEstimatedResolution(), estimated_res);
2229  }
2230  tesseract_->set_source_resolution(estimated_res);
2231  SavePixForCrash(estimated_res, *pix);
2232  return true;
2233 }
2234 
2237  if (thresholder_ == NULL || thresholder_->IsEmpty()) {
2238  tprintf("Please call SetImage before attempting recognition.");
2239  return -1;
2240  }
2241  if (recognition_done_)
2242  ClearResults();
2243  if (!block_list_->empty()) {
2244  return 0;
2245  }
2246  if (tesseract_ == NULL) {
2247  tesseract_ = new Tesseract;
2249  }
2250  if (tesseract_->pix_binary() == NULL &&
2252  return -1;
2253  }
2254 
2256 
2258  if (equ_detect_ == NULL && datapath_ != NULL) {
2259  equ_detect_ = new EquationDetect(datapath_->string(), NULL);
2260  }
2262  }
2263 
2264  Tesseract* osd_tess = osd_tesseract_;
2265  OSResults osr;
2267  osd_tess == nullptr) {
2268  if (strcmp(language_->string(), "osd") == 0) {
2269  osd_tess = tesseract_;
2270  } else {
2271  osd_tesseract_ = new Tesseract;
2272  TessdataManager mgr(reader_);
2273  if (osd_tesseract_->init_tesseract(datapath_->string(), nullptr, "osd",
2274  OEM_TESSERACT_ONLY, nullptr, 0,
2275  nullptr, nullptr, false, &mgr) == 0) {
2276  osd_tess = osd_tesseract_;
2279  } else {
2280  tprintf("Warning: Auto orientation and script detection requested,"
2281  " but osd language failed to load\n");
2282  delete osd_tesseract_;
2283  osd_tesseract_ = nullptr;
2284  }
2285  }
2286  }
2287 
2288  if (tesseract_->SegmentPage(input_file_, block_list_, osd_tess, &osr) < 0)
2289  return -1;
2290  // If Devanagari is being recognized, we use different images for page seg
2291  // and for OCR.
2292  tesseract_->PrepareForTessOCR(block_list_, osd_tess, &osr);
2293  return 0;
2294 }
2295 
2298  if (tesseract_ != NULL) {
2299  tesseract_->Clear();
2300  }
2301  if (page_res_ != NULL) {
2302  delete page_res_;
2303  page_res_ = NULL;
2304  }
2305  recognition_done_ = false;
2306  if (block_list_ == NULL)
2307  block_list_ = new BLOCK_LIST;
2308  else
2309  block_list_->clear();
2310  if (paragraph_models_ != NULL) {
2312  delete paragraph_models_;
2313  paragraph_models_ = NULL;
2314  }
2315  SavePixForCrash(0, NULL);
2316 }
2317 
2325 int TessBaseAPI::TextLength(int* blob_count) {
2326  if (tesseract_ == NULL || page_res_ == NULL)
2327  return 0;
2328 
2329  PAGE_RES_IT page_res_it(page_res_);
2330  int total_length = 2;
2331  int total_blobs = 0;
2332  // Iterate over the data structures to extract the recognition result.
2333  for (page_res_it.restart_page(); page_res_it.word () != NULL;
2334  page_res_it.forward()) {
2335  WERD_RES *word = page_res_it.word();
2336  WERD_CHOICE* choice = word->best_choice;
2337  if (choice != NULL) {
2338  total_blobs += choice->length() + 2;
2339  total_length += choice->unichar_string().length() + 2;
2340  for (int i = 0; i < word->reject_map.length(); ++i) {
2341  if (word->reject_map[i].rejected())
2342  ++total_length;
2343  }
2344  }
2345  }
2346  if (blob_count != NULL)
2347  *blob_count = total_blobs;
2348  return total_length;
2349 }
2350 
2356  if (tesseract_ == NULL)
2357  return false;
2358  ClearResults();
2359  if (tesseract_->pix_binary() == NULL &&
2361  return false;
2362  }
2363  if (input_file_ == NULL)
2364  input_file_ = new STRING(kInputFile);
2366 }
2367 
2369  tesseract_->min_orientation_margin.set_value(margin);
2370 }
2371 
2386 void TessBaseAPI::GetBlockTextOrientations(int** block_orientation,
2387  bool** vertical_writing) {
2388  delete[] *block_orientation;
2389  *block_orientation = NULL;
2390  delete[] *vertical_writing;
2391  *vertical_writing = NULL;
2392  BLOCK_IT block_it(block_list_);
2393 
2394  block_it.move_to_first();
2395  int num_blocks = 0;
2396  for (block_it.mark_cycle_pt(); !block_it.cycled_list(); block_it.forward()) {
2397  if (!block_it.data()->poly_block()->IsText()) {
2398  continue;
2399  }
2400  ++num_blocks;
2401  }
2402  if (!num_blocks) {
2403  tprintf("WARNING: Found no blocks\n");
2404  return;
2405  }
2406  *block_orientation = new int[num_blocks];
2407  *vertical_writing = new bool[num_blocks];
2408  block_it.move_to_first();
2409  int i = 0;
2410  for (block_it.mark_cycle_pt(); !block_it.cycled_list();
2411  block_it.forward()) {
2412  if (!block_it.data()->poly_block()->IsText()) {
2413  continue;
2414  }
2415  FCOORD re_rotation = block_it.data()->re_rotation();
2416  float re_theta = re_rotation.angle();
2417  FCOORD classify_rotation = block_it.data()->classify_rotation();
2418  float classify_theta = classify_rotation.angle();
2419  double rot_theta = - (re_theta - classify_theta) * 2.0 / PI;
2420  if (rot_theta < 0) rot_theta += 4;
2421  int num_rotations = static_cast<int>(rot_theta + 0.5);
2422  (*block_orientation)[i] = num_rotations;
2423  // The classify_rotation is non-zero only if the text has vertical
2424  // writing direction.
2425  (*vertical_writing)[i] = classify_rotation.y() != 0.0f;
2426  ++i;
2427  }
2428 }
2429 
2430 // ____________________________________________________________________________
2431 // Ocropus add-ons.
2432 
2435  FindLines();
2436  BLOCK_LIST* result = block_list_;
2437  block_list_ = NULL;
2438  return result;
2439 }
2440 
2446 void TessBaseAPI::DeleteBlockList(BLOCK_LIST *block_list) {
2447  delete block_list;
2448 }
2449 
2450 
2452  float xheight,
2453  float descender,
2454  float ascender) {
2455  inT32 xstarts[] = {-32000};
2456  double quad_coeffs[] = {0, 0, baseline};
2457  return new ROW(1,
2458  xstarts,
2459  quad_coeffs,
2460  xheight,
2461  ascender - (baseline + xheight),
2462  descender - baseline,
2463  0,
2464  0);
2465 }
2466 
2469  int width = pixGetWidth(pix);
2470  int height = pixGetHeight(pix);
2471  BLOCK block("a character", TRUE, 0, 0, 0, 0, width, height);
2472 
2473  // Create C_BLOBs from the page
2474  extract_edges(pix, &block);
2475 
2476  // Merge all C_BLOBs
2477  C_BLOB_LIST *list = block.blob_list();
2478  C_BLOB_IT c_blob_it(list);
2479  if (c_blob_it.empty())
2480  return NULL;
2481  // Move all the outlines to the first blob.
2482  C_OUTLINE_IT ol_it(c_blob_it.data()->out_list());
2483  for (c_blob_it.forward();
2484  !c_blob_it.at_first();
2485  c_blob_it.forward()) {
2486  C_BLOB *c_blob = c_blob_it.data();
2487  ol_it.add_list_after(c_blob->out_list());
2488  }
2489  // Convert the first blob to the output TBLOB.
2490  return TBLOB::PolygonalCopy(false, c_blob_it.data());
2491 }
2492 
2498 void TessBaseAPI::NormalizeTBLOB(TBLOB *tblob, ROW *row, bool numeric_mode) {
2499  TBOX box = tblob->bounding_box();
2500  float x_center = (box.left() + box.right()) / 2.0f;
2501  float baseline = row->base_line(x_center);
2502  float scale = kBlnXHeight / row->x_height();
2503  tblob->Normalize(NULL, NULL, NULL, x_center, baseline, scale, scale,
2504  0.0f, static_cast<float>(kBlnBaselineOffset), false, NULL);
2505 }
2506 
2511 TBLOB *make_tesseract_blob(float baseline, float xheight,
2512  float descender, float ascender,
2513  bool numeric_mode, Pix* pix) {
2514  TBLOB *tblob = TessBaseAPI::MakeTBLOB(pix);
2515 
2516  // Normalize TBLOB
2517  ROW *row =
2518  TessBaseAPI::MakeTessOCRRow(baseline, xheight, descender, ascender);
2519  TessBaseAPI::NormalizeTBLOB(tblob, row, numeric_mode);
2520  delete row;
2521  return tblob;
2522 }
2523 
2529 void TessBaseAPI::AdaptToCharacter(const char *unichar_repr,
2530  int length,
2531  float baseline,
2532  float xheight,
2533  float descender,
2534  float ascender) {
2535  UNICHAR_ID id = tesseract_->unicharset.unichar_to_id(unichar_repr, length);
2536  TBLOB *blob = make_tesseract_blob(baseline, xheight, descender, ascender,
2538  tesseract_->pix_binary());
2539  float threshold;
2540  float best_rating = -100;
2541 
2542 
2543  // Classify to get a raw choice.
2544  BLOB_CHOICE_LIST choices;
2545  tesseract_->AdaptiveClassifier(blob, &choices);
2546  BLOB_CHOICE_IT choice_it;
2547  choice_it.set_to_list(&choices);
2548  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
2549  choice_it.forward()) {
2550  if (choice_it.data()->rating() > best_rating) {
2551  best_rating = choice_it.data()->rating();
2552  }
2553  }
2554 
2555  threshold = tesseract_->matcher_good_threshold;
2556 
2557  if (blob->outlines)
2558  tesseract_->AdaptToChar(blob, id, kUnknownFontinfoId, threshold,
2560  delete blob;
2561 }
2562 
2563 
// Builds a new PAGE_RES for block_list, runs recog_all_words() on it with
// dopasses = 1 and returns it. The PAGE_RES is allocated with new and is not
// freed here.
// NOTE(review): this listing jumps from line 2565 to 2567, so the line that
// completes the PAGE_RES constructor call is missing from this view; restore
// it from the original file before compiling.
2564 PAGE_RES* TessBaseAPI::RecognitionPass1(BLOCK_LIST* block_list) {
2565  PAGE_RES *page_res = new PAGE_RES(false, block_list,
2567  tesseract_->recog_all_words(page_res, NULL, NULL, NULL, 1);
2568  return page_res;
2569 }
2570 
// Runs recog_all_words() with dopasses = 2 on pass1_result and returns it.
// When pass1_result is NULL a new PAGE_RES is allocated for block_list first;
// that allocation is not freed here.
// NOTE(review): this listing jumps from line 2574 to 2576, so the line that
// completes the PAGE_RES constructor call is missing from this view; restore
// it from the original file before compiling.
2571 PAGE_RES* TessBaseAPI::RecognitionPass2(BLOCK_LIST* block_list,
2572  PAGE_RES* pass1_result) {
2573  if (!pass1_result)
2574  pass1_result = new PAGE_RES(false, block_list,
2576  tesseract_->recog_all_words(pass1_result, NULL, NULL, NULL, 2);
2577  return pass1_result;
2578 }
2579 
2580 void TessBaseAPI::DetectParagraphs(bool after_text_recognition) {
2581  int debug_level = 0;
2582  GetIntVariable("paragraph_debug_level", &debug_level);
2583  if (paragraph_models_ == NULL)
2585  MutableIterator *result_it = GetMutableIterator();
2586  do { // Detect paragraphs for this block
2588  ::tesseract::DetectParagraphs(debug_level, after_text_recognition,
2589  result_it, &models);
2590  *paragraph_models_ += models;
2591  } while (result_it->Next(RIL_BLOCK));
2592  delete result_it;
2593 }
2594 
2597  int length; // of unicode_repr
2598  float cost;
2600 
2601  TESS_CHAR(float _cost, const char *repr, int len = -1) : cost(_cost) {
2602  length = (len == -1 ? strlen(repr) : len);
2603  unicode_repr = new char[length + 1];
2604  strncpy(unicode_repr, repr, length);
2605  }
2606 
2607  TESS_CHAR() { // Satisfies ELISTIZE.
2608  }
2610  delete [] unicode_repr;
2611  }
2612 };
2613 
2616 
2617 static void add_space(TESS_CHAR_IT* it) {
2618  TESS_CHAR *t = new TESS_CHAR(0, " ");
2619  it->add_after_then_move(t);
2620 }
2621 
2622 
/**
 * Converts a rating to a non-negative cost: the result is rating + 100,
 * clamped to 0 when that sum is negative (ratings below -100 are not
 * expected, but the clamp costs nothing).
 */
static float rating_to_cost(float rating) {
  const float shifted = 100 + rating;
  return shifted < 0 ? 0.0f : shifted;
}
2631 
2636 static void extract_result(TESS_CHAR_IT* out,
2637  PAGE_RES* page_res) {
2638  PAGE_RES_IT page_res_it(page_res);
2639  int word_count = 0;
2640  while (page_res_it.word() != NULL) {
2641  WERD_RES *word = page_res_it.word();
2642  const char *str = word->best_choice->unichar_string().string();
2643  const char *len = word->best_choice->unichar_lengths().string();
2644  TBOX real_rect = word->word->bounding_box();
2645 
2646  if (word_count)
2647  add_space(out);
2648  int n = strlen(len);
2649  for (int i = 0; i < n; i++) {
2650  TESS_CHAR *tc = new TESS_CHAR(rating_to_cost(word->best_choice->rating()),
2651  str, *len);
2652  tc->box = real_rect.intersection(word->box_word->BlobBox(i));
2653  out->add_after_then_move(tc);
2654  str += *len;
2655  len++;
2656  }
2657  page_res_it.forward();
2658  word_count++;
2659  }
2660 }
2661 
2667  int** lengths,
2668  float** costs,
2669  int** x0,
2670  int** y0,
2671  int** x1,
2672  int** y1,
2673  PAGE_RES* page_res) {
2674  TESS_CHAR_LIST tess_chars;
2675  TESS_CHAR_IT tess_chars_it(&tess_chars);
2676  extract_result(&tess_chars_it, page_res);
2677  tess_chars_it.move_to_first();
2678  int n = tess_chars.length();
2679  int text_len = 0;
2680  *lengths = new int[n];
2681  *costs = new float[n];
2682  *x0 = new int[n];
2683  *y0 = new int[n];
2684  *x1 = new int[n];
2685  *y1 = new int[n];
2686  int i = 0;
2687  for (tess_chars_it.mark_cycle_pt();
2688  !tess_chars_it.cycled_list();
2689  tess_chars_it.forward(), i++) {
2690  TESS_CHAR *tc = tess_chars_it.data();
2691  text_len += (*lengths)[i] = tc->length;
2692  (*costs)[i] = tc->cost;
2693  (*x0)[i] = tc->box.left();
2694  (*y0)[i] = tc->box.bottom();
2695  (*x1)[i] = tc->box.right();
2696  (*y1)[i] = tc->box.top();
2697  }
2698  char *p = *text = new char[text_len];
2699 
2700  tess_chars_it.move_to_first();
2701  for (tess_chars_it.mark_cycle_pt();
2702  !tess_chars_it.cycled_list();
2703  tess_chars_it.forward()) {
2704  TESS_CHAR *tc = tess_chars_it.data();
2705  strncpy(p, tc->unicode_repr, tc->length);
2706  p += tc->length;
2707  }
2708  return n;
2709 }
2710 
2712 // The resulting features are returned in int_features, which must be
2713 // of size MAX_NUM_INT_FEATURES. The number of features is returned in
2714 // num_features (or 0 if there was a failure).
2715 // On return feature_outline_index is filled with an index of the outline
2716 // corresponding to each feature in int_features.
2717 // TODO(rays) Fix the caller to out outline_counts instead.
2719  INT_FEATURE_STRUCT* int_features,
2720  int* num_features,
2721  int* feature_outline_index) {
2722  GenericVector<int> outline_counts;
2725  INT_FX_RESULT_STRUCT fx_info;
2726  tesseract_->ExtractFeatures(*blob, false, &bl_features,
2727  &cn_features, &fx_info, &outline_counts);
2728  if (cn_features.empty() || cn_features.size() > MAX_NUM_INT_FEATURES) {
2729  *num_features = 0;
2730  return; // Feature extraction failed.
2731  }
2732  *num_features = cn_features.size();
2733  memcpy(int_features, &cn_features[0], *num_features * sizeof(cn_features[0]));
2734  // TODO(rays) Pass outline_counts back and simplify the calling code.
2735  if (feature_outline_index != NULL) {
2736  int f = 0;
2737  for (int i = 0; i < outline_counts.size(); ++i) {
2738  while (f < outline_counts[i])
2739  feature_outline_index[f++] = i;
2740  }
2741  }
2742 }
2743 
2744 // This method returns the row to which a box of specified dimensions would
2745 // belong. If no good match is found, it returns NULL.
2746 ROW* TessBaseAPI::FindRowForBox(BLOCK_LIST* blocks,
2747  int left, int top, int right, int bottom) {
2748  TBOX box(left, bottom, right, top);
2749  BLOCK_IT b_it(blocks);
2750  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
2751  BLOCK* block = b_it.data();
2752  if (!box.major_overlap(block->bounding_box()))
2753  continue;
2754  ROW_IT r_it(block->row_list());
2755  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
2756  ROW* row = r_it.data();
2757  if (!box.major_overlap(row->bounding_box()))
2758  continue;
2759  WERD_IT w_it(row->word_list());
2760  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
2761  WERD* word = w_it.data();
2762  if (box.major_overlap(word->bounding_box()))
2763  return row;
2764  }
2765  }
2766  }
2767  return NULL;
2768 }
2769 
2772  int num_max_matches,
2773  int* unichar_ids,
2774  float* ratings,
2775  int* num_matches_returned) {
2776  BLOB_CHOICE_LIST* choices = new BLOB_CHOICE_LIST;
2777  tesseract_->AdaptiveClassifier(blob, choices);
2778  BLOB_CHOICE_IT choices_it(choices);
2779  int& index = *num_matches_returned;
2780  index = 0;
2781  for (choices_it.mark_cycle_pt();
2782  !choices_it.cycled_list() && index < num_max_matches;
2783  choices_it.forward()) {
2784  BLOB_CHOICE* choice = choices_it.data();
2785  unichar_ids[index] = choice->unichar_id();
2786  ratings[index] = choice->rating();
2787  ++index;
2788  }
2789  *num_matches_returned = index;
2790  delete choices;
2791 }
2792 
2794 const char* TessBaseAPI::GetUnichar(int unichar_id) {
2795  return tesseract_->unicharset.id_to_unichar(unichar_id);
2796 }
2797 
2799 const Dawg *TessBaseAPI::GetDawg(int i) const {
2800  if (tesseract_ == NULL || i >= NumDawgs()) return NULL;
2801  return tesseract_->getDict().GetDawg(i);
2802 }
2803 
2806  return tesseract_ == NULL ? 0 : tesseract_->getDict().NumDawgs();
2807 }
2808 
2810 STRING HOcrEscape(const char* text) {
2811  STRING ret;
2812  const char *ptr;
2813  for (ptr = text; *ptr; ptr++) {
2814  switch (*ptr) {
2815  case '<': ret += "&lt;"; break;
2816  case '>': ret += "&gt;"; break;
2817  case '&': ret += "&amp;"; break;
2818  case '"': ret += "&quot;"; break;
2819  case '\'': ret += "&#39;"; break;
2820  default: ret += *ptr;
2821  }
2822  }
2823  return ret;
2824 }
2825 
2826 } // namespace tesseract.
double u[max]
STRING * input_file_
Name used by training code.
Definition: baseapi.h:878
void set_deadline_msecs(inT32 deadline_msecs)
Definition: ocrclass.h:146
const char kUNLVSuspect
Definition: baseapi.cpp:91
static TESS_LOCAL int TesseractExtractResult(char **text, int **lengths, float **costs, int **x0, int **y0, int **x1, int **y1, PAGE_RES *page_res)
Definition: baseapi.cpp:2666
void DetectParagraphs(int debug_level, GenericVector< RowInfo > *row_infos, GenericVector< PARA *> *row_owners, PARA_LIST *paragraphs, GenericVector< ParagraphModel *> *models)
void ReSegmentByClassification(PAGE_RES *page_res)
Definition: applybox.cpp:509
PolyBlockType BlockType() const
void add_str_int(const char *str, int number)
Definition: strngs.cpp:381
FileReader reader_
Reads files from any filesystem.
Definition: baseapi.h:873
const char * WordFontAttributes(bool *is_bold, bool *is_italic, bool *is_underlined, bool *is_monospace, bool *is_serif, bool *is_smallcaps, int *pointsize, int *font_id) const
GenericVector< StringParam * > string_params
Definition: params.h:46
static ROW * FindRowForBox(BLOCK_LIST *blocks, int left, int top, int right, int bottom)
Definition: baseapi.cpp:2746
bool Empty(PageIteratorLevel level) const
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:64
int IsValidWord(const char *word)
Definition: baseapi.cpp:2098
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
double matcher_good_threshold
Definition: classify.h:419
virtual ~TessBaseAPI()
Definition: baseapi.cpp:137
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
#define PERF_COUNT_START(FUNCT_NAME)
virtual bool Next(PageIteratorLevel level)
const int kMaxCredibleResolution
Definition: baseapi.cpp:109
TESS_CHAR(float _cost, const char *repr, int len=-1)
Definition: baseapi.cpp:2601
virtual bool IsAtBeginningOf(PageIteratorLevel level) const
#define TRUE
Definition: capi.h:45
const TBOX & BlobBox(int index) const
Definition: boxword.h:86
Definition: points.h:189
void ReadConfigFile(const char *filename)
Definition: baseapi.cpp:468
#define round(x)
Definition: mathfix.h:34
const char * GetUnichar(int unichar_id)
Definition: baseapi.cpp:2794
TESS_LOCAL LTRResultIterator * GetLTRIterator()
Definition: baseapi.cpp:1219
char * TesseractRect(const unsigned char *imagedata, int bytes_per_pixel, int bytes_per_line, int left, int top, int width, int height)
Definition: baseapi.cpp:509
int32_t inT32
Definition: host.h:38
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void(Wordrec::* FillLatticeFunc)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: baseapi.h:90
const char * kInputFile
Definition: baseapi.cpp:96
int num_sub_langs() const
bool classify_bln_numeric_mode
Definition: classify.h:499
TESS_API int get_best_script(int orientation_id) const
Definition: osdetect.cpp:114
static const char * Version()
Definition: baseapi.cpp:144
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:644
int UNICHAR_ID
Definition: unichar.h:33
static ROW * MakeTessOCRRow(float baseline, float xheight, float descender, float ascender)
Definition: baseapi.cpp:2451
void signal_exit(int signal_code)
Definition: globaloc.cpp:52
ParamsVectors * params()
Definition: ccutil.h:62
WERD_CHOICE * best_choice
Definition: pageres.h:219
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59
bool(* FileReader)(const STRING &filename, GenericVector< char > *data)
Dict & getDict()
Definition: classify.h:65
void recog_training_segmented(const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
int length() const
Definition: ratngs.h:301
const STRING & unichar_lengths() const
Definition: ratngs.h:546
int GetSourceYResolution() const
Definition: thresholder.h:90
virtual Pix * GetPixRectThresholds()
const int kMaxBytesPerLine
Definition: baseapi.cpp:1700
virtual char * GetUTF8Text(PageIteratorLevel level) const
int RecognizeForChopTest(ETEXT_DESC *monitor)
Definition: baseapi.cpp:888
bool IsEmpty() const
Return true if no image has been set.
Definition: thresholder.cpp:50
STRING * output_file_
Name used by debug code.
Definition: baseapi.h:879
void ApplyBoxTraining(const STRING &fontname, PAGE_RES *page_res)
Definition: applybox.cpp:796
BLOCK_LIST * block_list_
The page layout.
Definition: baseapi.h:876
char * GetOsdText(int page_number)
Definition: baseapi.cpp:1904
char * GetHOCRText(ETEXT_DESC *monitor, int page_number)
Definition: baseapi.cpp:1411
virtual bool ThresholdToPix(PageSegMode pageseg_mode, Pix **pix)
Returns false on error.
bool LoadMemBuffer(const char *name, const char *data, int size)
char * GetTSVText(int page_number)
Definition: baseapi.cpp:1582
Pix * pix_binary() const
voidpf void uLong size
Definition: ioapi.h:39
int orientation_id
Definition: osdetect.h:41
int Init(const char *datapath, const char *language, OcrEngineMode mode, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_non_debug_params)
Definition: baseapi.cpp:278
#define DIR
Definition: polyaprx.cpp:39
static TBLOB * MakeTBLOB(Pix *pix)
Definition: baseapi.cpp:2468
bool GetTextDirection(int *out_offset, float *out_slope)
Definition: baseapi.cpp:2109
void SetProbabilityInContextFunc(ProbabilityInContextFunc f)
Definition: baseapi.cpp:2158
float Confidence(PageIteratorLevel level) const
#define GIT_REV
Definition: config_auto.h:14
bool PSM_OSD_ENABLED(int pageseg_mode)
Definition: publictypes.h:179
bool ProcessPagesInternal(const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
Definition: baseapi.cpp:1062
Tesseract * get_sub_lang(int index) const
void SetFillLatticeFunc(FillLatticeFunc f)
Definition: baseapi.cpp:2170
float angle() const
find angle
Definition: points.h:249
virtual Pix * GetPixRectGrey()
Definition: werd.h:36
const int kMinRectSize
Definition: baseapi.cpp:85
int GetThresholdedImageScaleFactor() const
Definition: baseapi.cpp:746
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:814
TESSLINE * outlines
Definition: blobs.h:377
ImageThresholder * thresholder_
Image thresholding module.
Definition: baseapi.h:874
static size_t getOpenCLDevice(void **device)
Definition: baseapi.cpp:164
const int kBlnXHeight
Definition: normalis.h:28
void SetEquationDetect(EquationDetect *detector)
void set_source_resolution(int ppi)
int NumDawgs() const
Definition: baseapi.cpp:2805
bool SetVariable(const char *name, const char *value)
Definition: baseapi.cpp:218
int push_back(T object)
void SetRectangle(int left, int top, int width, int height)
void SavePixForCrash(int resolution, Pix *pix)
Definition: globaloc.cpp:34
float rating() const
Definition: ratngs.h:79
UNICHARSET * unicharset
Definition: osdetect.h:78
#define tprintf(...)
Definition: tprintf.h:31
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:160
BLOCK * block
Definition: pageres.h:99
Pix * GetBinaryImage(PageIteratorLevel level) const
void BestChoiceToCorrectText()
Definition: pageres.cpp:918
const char * string() const
Definition: strngs.cpp:198
void GetAvailableLanguagesAsVector(GenericVector< STRING > *langs) const
Definition: baseapi.cpp:391
void set_text(const char *new_text)
Definition: werd.h:126
void ReadDebugConfigFile(const char *filename)
Definition: baseapi.cpp:473
PAGE_RES * page_res_
The page-level data.
Definition: baseapi.h:877
const int kBlnBaselineOffset
Definition: normalis.h:29
void * cancel_this
called whenever progress increases
Definition: ocrclass.h:127
static bool GetParamAsString(const char *name, const ParamsVectors *member_params, STRING *value)
Definition: params.cpp:135
void RunAdaptiveClassifier(TBLOB *blob, int num_max_matches, int *unichar_ids, float *ratings, int *num_matches_returned)
Definition: baseapi.cpp:2771
ROW * row
Definition: pageres.h:127
PageIterator * AnalyseLayout()
Definition: baseapi.cpp:787
voidpf uLong offset
Definition: ioapi.h:42
WERD_LIST * word_list()
Definition: ocrrow.h:52
void SetRectangle(int left, int top, int width, int height)
Definition: baseapi.cpp:582
bool empty() const
Definition: genericvector.h:90
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:305
TESS_LOCAL PAGE_RES * RecognitionPass1(BLOCK_LIST *block_list)
Definition: baseapi.cpp:2564
FILE * init_recog_training(const STRING &fname)
#define BOOL_VAR(name, val, comment)
Definition: params.h:279
float x_height() const
Definition: ocrrow.h:61
#define BOOL
Definition: capi.h:44
inT32 length() const
Definition: strngs.cpp:193
OcrEngineMode last_oem_requested_
Last ocr language mode requested.
Definition: baseapi.h:882
Boxa * GetConnectedComponents(Pixa **cc)
Definition: baseapi.cpp:652
int IntCastRounded(double x)
Definition: helpers.h:179
TESS_LOCAL bool InternalSetImage()
Definition: baseapi.cpp:2175
int size() const
Definition: genericvector.h:72
TESS_LOCAL void DetectParagraphs(bool after_text_recognition)
Definition: baseapi.cpp:2580
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
void ExtractFontName(const STRING &filename, STRING *fontname)
Definition: blobclass.cpp:46
virtual bool IsAtFinalElement(PageIteratorLevel level, PageIteratorLevel element) const
TBOX bounding_box() const
Definition: werd.cpp:160
#define UNICHAR_LEN
Definition: unichar.h:30
const char kUNLVReject
Definition: baseapi.cpp:89
bool ProcessPages(const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
Definition: baseapi.cpp:1036
tesseract::BoxWord * box_word
Definition: pageres.h:250
float oconfidence
Definition: osdetect.h:44
Tesseract * tesseract_
The underlying data object.
Definition: baseapi.h:870
ROW_RES * row() const
Definition: pageres.h:739
bool DetectOrientationScript(int *orient_deg, float *orient_conf, const char **script_name, float *script_conf)
Definition: baseapi.cpp:1873
void TidyUp(PAGE_RES *page_res)
Definition: applybox.cpp:706
#define ASSERT_HOST(x)
Definition: errcode.h:84
float sconfidence
Definition: osdetect.h:43
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
inT16 left() const
Definition: rect.h:68
STRING lang
Definition: ccutil.h:66
void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
Pix * pix_original() const
MutableIterator * GetMutableIterator()
Definition: baseapi.cpp:1253
virtual void GetImageSizes(int *left, int *top, int *width, int *height, int *imagewidth, int *imageheight)
bool GetBoolVariable(const char *name, bool *value) const
Definition: baseapi.cpp:238
inT32 length() const
Definition: rejctmap.h:235
void SetSourceResolution(int ppi)
Definition: baseapi.cpp:555
TruthCallback * truth_cb_
Definition: baseapi.h:884
const int kBytesPer64BitNumber
Definition: baseapi.cpp:1693
WERD_RES * restart_page()
Definition: pageres.h:683
const char * GetInitLanguagesAsString() const
Definition: baseapi.cpp:367
const int kUniChs[]
Definition: baseapi.cpp:1750
bool IsValidCharacter(const char *utf8_character)
Definition: baseapi.cpp:2102
double(Dict::* probability_in_context_)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Probability in context function used by the ngram permuter.
Definition: dict.h:366
const int kBytesPerBoxFileLine
Definition: baseapi.cpp:1691
TESS_LOCAL void AdaptToCharacter(const char *unichar_repr, int length, float baseline, float xheight, float descender, float ascender)
Definition: baseapi.cpp:2529
int Recognize(ETEXT_DESC *monitor)
Definition: baseapi.cpp:807
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
Definition: dict.h:414
const char * GetStringVariable(const char *name) const
Definition: baseapi.cpp:246
static void NormalizeTBLOB(TBLOB *tblob, ROW *row, bool numeric_mode)
Definition: baseapi.cpp:2498
void chomp_string(char *str)
Definition: helpers.h:82
#define PI
Definition: const.h:19
WERD_RES * forward()
Definition: pageres.h:716
int NumDawgs() const
Return the number of dawgs in the dawgs_ vector.
Definition: dict.h:412
void GetBlockTextOrientations(int **block_orientation, bool **vertical_writing)
Definition: baseapi.cpp:2386
TBLOB * make_tesseract_blob(float baseline, float xheight, float descender, float ascender, bool numeric_mode, Pix *pix)
Definition: baseapi.cpp:2511
BLOCK_LIST * FindLinesCreateBlockList()
Definition: baseapi.cpp:2434
EquationDetect * equ_detect_
The equation detector.
Definition: baseapi.h:872
Pix * pix_grey() const
void GetFeaturesForBlob(TBLOB *blob, INT_FEATURE_STRUCT *int_features, int *num_features, int *feature_outline_index)
Definition: baseapi.cpp:2718
Automatic page segmentation, but no OSD, or OCR.
Definition: publictypes.h:155
unsigned char BOOL8
Definition: host.h:44
double(Dict::* ProbabilityInContextFunc)(const char *lang, const char *context, int context_bytes, const char *character, int character_bytes)
Definition: baseapi.h:83
TESS_LOCAL int TextLength(int *blob_count)
Definition: baseapi.cpp:2325
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: thresholder.cpp:62
Definition: strngs.h:45
Pix * GetImage(PageIteratorLevel level, int padding, Pix *original_img, int *left, int *top) const
#define FALSE
Definition: capi.h:46
const char int mode
Definition: ioapi.h:38
int GetScaledEstimatedResolution() const
Definition: thresholder.h:106
C_BLOB_LIST * blob_list()
get blobs
Definition: ocrblock.h:132
void RowAttributes(float *row_height, float *descenders, float *ascenders) const
void SetImage(const unsigned char *imagedata, int width, int height, int bytes_per_pixel, int bytes_per_line)
Definition: baseapi.cpp:545
virtual TESS_LOCAL bool Threshold(Pix **pix)
Definition: baseapi.cpp:2192
void add_str_double(const char *str, double number)
Definition: strngs.cpp:391
OSBestResult best_result
Definition: osdetect.h:79
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:122
static void DeleteBlockList(BLOCK_LIST *block_list)
Definition: baseapi.cpp:2446
void SetInputImage(Pix *pix)
Definition: baseapi.cpp:920
Tesseract * tesseract() const
Definition: baseapi.h:769
TESS_LOCAL int FindLines()
Definition: baseapi.cpp:2236
const int kLatinChs[]
Definition: baseapi.cpp:1754
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:295
void AdaptToChar(TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold, ADAPT_TEMPLATES adaptive_templates)
Definition: adaptmatch.cpp:872
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:750
Pix * GetThresholdedImage()
Definition: baseapi.cpp:593
ResultIterator * GetIterator()
Definition: baseapi.cpp:1236
bool SetDebugVariable(const char *name, const char *value)
Definition: baseapi.cpp:224
void Normalize(const BLOCK *block, const FCOORD *rotation, const DENORM *predecessor, float x_origin, float y_origin, float x_scale, float y_scale, float final_xshift, float final_yshift, bool inverse, Pix *pix)
Definition: blobs.cpp:413
Boxa * GetWords(Pixa **pixa)
Definition: baseapi.cpp:642
OcrEngineMode oem() const
Definition: baseapi.h:771
bool GetDoubleVariable(const char *name, double *value) const
Definition: baseapi.cpp:252
bool DetectOS(OSResults *)
Definition: baseapi.cpp:2355
tesseract::ParamsVectors * GlobalParams()
Definition: params.cpp:33
ADAPT_TEMPLATES AdaptedTemplates
Definition: classify.h:472
int orientation_and_script_detection(STRING &filename, OSResults *osr, tesseract::Tesseract *tess)
Definition: osdetect.cpp:191
UNICHARSET unicharset
Definition: ccutil.h:68
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
const int kMaxIntSize
Definition: baseapi.cpp:102
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:125
float certainty() const
Definition: ratngs.h:328
Definition: werd.h:35
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:91
#define PERF_COUNT_END
const Dawg * GetDawg(int i) const
Definition: baseapi.cpp:2799
int first_uni() const
Definition: unichar.cpp:97
TBOX bounding_box() const
Definition: ocrrow.h:85
StrongScriptDirection WordDirection() const
STRING * language_
Last initialized language.
Definition: baseapi.h:881
static ResultIterator * StartOfParagraph(const LTRResultIterator &resit)
bool IsBinary() const
Returns true if the source image is binary.
Definition: thresholder.h:75
bool Baseline(PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const
bool stream_filelist
Definition: baseapi.cpp:80
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294
inT16 top() const
Definition: rect.h:54
STRING HOcrEscape(const char *text)
Definition: baseapi.cpp:2810
void SetDictFunc(DictFunc f)
Definition: baseapi.cpp:2144
const char * kOldVarsFile
Definition: baseapi.cpp:100
#define MAX(x, y)
Definition: ndminx.h:24
void SetSourceYResolution(int ppi)
Definition: thresholder.h:86
virtual void Clear()
Destroy the Pix if there is one, freeing memory.
Definition: thresholder.cpp:45
virtual bool Next(PageIteratorLevel level)
Tesseract * osd_tesseract_
For orientation & script detection.
Definition: baseapi.h:871
void DumpPGM(const char *filename)
Definition: baseapi.cpp:754
static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
Definition: intfx.cpp:445
int(Dict::* DictFunc)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: baseapi.h:81
#define INVALID_HANDLE_VALUE
Definition: iowin32.c:17
const STRING & unichar_string() const
Definition: ratngs.h:539
Definition: rect.h:30
const int kNumbersPerBlob
Definition: baseapi.cpp:1680
void(Wordrec::* fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:419
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:60
Boxa * GetTextlines(const bool raw_image, const int raw_padding, Pixa **pixa, int **blockids, int **paraids)
Definition: baseapi.cpp:619
void SetPageSegMode(PageSegMode mode)
Definition: baseapi.cpp:482
STRING * datapath_
Current location of tessdata.
Definition: baseapi.h:880
void SetOutputName(const char *name)
Definition: baseapi.cpp:211
const char * GetInputName()
Definition: baseapi.cpp:924
static void ClearPersistentCache()
Definition: baseapi.cpp:2090
Definition: blobs.h:261
char * GetBoxText(int page_number)
Definition: baseapi.cpp:1709
#define MAX_NUM_INT_FEATURES
Definition: intproto.h:132
GenericVector< IntParam * > int_params
Definition: params.h:44
void set_pix_thresholds(Pix *thresholds)
void DeleteUnusedDawgs()
Definition: dawg_cache.h:43
static void ResetToDefaults(ParamsVectors *member_params)
Definition: params.cpp:198
const char * filename
Definition: ioapi.h:38
float y() const
Definition: points.h:212
PageSegMode GetPageSegMode() const
Definition: baseapi.cpp:489
void GetLoadedLanguagesAsVector(GenericVector< STRING > *langs) const
Definition: baseapi.cpp:377
WERD * word
Definition: pageres.h:175
bool AddImage(TessBaseAPI *api)
Definition: renderer.cpp:83
const char * c_str() const
Definition: strngs.cpp:209
virtual R Run()=0
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:185
inT16 right() const
Definition: rect.h:75
int OrientationIdToValue(const int &id)
Definition: osdetect.cpp:565
void PrintVariables(FILE *fp) const
Definition: baseapi.cpp:266
voidpf void * buf
Definition: ioapi.h:39
void TrainLineRecognizer(const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list)
Definition: linerec.cpp:45
#define PERF_COUNT_SUB(SUB)
const char * GetDatapath()
Definition: baseapi.cpp:930
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
bool PTIsTextType(PolyBlockType type)
Definition: publictypes.h:70
ELISTIZEH(AmbigSpec)
void InitAdaptiveClassifier(TessdataManager *mgr)
Definition: adaptmatch.cpp:527
bool WriteTRFile(const STRING &filename)
Definition: blobclass.cpp:97
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:217
const char kTesseractReject
Definition: baseapi.cpp:87
TESS_LOCAL PAGE_RES * RecognitionPass2(BLOCK_LIST *block_list, PAGE_RES *pass1_result)
Definition: baseapi.cpp:2571
WERD_RES * word() const
Definition: pageres.h:736
bool wordrec_run_blamer
Definition: wordrec.h:168
bool BoundingBoxInternal(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
inT16 bottom() const
Definition: rect.h:61
void pgeditor_main(int width, int height, PAGE_RES *page_res)
Definition: pgedit.cpp:337
bool ProcessPage(Pix *pix, int page_index, const char *filename, const char *retry_config, int timeout_millisec, TessResultRenderer *renderer)
Definition: baseapi.cpp:1154
PAGE_RES * ApplyBoxes(const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
Definition: applybox.cpp:117
STRING datadir
Definition: ccutil.h:64
bool GetIntVariable(const char *name, int *value) const
Definition: baseapi.cpp:230
bool BoundingBox(PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const
void set_min_orientation_margin(double margin)
Definition: baseapi.cpp:2368
Boxa * GetComponentImages(const PageIteratorLevel level, const bool text_only, const bool raw_image, const int raw_padding, Pixa **pixa, int **blockids, int **paraids)
Definition: baseapi.cpp:664
int SegmentPage(const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:198
const int kBytesPerNumber
Definition: baseapi.cpp:1685
bool recognition_done_
page_res_ contains recognition data.
Definition: baseapi.h:883
void set_pix_grey(Pix *grey_pix)
GenericVector< DoubleParam * > double_params
Definition: params.h:47
bool GetVariableAsString(const char *name, STRING *val)
Definition: baseapi.cpp:261
bool BeginDocument(const char *title)
Definition: renderer.cpp:72
#define MAX_PATH
Definition: platform.h:49
const int kMinCredibleResolution
Definition: baseapi.cpp:107
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:334
float base_line(float xpos) const
Definition: ocrrow.h:56
const char * string() const
Definition: params.h:202
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:244
static TBLOB * PolygonalCopy(bool allow_detailed_fx, C_BLOB *src)
Definition: blobs.cpp:344
ELISTIZE(AmbigSpec)
void delete_data_pointers()
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
TBOX bounding_box() const
Definition: blobs.cpp:482
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: applybox.cpp:253
Definition: werd.h:60
Orientation and script detection only.
Definition: publictypes.h:152
void CorrectClassifyWords(PAGE_RES *page_res)
Definition: applybox.cpp:772
REJMAP reject_map
Definition: pageres.h:271
void SetInputName(const char *name)
Definition: baseapi.cpp:203
#define TESSERACT_VERSION_STR
Definition: baseapi.h:23
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:300
int InitLangMod(const char *datapath, const char *language)
Definition: baseapi.cpp:443
uinT8 space()
Definition: werd.h:104
Definition: ocrrow.h:32
GenericVector< ParagraphModel * > * paragraph_models_
Definition: baseapi.h:875
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:173
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:120
void ClearAdaptiveClassifier()
Definition: baseapi.cpp:531
Boxa * GetStrips(Pixa **pixa, int **blockids)
Definition: baseapi.cpp:633
Boxa * GetRegions(Pixa **pixa)
Definition: baseapi.cpp:607
int(Dict::* letter_is_okay_)(void *void_dawg_args, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:356
Definition: ocrblock.h:30
bool AnyLSTMLang() const
void set_pix_original(Pix *original_pix)
const char * WordRecognitionLanguage() const
bool tessedit_resegment_from_line_boxes
void split(const char c, GenericVector< STRING > *splited)
Definition: strngs.cpp:286
BLOCK_RES * block() const
Definition: pageres.h:742
virtual void Run(A1, A2, A3, A4)=0
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:415
static void CatchSignals()
Definition: baseapi.cpp:184
float rating() const
Definition: ratngs.h:325
bool AdaptToWordStr(PageSegMode mode, const char *wordstr)
Definition: baseapi.cpp:1979
int init_tesseract_lm(const char *arg0, const char *textbase, const char *language, TessdataManager *mgr)
Definition: tessedit.cpp:457
int GetScaledYResolution() const
Definition: thresholder.h:93
char * GetUTF8Text(PageIteratorLevel level) const
GenericVector< BoolParam * > bool_params
Definition: params.h:45
void Orientation(tesseract::Orientation *orientation, tesseract::WritingDirection *writing_direction, tesseract::TextlineOrder *textline_order, float *deskew_angle) const