tesseract  4.00.00dev
ScriptDetector Class Reference

#include <osdetect.h>

Public Member Functions

 ScriptDetector (const GenericVector< int > *allowed_scripts, OSResults *osr, tesseract::Tesseract *tess)
 
void detect_blob (BLOB_CHOICE_LIST *scores)
 
bool must_stop (int orientation)
 

Detailed Description

Definition at line 93 of file osdetect.h.

Constructor & Destructor Documentation

◆ ScriptDetector()

ScriptDetector::ScriptDetector ( const GenericVector< int > *  allowed_scripts,
OSResults *  osr,
tesseract::Tesseract *  tess 
)

Definition at line 450 of file osdetect.cpp.

451  {
452  osr_ = osr;
453  tess_ = tess;
454  allowed_scripts_ = allowed_scripts;
455  katakana_id_ = tess_->unicharset.add_script(katakana_script);
456  hiragana_id_ = tess_->unicharset.add_script(hiragana_script);
457  han_id_ = tess_->unicharset.add_script(han_script);
458  hangul_id_ = tess_->unicharset.add_script(hangul_script);
459  japanese_id_ = tess_->unicharset.add_script(japanese_script_);
460  korean_id_ = tess_->unicharset.add_script(korean_script_);
461  latin_id_ = tess_->unicharset.add_script(latin_script);
462  fraktur_id_ = tess_->unicharset.add_script(fraktur_script_);
463 }
int add_script(const char *script)
UNICHARSET unicharset
Definition: ccutil.h:68

Member Function Documentation

◆ detect_blob()

void ScriptDetector::detect_blob ( BLOB_CHOICE_LIST *  scores)

Definition at line 468 of file osdetect.cpp.

468  {
469  bool done[kMaxNumberOfScripts];
470  for (int i = 0; i < 4; ++i) {
471  for (int j = 0; j < kMaxNumberOfScripts; ++j)
472  done[j] = false;
473 
474  BLOB_CHOICE_IT choice_it;
475  choice_it.set_to_list(scores + i);
476 
477  float prev_score = -1;
478  int script_count = 0;
479  int prev_id = -1;
480  int prev_fontinfo_id = -1;
481  const char* prev_unichar = "";
482  const char* unichar = "";
483 
484  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
485  choice_it.forward()) {
486  BLOB_CHOICE* choice = choice_it.data();
487  int id = choice->script_id();
488  if (allowed_scripts_ != NULL && !allowed_scripts_->empty()) {
489  // Check that the choice is in an allowed script.
490  int s = 0;
491  for (s = 0; s < allowed_scripts_->size(); ++s) {
492  if ((*allowed_scripts_)[s] == id) break;
493  }
494  if (s == allowed_scripts_->size()) continue; // Not found in list.
495  }
496  // Script already processed before.
497  if (done[id]) continue;
498  done[id] = true;
499 
500  unichar = tess_->unicharset.id_to_unichar(choice->unichar_id());
501  // Save data from the first match
502  if (prev_score < 0) {
503  prev_score = -choice->certainty();
504  script_count = 1;
505  prev_id = id;
506  prev_unichar = unichar;
507  prev_fontinfo_id = choice->fontinfo_id();
508  } else if (-choice->certainty() < prev_score + kNonAmbiguousMargin) {
509  ++script_count;
510  }
511 
512  if (strlen(prev_unichar) == 1)
513  if (unichar[0] >= '0' && unichar[0] <= '9')
514  break;
515 
516  // if script_count is >= 2, character is ambiguous, skip other matches
517  // since they are useless.
518  if (script_count >= 2)
519  break;
520  }
521  // Character is non ambiguous
522  if (script_count == 1) {
523  // Update the score of the winning script
524  osr_->scripts_na[i][prev_id] += 1.0;
525 
526  // Workaround for Fraktur
527  if (prev_id == latin_id_) {
528  if (prev_fontinfo_id >= 0) {
529  const tesseract::FontInfo &fi =
530  tess_->get_fontinfo_table().get(prev_fontinfo_id);
531  //printf("Font: %s i:%i b:%i f:%i s:%i k:%i (%s)\n", fi.name,
532  // fi.is_italic(), fi.is_bold(), fi.is_fixed_pitch(),
533  // fi.is_serif(), fi.is_fraktur(),
534  // prev_unichar);
535  if (fi.is_fraktur()) {
536  osr_->scripts_na[i][prev_id] -= 1.0;
537  osr_->scripts_na[i][fraktur_id_] += 1.0;
538  }
539  }
540  }
541 
542  // Update Japanese / Korean pseudo-scripts
543  if (prev_id == katakana_id_)
544  osr_->scripts_na[i][japanese_id_] += 1.0;
545  if (prev_id == hiragana_id_)
546  osr_->scripts_na[i][japanese_id_] += 1.0;
547  if (prev_id == hangul_id_)
548  osr_->scripts_na[i][korean_id_] += 1.0;
549  if (prev_id == han_id_) {
550  osr_->scripts_na[i][korean_id_] += kHanRatioInKorean;
551  osr_->scripts_na[i][japanese_id_] += kHanRatioInJapanese;
552  }
553  }
554  } // iterate over each orientation
555 }
float scripts_na[4][kMaxNumberOfScripts]
Definition: osdetect.h:76
const float kHanRatioInJapanese
Definition: osdetect.cpp:45
bool empty() const
Definition: genericvector.h:90
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:344
int size() const
Definition: genericvector.h:72
float certainty() const
Definition: ratngs.h:82
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
inT16 fontinfo_id() const
Definition: ratngs.h:85
UNICHARSET unicharset
Definition: ccutil.h:68
int script_id() const
Definition: ratngs.h:111
bool is_fraktur() const
Definition: fontinfo.h:115
const float kHanRatioInKorean
Definition: osdetect.cpp:44
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76
const float kNonAmbiguousMargin
Definition: osdetect.cpp:47
const int kMaxNumberOfScripts
Definition: osdetect.h:36

◆ must_stop()

bool ScriptDetector::must_stop ( int  orientation)

Definition at line 557 of file osdetect.cpp.

557  {
558  osr_->update_best_script(orientation);
559  return osr_->best_result.sconfidence > 1;
560 }
float sconfidence
Definition: osdetect.h:43
void update_best_script(int orientation_id)
Definition: osdetect.cpp:91
OSBestResult best_result
Definition: osdetect.h:79

The documentation for this class was generated from the following files: