39 #include "config_auto.h"    44 #include "allheaders.h"    56                   "Take segmentation and labeling from box file",
    58       BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,
    59                   "Conversion of word/line box file to char box file",
    62                   "Generate training data from boxed chars", this->params()),
    64                   "Generate more boxes from boxed chars", this->params()),
    66                   "Break input into lines and remap boxes if present",
    69                   "Dump intermediate images made during page segmentation",
    75           "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"    76           " 5=line, 6=word, 7=char"    77           " (Values from PageSegMode enum in publictypes.h)",
    80                       "Which OCR engine(s) to run (Tesseract, LSTM, both)."    81                       " Defaults to loading and running the most accurate"    85                     "Blacklist of chars not to recognize", this->params()),
    87                     "Whitelist of chars to recognize", this->params()),
    89                     "List of chars to override tessedit_char_blacklist",
    92                   "Perform training for ambiguities", this->params()),
    95                  "Whether to use the top-line splitting process for Devanagari "    96                  "documents while performing page-segmentation.",
   100                  "Whether to use the top-line splitting process for Devanagari "   101                  "documents while performing ocr.",
   104                     "Write all parameters to the given file.", this->params()),
   106                   "Generate and print debug"   107                   " information for adaption",
   109       INT_MEMBER(bidi_debug, 0, 
"Debug level for BiDi", this->params()),
   110       INT_MEMBER(applybox_debug, 1, 
"Debug level", this->params()),
   111       INT_MEMBER(applybox_page, 0, 
"Page number to apply boxes from",
   114                     "Exposure value follows"   115                     " this pattern in the image filename. The name of the image"   116                     " files are expected to be in the form"   117                     " [lang].[fontname].exp[num].tif",
   119       BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
   120                   "Learn both character fragments (as is done in the"   121                   " special low exposure mode) as well as unfragmented"   126                   " is assumed to contain ngrams. Only learn the ngrams"   127                   " whose outlines overlap horizontally.",
   129       BOOL_MEMBER(tessedit_display_outwords, false, 
"Draw output words",
   131       BOOL_MEMBER(tessedit_dump_choices, false, 
"Dump char choices",
   133       BOOL_MEMBER(tessedit_timing_debug, false, 
"Print timing stats",
   136                   "Try to improve fuzzy spaces", this->params()),
   138                   "Don't bother with word plausibility", this->params()),
   139       BOOL_MEMBER(tessedit_fix_hyphens, true, 
"Crunch double hyphens?",
   141       BOOL_MEMBER(tessedit_redo_xheight, true, 
"Check/Correct x-height",
   144                   "Add words to the document dictionary", this->params()),
   145       BOOL_MEMBER(tessedit_debug_fonts, false, 
"Output font info per char",
   147       BOOL_MEMBER(tessedit_debug_block_rejection, false, 
"Block and Row stats",
   149       BOOL_MEMBER(tessedit_enable_bigram_correction, true,
   150                   "Enable correction based on the word bigram dictionary.",
   152       BOOL_MEMBER(tessedit_enable_dict_correction, false,
   153                   "Enable single word correction based on the dictionary.",
   156                  "Amount of debug output for bigram correction.",
   159                   "Remove and conditionally reassign small outlines when they"   160                   " confuse layout analysis, determining diacritics vs noise",
   162       INT_MEMBER(debug_noise_removal, 0, 
"Debug reassignment of small outlines",
   168                     "Hingepoint for base char certainty", this->params()),
   172                     "Hingepoint for disjoint certainty", this->params()),
   176                     "Threshold for new punc char certainty", this->params()),
   179                     "Scaling on certainty diff from Hingepoint",
   181       INT_MEMBER(noise_maxperblob, 8, 
"Max diacritics to apply to a blob",
   183       INT_MEMBER(noise_maxperword, 16, 
"Max diacritics to apply to a word",
   185       INT_MEMBER(debug_x_ht_level, 0, 
"Reestimate debug", this->params()),
   186       BOOL_MEMBER(debug_acceptable_wds, false, 
"Dump word pass/fail chk",
   188       STRING_MEMBER(chs_leading_punct, 
"('`\"", 
"Leading punctuation",
   190       STRING_MEMBER(chs_trailing_punct1, 
").,;:?!", 
"1st Trailing punctuation",
   192       STRING_MEMBER(chs_trailing_punct2, 
")'`\"", 
"2nd Trailing punctuation",
   195                     "good_quality_doc lte rejection limit", this->params()),
   197                     "good_quality_doc gte good blobs limit", this->params()),
   199                     "good_quality_doc lte outline error limit", this->params()),
   201                     "good_quality_doc gte good char limit", this->params()),
   202       INT_MEMBER(quality_min_initial_alphas_reqd, 2, 
"alphas in a good word",
   205                  "Adaptation decision algorithm for tess", this->params()),
   207                   "Do minimal rejection on pass 1 output", this->params()),
   208       BOOL_MEMBER(tessedit_test_adaption, false, 
"Test adaption criteria",
   210       BOOL_MEMBER(tessedit_matcher_log, false, 
"Log matcher activity",
   213                  "Adaptation decision algorithm for tess", this->params()),
   214       BOOL_MEMBER(test_pt, false, 
"Test for point", this->params()),
   215       double_MEMBER(test_pt_x, 99999.99, 
"xcoord", this->params()),
   216       double_MEMBER(test_pt_y, 99999.99, 
"ycoord", this->params()),
   217       INT_MEMBER(multilang_debug_level, 0, 
"Print multilang debug info.",
   219       INT_MEMBER(paragraph_debug_level, 0, 
"Print paragraph debug info.",
   222                   "Run paragraph detection on the post-text-recognition "   226                   "Use ratings matrix/beam search with lstm", this->params()),
   227       STRING_MEMBER(outlines_odd, 
"%| ", 
"Non standard number of outlines",
   229       STRING_MEMBER(outlines_2, 
"ij!?%\":;", 
"Non standard number of outlines",
   232                   "Allow outline errs in unrejection?", this->params()),
   234                   "Reduce rejection on good docs", this->params()),
   235       BOOL_MEMBER(tessedit_use_reject_spaces, true, 
"Reject spaces?",
   238                     "%rej allowed before rej whole doc", this->params()),
   240                     "%rej allowed before rej whole block", this->params()),
   242                     "%rej allowed before rej whole row", this->params()),
   244                     "Number of row rejects in whole word rejects"   245                     "which prevents whole row rejection",
   247       BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
   248                   "Only rej partially rejected words in block rejection",
   250       BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,
   251                   "Only rej partially rejected words in row rejection",
   254                   "Use word segmentation quality metric", this->params()),
   256                   "Use word segmentation quality metric", this->params()),
   258                  "Only preserve wds longer than this", this->params()),
   260                   "Apply row rejection to good docs", this->params()),
   262                     "rej good doc wd if more than this fraction rejected",
   265                   "Reject all bad quality wds", this->params()),
   266       BOOL_MEMBER(tessedit_debug_doc_rejection, false, 
"Page stats",
   269                   "Output data to debug file", this->params()),
   270       BOOL_MEMBER(bland_unrej, false, 
"unrej potential with no checks",
   273                     "good_quality_doc gte good char limit", this->params()),
   275                   "Mark v.bad words for tilde crunch", this->params()),
   276       BOOL_MEMBER(hocr_font_info, false, 
"Add font info to hocr output",
   278       BOOL_MEMBER(crunch_early_merge_tess_fails, true, 
"Before word crunch?",
   280       BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
   281                   "Take out ~^ early?", this->params()),
   282       double_MEMBER(crunch_terrible_rating, 80.0, 
"crunch rating lt this",
   284       BOOL_MEMBER(crunch_terrible_garbage, true, 
"As it says", this->params()),
   286                     "crunch garbage cert lt this", this->params()),
   288                     "crunch garbage rating lt this", this->params()),
   289       double_MEMBER(crunch_pot_poor_rate, 40, 
"POTENTIAL crunch rating lt this",
   291       double_MEMBER(crunch_pot_poor_cert, -8.0, 
"POTENTIAL crunch cert lt this",
   293       BOOL_MEMBER(crunch_pot_garbage, true, 
"POTENTIAL crunch garbage",
   295       double_MEMBER(crunch_del_rating, 60, 
"POTENTIAL crunch rating lt this",
   297       double_MEMBER(crunch_del_cert, -10.0, 
"POTENTIAL crunch cert lt this",
   299       double_MEMBER(crunch_del_min_ht, 0.7, 
"Del if word ht lt xht x this",
   301       double_MEMBER(crunch_del_max_ht, 3.0, 
"Del if word ht gt xht x this",
   304                     "Del if word width lt xht x this", this->params()),
   306                     "Del if word gt xht x this above bl", this->params()),
   308                     "Del if word gt xht x this below bl", this->params()),
   309       double_MEMBER(crunch_small_outlines_size, 0.6, 
"Small if lt xht x this",
   311       INT_MEMBER(crunch_rating_max, 10, 
"For adj length in rating per ch",
   314                  "How many potential indicators needed", this->params()),
   315       BOOL_MEMBER(crunch_leave_ok_strings, true, 
"Don't touch sensible strings",
   317       BOOL_MEMBER(crunch_accept_ok, true, 
"Use acceptability in okstring",
   320                   "Don't pot crunch sensible strings", this->params()),
   321       BOOL_MEMBER(crunch_include_numerals, false, 
"Fiddle alpha figures",
   324                  "Don't crunch words with long lower case strings",
   327                  "Don't crunch words with long lower case strings",
   330                  "Crunch words with long repetitions", this->params()),
   331       INT_MEMBER(crunch_debug, 0, 
"As it says", this->params()),
   333                  "How many non-noise blbs either side?", this->params()),
   334       double_MEMBER(fixsp_small_outlines_size, 0.28, 
"Small if lt xht x this",
   337                   "Reward punctation joins", this->params()),
   338       INT_MEMBER(fixsp_done_mode, 1, 
"What constitues done for spacing",
   340       INT_MEMBER(debug_fix_space_level, 0, 
"Contextual fixspace debug",
   343                     "Punct. chs expected WITHIN numbers", this->params()),
   345                  "Max allowed deviation of blob top outside of font data",
   348                  "Min change in xht before actually trying it", this->params()),
   350                  "Debug level for sub & superscript fixer", this->params()),
   352           superscript_worse_certainty, 2.0,
   353           "How many times worse "   354           "certainty does a superscript position glyph need to be for "   355           "us to try classifying it as a char with a different "   359           superscript_bettered_certainty, 0.97,
   361           "badness do we think sufficient to choose a superscript "   362           "over what we'd thought.  For example, a value of 0.6 means "   363           "we want to reduce badness of certainty by at least 40%",
   366                     "A superscript scaled down more than this is unbelievably "   367                     "small.  For example, 0.3 means we expect the font size to "   368                     "be no smaller than 30% of the text line font size.",
   371                     "Maximum top of a character measured as a multiple of "   372                     "x-height above the baseline for us to reconsider whether "   376                     "Minimum bottom of a character measured as a multiple of "   377                     "x-height above the baseline for us to reconsider whether "   378                     "it's a superscript.",
   380       BOOL_MEMBER(tessedit_write_block_separators, false,
   381                   "Write block separators in output", this->params()),
   382       BOOL_MEMBER(tessedit_write_rep_codes, false, 
"Write repetition char code",
   384       BOOL_MEMBER(tessedit_write_unlv, false, 
"Write .unlv output file",
   386       BOOL_MEMBER(tessedit_create_txt, false, 
"Write .txt output file",
   388       BOOL_MEMBER(tessedit_create_hocr, false, 
"Write .html hOCR output file",
   390       BOOL_MEMBER(tessedit_create_tsv, false, 
"Write .tsv output file",
   392       BOOL_MEMBER(tessedit_create_pdf, false, 
"Write .pdf output file",
   395                   "Create PDF with only one invisible text layer",
   398                     "Output char for unidentified blobs", this->params()),
   399       INT_MEMBER(suspect_level, 99, 
"Suspect marker level", this->params()),
   401                  "Min suspect level for rejecting spaces", this->params()),
   403                  "Don't suspect dict wds longer than this", this->params()),
   404       BOOL_MEMBER(suspect_constrain_1Il, false, 
"UNLV keep 1Il chars rejected",
   407                     "Don't touch bad rating limit", this->params()),
   408       double_MEMBER(suspect_accept_rating, -999.9, 
"Accept good rating limit",
   411                   "Only reject tess failures", this->params()),
   412       BOOL_MEMBER(tessedit_zero_rejection, false, 
"Don't reject ANYTHING",
   415                   "Make output have exactly one word per WERD", this->params()),
   417                   "Don't reject ANYTHING AT ALL", this->params()),
   419                   "Force all rep chars the same", this->params()),
   420       INT_MEMBER(tessedit_reject_mode, 0, 
"Rejection algorithm",
   422       BOOL_MEMBER(tessedit_rejection_debug, false, 
"Adaption debug",
   424       BOOL_MEMBER(tessedit_flip_0O, true, 
"Contextual 0O O0 flips",
   427                     "Aspect ratio dot/hyphen test", this->params()),
   429                     "Aspect ratio dot/hyphen test", this->params()),
   431                   "Use DOC dawg in 11l conf. detector", this->params()),
   432       BOOL_MEMBER(rej_1Il_use_dict_word, false, 
"Use dictword test",
   434       BOOL_MEMBER(rej_1Il_trust_permuter_type, true, 
"Don't double check",
   436       BOOL_MEMBER(rej_use_tess_accepted, true, 
"Individual rejection control",
   438       BOOL_MEMBER(rej_use_tess_blanks, true, 
"Individual rejection control",
   440       BOOL_MEMBER(rej_use_good_perm, true, 
"Individual rejection control",
   442       BOOL_MEMBER(rej_use_sensible_wd, false, 
"Extend permuter check",
   444       BOOL_MEMBER(rej_alphas_in_number_perm, false, 
"Extend permuter check",
   447                     "if >this fract", this->params()),
   448       INT_MEMBER(tessedit_image_border, 2, 
"Rej blbs near image edge limit",
   451                     "Allow NN to unrej", this->params()),
   452       STRING_MEMBER(conflict_set_I_l_1, 
"Il1[]", 
"Il1 conflict set",
   454       INT_MEMBER(min_sane_x_ht_pixels, 8, 
"Reject any x-ht lt or eq than this",
   456       BOOL_MEMBER(tessedit_create_boxfile, false, 
"Output text with boxes",
   460                  " , else specific page to process",
   463                   "Capture the image from the IPE", this->params()),
   464       BOOL_MEMBER(interactive_display_mode, false, 
"Run interactively?",
   466       STRING_MEMBER(file_type, 
".tif", 
"Filename extension", this->params()),
   467       BOOL_MEMBER(tessedit_override_permuter, true, 
"According to dict_word",
   470                     "List of languages to load with this one", this->params()),
   471       BOOL_MEMBER(tessedit_use_primary_params_model, false,
   472                   "In multilingual mode use params model of the"   476                     "Min acceptable orientation margin", this->params()),
   477       BOOL_MEMBER(textord_tabfind_show_vlines, false, 
"Debug line finding",
   482                   "Allow feature extractors to see the original outline",
   485                        "Only initialize with the config file. Useful if the "   486                        "instance is not going to be used for OCR but say only "   487                        "for layout analysis.",
   489       BOOL_MEMBER(textord_equation_detect, false, 
"Turn on equation detector",
   492                   "Enable vertical detection", this->params()),
   493       BOOL_MEMBER(textord_tabfind_force_vertical_text, false,
   494                   "Force using vertical text page mode", this->params()),
   496           textord_tabfind_vertical_text_ratio, 0.5,
   497           "Fraction of textlines deemed vertical to use vertical page "   501           textord_tabfind_aligned_gap_fraction, 0.75,
   502           "Fraction of height used as a minimum gap for aligned blobs.",
   504       INT_MEMBER(tessedit_parallelize, 0, 
"Run in parallel where possible",
   507                   "Preserve multiple interword spaces", this->params()),
   509                   "Include page separator string in output text after each "   513                     "Page separator (default is form feed control character)",
   525       BOOL_MEMBER(textord_tabfind_vertical_horizontal_mix, true,
   526                   "find horizontal lines such as headers in vertical page mode",
   528       INT_MEMBER(tessedit_ok_mode, 5, 
"Acceptance decision algorithm",
   531                        "Load fixed length dawgs"   532                        " (e.g. for non-space delimited languages)",
   534       INT_MEMBER(segment_debug, 0, 
"Debug the whole segmentation process",
   536       BOOL_MEMBER(permute_debug, 0, 
"Debug char permutation process",
   539                     "Multiplying factor of"   540                     " current best rate to prune other hypotheses",
   543                   "Turn on word script consistency permuter", this->params()),
   545                   "incorporate segmentation cost in word rating?",
   548                     "Score multipler for script consistency within a word. "   549                     "Being a 'reward' factor, it should be <= 1. "   550                     "Smaller value implies bigger reward.",
   553                   "Turn on fixed-length phrasebook search permuter",
   556                   "Turn on character type (property) consistency permuter",
   559                     "Score multipler for char type consistency within a word. ",
   562                     "Score multipler for ngram permuter's best choice"   563                     " (only used in the Han script path).",
   566                   "Activate character-level n-gram-based permuter",
   568       BOOL_MEMBER(permute_only_top, false, 
"Run only the top choice permuter",
   570       INT_MEMBER(language_model_fixed_length_choices_depth, 3,
   571                  "Depth of blob choice lists to explore"   572                  " when fixed length dawgs are on",
   575                   "use new state cost heuristics for segmentation state"   579                     "base factor for adding segmentation cost into word rating."   580                     "It's a multiplying factor, the larger the value above 1, "   581                     "the bigger the effect of segmentation cost.",
   584                     "weight associated with char rating in combined cost of"   588                     "weight associated with width evidence in combined cost of"   592                     "weight associated with seam cut in combined cost of state",
   595                     "max char width-to-height ratio allowed in segmentation",
   598                   "Enable new segmentation search path.", this->params()),
   600                     "Maximum character width-to-height ratio for"   601                     " fixed-pitch fonts",
   605       backup_config_file_(NULL),
   609       pix_thresholds_(NULL),
   610       source_resolution_(0),
   612       right_to_left_(false),
   617       most_recently_used_(this),
   620 #ifndef ANDROID_BUILD
   621       lstm_recognizer_(NULL),
   623       train_line_page_num_(0) {
   628   pixDestroy(&pix_original_);
   630   sub_langs_.delete_data_pointers();
   631 #ifndef ANDROID_BUILD   632   delete lstm_recognizer_;
   633   lstm_recognizer_ = NULL;
   640   pixDestroy(&pix_binary_);
   641   pixDestroy(&pix_grey_);
   642   pixDestroy(&pix_thresholds_);
   643   pixDestroy(&scaled_color_);
   644   deskew_ = 
FCOORD(1.0f, 0.0f);
   645   reskew_ = 
FCOORD(1.0f, 0.0f);
   648   for (
int i = 0; i < sub_langs_.size(); ++i)
   649     sub_langs_[i]->
Clear();
   653   equ_detect_ = detector;
   660   for (
int i = 0; i < sub_langs_.size(); ++i) {
   661     sub_langs_[i]->ResetAdaptiveClassifierInternal();
   668   for (
int i = 0; i < sub_langs_.size(); ++i) {
   669     sub_langs_[i]->getDict().ResetDocumentDictionary();
   679   for (
int i = 0; i < sub_langs_.size(); ++i) {
   680     sub_langs_[i]->unicharset.set_black_and_whitelist(
   694   for (
int i = 0; i < sub_langs_.size(); ++i) {
   697         static_cast<inT32>(sub_langs_[i]->pageseg_devanagari_split_strategy));
   698     if (pageseg_strategy > max_pageseg_strategy)
   699       max_pageseg_strategy = pageseg_strategy;
   700     pixDestroy(&sub_langs_[i]->pix_binary_);
   701     sub_langs_[i]->pix_binary_ = pixClone(
pix_binary());
   707   if (splitter_.
Split(
true, &pixa_debug_)) {
   709     pixDestroy(&pix_binary_);
   725   for (
int i = 0; i < sub_langs_.size(); ++i) {
   728         static_cast<inT32>(sub_langs_[i]->ocr_devanagari_split_strategy));
   729     if (ocr_strategy > max_ocr_strategy)
   730       max_ocr_strategy = ocr_strategy;
   736   bool split_for_ocr = splitter_.
Split(
false, &pixa_debug_);
   739   pixDestroy(&pix_binary_);
   740   pix_binary_ = pixClone(splitter_.
orig_pix());
   745     BLOCK block(
"", 
TRUE, 0, 0, 0, 0, pixGetWidth(pix_binary_),
   746                 pixGetHeight(pix_binary_));
 
void ResetAdaptiveClassifierInternal()
 
void set_segmentation_block_list(BLOCK_LIST *block_list)
 
void set_use_cjk_fp_model(bool flag)
 
int pageseg_devanagari_split_strategy
 
#define double_MEMBER(name, val, comment, vec)
 
void set_ocr_split_strategy(SplitStrategy strategy)
 
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
 
void ResetAdaptiveClassifier()
 
void SetEquationDetect(EquationDetect *detector)
 
Assume a single uniform block of text. (Default.) 
 
const char * string() const
 
bool textord_use_cjk_fp_model
 
char * tessedit_char_whitelist
 
void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
 
void ResetDocumentDictionary()
 
void set_pageseg_split_strategy(SplitStrategy strategy)
 
bool HasDifferentSplitStrategies() const
 
char * tessedit_char_blacklist
 
C_BLOB_LIST * blob_list()
get blobs 
 
void WritePDF(const char *filename)
 
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
 
#define INT_INIT_MEMBER(name, val, comment, vec)
 
void ResetDocumentDictionary()
 
bool Split(bool split_for_pageseg, DebugPixa *pixa_debug)
 
void SetBlackAndWhitelist()
 
#define STRING_MEMBER(name, val, comment, vec)
 
void set_orig_pix(Pix *pix)
 
#define BOOL_INIT_MEMBER(name, val, comment, vec)
 
void extract_edges(Pix *pix, BLOCK *block)
 
void SetLangTesseract(Tesseract *lang_tesseract)
 
#define BOOL_MEMBER(name, val, comment, vec)
 
char * tessedit_char_unblacklist
 
#define INT_MEMBER(name, val, comment, vec)
 
int ocr_devanagari_split_strategy