39 #include "config_auto.h" 44 #include "allheaders.h" 56 "Take segmentation and labeling from box file",
58 BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,
59 "Conversion of word/line box file to char box file",
62 "Generate training data from boxed chars", this->params()),
64 "Generate more boxes from boxed chars", this->params()),
66 "Break input into lines and remap boxes if present",
69 "Dump intermediate images made during page segmentation",
75 "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," 76 " 5=line, 6=word, 7=char" 77 " (Values from PageSegMode enum in publictypes.h)",
80 "Which OCR engine(s) to run (Tesseract, LSTM, both)." 81 " Defaults to loading and running the most accurate" 85 "Blacklist of chars not to recognize", this->params()),
87 "Whitelist of chars to recognize", this->params()),
89 "List of chars to override tessedit_char_blacklist",
92 "Perform training for ambiguities", this->params()),
95 "Whether to use the top-line splitting process for Devanagari " 96 "documents while performing page-segmentation.",
100 "Whether to use the top-line splitting process for Devanagari " 101 "documents while performing ocr.",
104 "Write all parameters to the given file.", this->params()),
106 "Generate and print debug" 107 " information for adaption",
109 INT_MEMBER(bidi_debug, 0,
"Debug level for BiDi", this->params()),
110 INT_MEMBER(applybox_debug, 1,
"Debug level", this->params()),
111 INT_MEMBER(applybox_page, 0,
"Page number to apply boxes from",
114 "Exposure value follows" 115 " this pattern in the image filename. The name of the image" 116 " files are expected to be in the form" 117 " [lang].[fontname].exp[num].tif",
119 BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
120 "Learn both character fragments (as is done in the" 121 " special low exposure mode) as well as unfragmented" 126 " is assumed to contain ngrams. Only learn the ngrams" 127 " whose outlines overlap horizontally.",
129 BOOL_MEMBER(tessedit_display_outwords, false,
"Draw output words",
131 BOOL_MEMBER(tessedit_dump_choices, false,
"Dump char choices",
133 BOOL_MEMBER(tessedit_timing_debug, false,
"Print timing stats",
136 "Try to improve fuzzy spaces", this->params()),
138 "Don't bother with word plausibility", this->params()),
139 BOOL_MEMBER(tessedit_fix_hyphens, true,
"Crunch double hyphens?",
141 BOOL_MEMBER(tessedit_redo_xheight, true,
"Check/Correct x-height",
144 "Add words to the document dictionary", this->params()),
145 BOOL_MEMBER(tessedit_debug_fonts, false,
"Output font info per char",
147 BOOL_MEMBER(tessedit_debug_block_rejection, false,
"Block and Row stats",
149 BOOL_MEMBER(tessedit_enable_bigram_correction, true,
150 "Enable correction based on the word bigram dictionary.",
152 BOOL_MEMBER(tessedit_enable_dict_correction, false,
153 "Enable single word correction based on the dictionary.",
156 "Amount of debug output for bigram correction.",
159 "Remove and conditionally reassign small outlines when they" 160 " confuse layout analysis, determining diacritics vs noise",
162 INT_MEMBER(debug_noise_removal, 0,
"Debug reassignment of small outlines",
168 "Hingepoint for base char certainty", this->params()),
172 "Hingepoint for disjoint certainty", this->params()),
176 "Threshold for new punc char certainty", this->params()),
179 "Scaling on certainty diff from Hingepoint",
181 INT_MEMBER(noise_maxperblob, 8,
"Max diacritics to apply to a blob",
183 INT_MEMBER(noise_maxperword, 16,
"Max diacritics to apply to a word",
185 INT_MEMBER(debug_x_ht_level, 0,
"Reestimate debug", this->params()),
186 BOOL_MEMBER(debug_acceptable_wds, false,
"Dump word pass/fail chk",
188 STRING_MEMBER(chs_leading_punct,
"('`\"",
"Leading punctuation",
190 STRING_MEMBER(chs_trailing_punct1,
").,;:?!",
"1st Trailing punctuation",
192 STRING_MEMBER(chs_trailing_punct2,
")'`\"",
"2nd Trailing punctuation",
195 "good_quality_doc lte rejection limit", this->params()),
197 "good_quality_doc gte good blobs limit", this->params()),
199 "good_quality_doc lte outline error limit", this->params()),
201 "good_quality_doc gte good char limit", this->params()),
202 INT_MEMBER(quality_min_initial_alphas_reqd, 2,
"alphas in a good word",
205 "Adaptation decision algorithm for tess", this->params()),
207 "Do minimal rejection on pass 1 output", this->params()),
208 BOOL_MEMBER(tessedit_test_adaption, false,
"Test adaption criteria",
210 BOOL_MEMBER(tessedit_matcher_log, false,
"Log matcher activity",
213 "Adaptation decision algorithm for tess", this->params()),
214 BOOL_MEMBER(test_pt, false,
"Test for point", this->params()),
215 double_MEMBER(test_pt_x, 99999.99,
"xcoord", this->params()),
216 double_MEMBER(test_pt_y, 99999.99,
"ycoord", this->params()),
217 INT_MEMBER(multilang_debug_level, 0,
"Print multilang debug info.",
219 INT_MEMBER(paragraph_debug_level, 0,
"Print paragraph debug info.",
222 "Run paragraph detection on the post-text-recognition " 226 "Use ratings matrix/beam search with lstm", this->params()),
227 STRING_MEMBER(outlines_odd,
"%| ",
"Non standard number of outlines",
229 STRING_MEMBER(outlines_2,
"ij!?%\":;",
"Non standard number of outlines",
232 "Allow outline errs in unrejection?", this->params()),
234 "Reduce rejection on good docs", this->params()),
235 BOOL_MEMBER(tessedit_use_reject_spaces, true,
"Reject spaces?",
238 "%rej allowed before rej whole doc", this->params()),
240 "%rej allowed before rej whole block", this->params()),
242 "%rej allowed before rej whole row", this->params()),
244 "Number of row rejects in whole word rejects" 245 "which prevents whole row rejection",
247 BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
248 "Only rej partially rejected words in block rejection",
250 BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,
251 "Only rej partially rejected words in row rejection",
254 "Use word segmentation quality metric", this->params()),
256 "Use word segmentation quality metric", this->params()),
258 "Only preserve wds longer than this", this->params()),
260 "Apply row rejection to good docs", this->params()),
262 "rej good doc wd if more than this fraction rejected",
265 "Reject all bad quality wds", this->params()),
266 BOOL_MEMBER(tessedit_debug_doc_rejection, false,
"Page stats",
269 "Output data to debug file", this->params()),
270 BOOL_MEMBER(bland_unrej, false,
"unrej potential with no checks",
273 "good_quality_doc gte good char limit", this->params()),
275 "Mark v.bad words for tilde crunch", this->params()),
276 BOOL_MEMBER(hocr_font_info, false,
"Add font info to hocr output",
278 BOOL_MEMBER(crunch_early_merge_tess_fails, true,
"Before word crunch?",
280 BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
281 "Take out ~^ early?", this->params()),
282 double_MEMBER(crunch_terrible_rating, 80.0,
"crunch rating lt this",
284 BOOL_MEMBER(crunch_terrible_garbage, true,
"As it says", this->params()),
286 "crunch garbage cert lt this", this->params()),
288 "crunch garbage rating lt this", this->params()),
289 double_MEMBER(crunch_pot_poor_rate, 40,
"POTENTIAL crunch rating lt this",
291 double_MEMBER(crunch_pot_poor_cert, -8.0,
"POTENTIAL crunch cert lt this",
293 BOOL_MEMBER(crunch_pot_garbage, true,
"POTENTIAL crunch garbage",
295 double_MEMBER(crunch_del_rating, 60,
"POTENTIAL crunch rating lt this",
297 double_MEMBER(crunch_del_cert, -10.0,
"POTENTIAL crunch cert lt this",
299 double_MEMBER(crunch_del_min_ht, 0.7,
"Del if word ht lt xht x this",
301 double_MEMBER(crunch_del_max_ht, 3.0,
"Del if word ht gt xht x this",
304 "Del if word width lt xht x this", this->params()),
306 "Del if word gt xht x this above bl", this->params()),
308 "Del if word gt xht x this below bl", this->params()),
309 double_MEMBER(crunch_small_outlines_size, 0.6,
"Small if lt xht x this",
311 INT_MEMBER(crunch_rating_max, 10,
"For adj length in rating per ch",
314 "How many potential indicators needed", this->params()),
315 BOOL_MEMBER(crunch_leave_ok_strings, true,
"Don't touch sensible strings",
317 BOOL_MEMBER(crunch_accept_ok, true,
"Use acceptability in okstring",
320 "Don't pot crunch sensible strings", this->params()),
321 BOOL_MEMBER(crunch_include_numerals, false,
"Fiddle alpha figures",
324 "Don't crunch words with long lower case strings",
327 "Don't crunch words with long lower case strings",
330 "Crunch words with long repetitions", this->params()),
331 INT_MEMBER(crunch_debug, 0,
"As it says", this->params()),
333 "How many non-noise blbs either side?", this->params()),
334 double_MEMBER(fixsp_small_outlines_size, 0.28,
"Small if lt xht x this",
337 "Reward punctation joins", this->params()),
338 INT_MEMBER(fixsp_done_mode, 1,
"What constitues done for spacing",
340 INT_MEMBER(debug_fix_space_level, 0,
"Contextual fixspace debug",
343 "Punct. chs expected WITHIN numbers", this->params()),
345 "Max allowed deviation of blob top outside of font data",
348 "Min change in xht before actually trying it", this->params()),
350 "Debug level for sub & superscript fixer", this->params()),
352 superscript_worse_certainty, 2.0,
353 "How many times worse " 354 "certainty does a superscript position glyph need to be for " 355 "us to try classifying it as a char with a different " 359 superscript_bettered_certainty, 0.97,
361 "badness do we think sufficient to choose a superscript " 362 "over what we'd thought. For example, a value of 0.6 means " 363 "we want to reduce badness of certainty by at least 40%",
366 "A superscript scaled down more than this is unbelievably " 367 "small. For example, 0.3 means we expect the font size to " 368 "be no smaller than 30% of the text line font size.",
371 "Maximum top of a character measured as a multiple of " 372 "x-height above the baseline for us to reconsider whether " 376 "Minimum bottom of a character measured as a multiple of " 377 "x-height above the baseline for us to reconsider whether " 378 "it's a superscript.",
380 BOOL_MEMBER(tessedit_write_block_separators, false,
381 "Write block separators in output", this->params()),
382 BOOL_MEMBER(tessedit_write_rep_codes, false,
"Write repetition char code",
384 BOOL_MEMBER(tessedit_write_unlv, false,
"Write .unlv output file",
386 BOOL_MEMBER(tessedit_create_txt, false,
"Write .txt output file",
388 BOOL_MEMBER(tessedit_create_hocr, false,
"Write .html hOCR output file",
390 BOOL_MEMBER(tessedit_create_tsv, false,
"Write .tsv output file",
392 BOOL_MEMBER(tessedit_create_pdf, false,
"Write .pdf output file",
395 "Create PDF with only one invisible text layer",
398 "Output char for unidentified blobs", this->params()),
399 INT_MEMBER(suspect_level, 99,
"Suspect marker level", this->params()),
401 "Min suspect level for rejecting spaces", this->params()),
403 "Don't suspect dict wds longer than this", this->params()),
404 BOOL_MEMBER(suspect_constrain_1Il, false,
"UNLV keep 1Il chars rejected",
407 "Don't touch bad rating limit", this->params()),
408 double_MEMBER(suspect_accept_rating, -999.9,
"Accept good rating limit",
411 "Only reject tess failures", this->params()),
412 BOOL_MEMBER(tessedit_zero_rejection, false,
"Don't reject ANYTHING",
415 "Make output have exactly one word per WERD", this->params()),
417 "Don't reject ANYTHING AT ALL", this->params()),
419 "Force all rep chars the same", this->params()),
420 INT_MEMBER(tessedit_reject_mode, 0,
"Rejection algorithm",
422 BOOL_MEMBER(tessedit_rejection_debug, false,
"Adaption debug",
424 BOOL_MEMBER(tessedit_flip_0O, true,
"Contextual 0O O0 flips",
427 "Aspect ratio dot/hyphen test", this->params()),
429 "Aspect ratio dot/hyphen test", this->params()),
431 "Use DOC dawg in 11l conf. detector", this->params()),
432 BOOL_MEMBER(rej_1Il_use_dict_word, false,
"Use dictword test",
434 BOOL_MEMBER(rej_1Il_trust_permuter_type, true,
"Don't double check",
436 BOOL_MEMBER(rej_use_tess_accepted, true,
"Individual rejection control",
438 BOOL_MEMBER(rej_use_tess_blanks, true,
"Individual rejection control",
440 BOOL_MEMBER(rej_use_good_perm, true,
"Individual rejection control",
442 BOOL_MEMBER(rej_use_sensible_wd, false,
"Extend permuter check",
444 BOOL_MEMBER(rej_alphas_in_number_perm, false,
"Extend permuter check",
447 "if >this fract", this->params()),
448 INT_MEMBER(tessedit_image_border, 2,
"Rej blbs near image edge limit",
451 "Allow NN to unrej", this->params()),
452 STRING_MEMBER(conflict_set_I_l_1,
"Il1[]",
"Il1 conflict set",
454 INT_MEMBER(min_sane_x_ht_pixels, 8,
"Reject any x-ht lt or eq than this",
456 BOOL_MEMBER(tessedit_create_boxfile, false,
"Output text with boxes",
460 " , else specific page to process",
463 "Capture the image from the IPE", this->params()),
464 BOOL_MEMBER(interactive_display_mode, false,
"Run interactively?",
466 STRING_MEMBER(file_type,
".tif",
"Filename extension", this->params()),
467 BOOL_MEMBER(tessedit_override_permuter, true,
"According to dict_word",
470 "List of languages to load with this one", this->params()),
471 BOOL_MEMBER(tessedit_use_primary_params_model, false,
472 "In multilingual mode use params model of the" 476 "Min acceptable orientation margin", this->params()),
477 BOOL_MEMBER(textord_tabfind_show_vlines, false,
"Debug line finding",
482 "Allow feature extractors to see the original outline",
485 "Only initialize with the config file. Useful if the " 486 "instance is not going to be used for OCR but say only " 487 "for layout analysis.",
489 BOOL_MEMBER(textord_equation_detect, false,
"Turn on equation detector",
492 "Enable vertical detection", this->params()),
493 BOOL_MEMBER(textord_tabfind_force_vertical_text, false,
494 "Force using vertical text page mode", this->params()),
496 textord_tabfind_vertical_text_ratio, 0.5,
497 "Fraction of textlines deemed vertical to use vertical page " 501 textord_tabfind_aligned_gap_fraction, 0.75,
502 "Fraction of height used as a minimum gap for aligned blobs.",
504 INT_MEMBER(tessedit_parallelize, 0,
"Run in parallel where possible",
507 "Preserve multiple interword spaces", this->params()),
509 "Include page separator string in output text after each " 513 "Page separator (default is form feed control character)",
525 BOOL_MEMBER(textord_tabfind_vertical_horizontal_mix, true,
526 "find horizontal lines such as headers in vertical page mode",
528 INT_MEMBER(tessedit_ok_mode, 5,
"Acceptance decision algorithm",
531 "Load fixed length dawgs" 532 " (e.g. for non-space delimited languages)",
534 INT_MEMBER(segment_debug, 0,
"Debug the whole segmentation process",
536 BOOL_MEMBER(permute_debug, 0,
"Debug char permutation process",
539 "Multiplying factor of" 540 " current best rate to prune other hypotheses",
543 "Turn on word script consistency permuter", this->params()),
545 "incorporate segmentation cost in word rating?",
548 "Score multipler for script consistency within a word. " 549 "Being a 'reward' factor, it should be <= 1. " 550 "Smaller value implies bigger reward.",
553 "Turn on fixed-length phrasebook search permuter",
556 "Turn on character type (property) consistency permuter",
559 "Score multipler for char type consistency within a word. ",
562 "Score multipler for ngram permuter's best choice" 563 " (only used in the Han script path).",
566 "Activate character-level n-gram-based permuter",
568 BOOL_MEMBER(permute_only_top, false,
"Run only the top choice permuter",
570 INT_MEMBER(language_model_fixed_length_choices_depth, 3,
571 "Depth of blob choice lists to explore" 572 " when fixed length dawgs are on",
575 "use new state cost heuristics for segmentation state" 579 "base factor for adding segmentation cost into word rating." 580 "It's a multiplying factor, the larger the value above 1, " 581 "the bigger the effect of segmentation cost.",
584 "weight associated with char rating in combined cost of" 588 "weight associated with width evidence in combined cost of" 592 "weight associated with seam cut in combined cost of state",
595 "max char width-to-height ratio allowed in segmentation",
598 "Enable new segmentation search path.", this->params()),
600 "Maximum character width-to-height ratio for" 601 " fixed-pitch fonts",
605 backup_config_file_(NULL),
609 pix_thresholds_(NULL),
610 source_resolution_(0),
612 right_to_left_(false),
617 most_recently_used_(this),
620 #ifndef ANDROID_BUILD
621 lstm_recognizer_(NULL),
623 train_line_page_num_(0) {
628 pixDestroy(&pix_original_);
630 sub_langs_.delete_data_pointers();
631 #ifndef ANDROID_BUILD 632 delete lstm_recognizer_;
633 lstm_recognizer_ = NULL;
640 pixDestroy(&pix_binary_);
641 pixDestroy(&pix_grey_);
642 pixDestroy(&pix_thresholds_);
643 pixDestroy(&scaled_color_);
644 deskew_ =
FCOORD(1.0f, 0.0f);
645 reskew_ =
FCOORD(1.0f, 0.0f);
648 for (
int i = 0; i < sub_langs_.size(); ++i)
649 sub_langs_[i]->
Clear();
653 equ_detect_ = detector;
660 for (
int i = 0; i < sub_langs_.size(); ++i) {
661 sub_langs_[i]->ResetAdaptiveClassifierInternal();
668 for (
int i = 0; i < sub_langs_.size(); ++i) {
669 sub_langs_[i]->getDict().ResetDocumentDictionary();
679 for (
int i = 0; i < sub_langs_.size(); ++i) {
680 sub_langs_[i]->unicharset.set_black_and_whitelist(
694 for (
int i = 0; i < sub_langs_.size(); ++i) {
697 static_cast<inT32>(sub_langs_[i]->pageseg_devanagari_split_strategy));
698 if (pageseg_strategy > max_pageseg_strategy)
699 max_pageseg_strategy = pageseg_strategy;
700 pixDestroy(&sub_langs_[i]->pix_binary_);
701 sub_langs_[i]->pix_binary_ = pixClone(
pix_binary());
707 if (splitter_.
Split(
true, &pixa_debug_)) {
709 pixDestroy(&pix_binary_);
725 for (
int i = 0; i < sub_langs_.size(); ++i) {
728 static_cast<inT32>(sub_langs_[i]->ocr_devanagari_split_strategy));
729 if (ocr_strategy > max_ocr_strategy)
730 max_ocr_strategy = ocr_strategy;
736 bool split_for_ocr = splitter_.
Split(
false, &pixa_debug_);
739 pixDestroy(&pix_binary_);
740 pix_binary_ = pixClone(splitter_.
orig_pix());
745 BLOCK block(
"",
TRUE, 0, 0, 0, 0, pixGetWidth(pix_binary_),
746 pixGetHeight(pix_binary_));
void ResetAdaptiveClassifierInternal()
void set_segmentation_block_list(BLOCK_LIST *block_list)
void set_use_cjk_fp_model(bool flag)
int pageseg_devanagari_split_strategy
#define double_MEMBER(name, val, comment, vec)
void set_ocr_split_strategy(SplitStrategy strategy)
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
void ResetAdaptiveClassifier()
void SetEquationDetect(EquationDetect *detector)
Assume a single uniform block of text. (Default.)
const char * string() const
bool textord_use_cjk_fp_model
char * tessedit_char_whitelist
void PrepareForTessOCR(BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
void ResetDocumentDictionary()
void set_pageseg_split_strategy(SplitStrategy strategy)
bool HasDifferentSplitStrategies() const
char * tessedit_char_blacklist
C_BLOB_LIST * blob_list()
get blobs
void WritePDF(const char *filename)
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
#define INT_INIT_MEMBER(name, val, comment, vec)
void ResetDocumentDictionary()
bool Split(bool split_for_pageseg, DebugPixa *pixa_debug)
void SetBlackAndWhitelist()
#define STRING_MEMBER(name, val, comment, vec)
void set_orig_pix(Pix *pix)
#define BOOL_INIT_MEMBER(name, val, comment, vec)
void extract_edges(Pix *pix, BLOCK *block)
void SetLangTesseract(Tesseract *lang_tesseract)
#define BOOL_MEMBER(name, val, comment, vec)
char * tessedit_char_unblacklist
#define INT_MEMBER(name, val, comment, vec)
int ocr_devanagari_split_strategy