21 #pragma warning(disable:4244) // Conversion warnings 104 *accepted_match_count = 0;
132 int expected_outline_count;
134 if (
STRING (outlines_odd).contains (c))
136 else if (
STRING (outlines_2).contains (c))
137 expected_outline_count = 2;
139 expected_outline_count = 1;
140 return abs (outline_count - expected_outline_count);
144 BOOL8 good_quality_doc) {
145 if ((tessedit_good_quality_unrej && good_quality_doc))
146 unrej_good_quality_words(page_res_it);
147 doc_and_block_rejection(page_res_it, good_quality_doc);
148 if (unlv_tilde_crunching) {
149 tilde_crunch(page_res_it);
150 tilde_delete(page_res_it);
173 while (page_res_it.
word () != NULL) {
174 check_debug_pt (page_res_it.
word (), 100);
176 word = page_res_it.
word ();
178 if (word->
reject_map[i].accept_if_good_quality ())
186 quality_rowrej_pc)) {
187 word = page_res_it.
word ();
189 (tessedit_unrej_any_wd ||
190 acceptable_word_string(*word->
uch_set,
194 unrej_good_chs(word, page_res_it.
row ()->
row);
200 current_row = page_res_it.
row ();
201 while ((page_res_it.
word () != NULL) &&
202 (page_res_it.
row () == current_row))
205 check_debug_pt (page_res_it.
word (), 110);
210 current_block = NULL;
212 while (page_res_it.
word () != NULL) {
213 if (current_block != page_res_it.
block ()) {
214 current_block = page_res_it.
block ();
218 if (current_row != page_res_it.
row ()) {
219 current_row = page_res_it.
row ();
239 BOOL8 good_quality_doc) {
246 BOOL8 prev_word_rejected;
247 inT16 char_quality = 0;
248 inT16 accepted_char_quality;
253 if (tessedit_debug_doc_rejection) {
254 tprintf(
"REJECT ALL #chars: %d #Rejects: %d; \n",
259 if (tessedit_debug_doc_rejection) {
260 tprintf(
"NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
269 while ((word = page_res_it.
word()) != NULL) {
270 current_block = page_res_it.
block();
274 tessedit_reject_block_percent) {
275 if (tessedit_debug_block_rejection) {
276 tprintf(
"REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
280 prev_word_rejected =
FALSE;
281 while ((word = page_res_it.
word()) != NULL &&
282 (page_res_it.
block() == current_block)) {
283 if (tessedit_preserve_blk_rej_perfect_wds) {
286 if (rej_word && tessedit_dont_blkrej_good_wds &&
288 acceptable_word_string(
293 word_char_quality(word, page_res_it.
row()->
row,
295 &accepted_char_quality);
307 if (tessedit_use_reject_spaces &&
308 prev_word_rejected &&
314 prev_word_rejected = rej_word;
318 if (tessedit_debug_block_rejection) {
319 tprintf(
"NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
326 while (page_res_it.
word() != NULL &&
327 page_res_it.
block() == current_block) {
328 current_row = page_res_it.
row();
337 tessedit_reject_row_percent &&
340 tessedit_whole_wd_rej_row_percent) {
341 if (tessedit_debug_block_rejection) {
342 tprintf(
"REJECTING ROW %d #chars: %d; #Rejects: %d\n",
346 prev_word_rejected =
FALSE;
347 while ((word = page_res_it.
word()) != NULL &&
348 page_res_it.
row () == current_row) {
350 if (!tessedit_row_rej_good_docs && good_quality_doc) {
353 tessedit_good_doc_still_rowrej_wd;
354 }
else if (tessedit_preserve_row_rej_perfect_wds) {
358 if (rej_word && tessedit_dont_rowrej_good_wds &&
360 acceptable_word_string(*word->
uch_set,
364 word_char_quality(word, page_res_it.
row()->
row,
366 &accepted_char_quality);
378 if (tessedit_use_reject_spaces &&
379 prev_word_rejected &&
385 prev_word_rejected = rej_word;
389 if (tessedit_debug_block_rejection) {
390 tprintf(
"NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
393 while (page_res_it.
word() != NULL &&
394 page_res_it.
row() == current_row)
413 while (page_res_it.
word () != NULL) {
431 while (page_res_it.
word() != NULL) {
433 if (pb != NULL && !pb->
IsText()) {
437 word = page_res_it.
word();
439 if (crunch_early_convert_bad_unlv_chs)
440 convert_bad_unlv_chs(word);
442 if (crunch_early_merge_tess_fails)
446 found_terrible_word =
FALSE;
448 prev_potential_marked =
FALSE;
451 ok_dict_word = safe_dict_word(word);
452 garbage_level = garbage_word (word, ok_dict_word);
455 (terrible_word_crunch (word, garbage_level))) {
456 if (crunch_debug > 0) {
457 tprintf (
"T CRUNCHING: \"%s\"\n",
461 if (prev_potential_marked) {
463 if (crunch_debug > 0) {
464 tprintf (
"P1 CRUNCHING: \"%s\"\n",
470 prev_potential_marked =
FALSE;
472 found_terrible_word =
TRUE;
475 (potential_word_crunch (word,
476 garbage_level, ok_dict_word))) {
477 if (found_terrible_word) {
478 if (crunch_debug > 0) {
479 tprintf (
"P2 CRUNCHING: \"%s\"\n",
484 else if (!prev_potential_marked) {
485 copy_it = page_res_it;
486 prev_potential_marked =
TRUE;
487 if (crunch_debug > 1) {
488 tprintf (
"P3 CRUNCHING: \"%s\"\n",
494 found_terrible_word =
FALSE;
496 prev_potential_marked =
FALSE;
497 if (crunch_debug > 2) {
498 tprintf (
"NO CRUNCH: \"%s\"\n",
520 if (adjusted_len > crunch_rating_max)
521 adjusted_len = crunch_rating_max;
524 if (rating_per_ch > crunch_terrible_rating)
526 else if (crunch_terrible_garbage && (garbage_level ==
G_TERRIBLE))
529 (garbage_level !=
G_OK))
531 else if ((rating_per_ch > crunch_poor_garbage_rate) &&
532 (garbage_level !=
G_OK))
535 if (crunch_mode > 0) {
536 if (crunch_debug > 2) {
537 tprintf (
"Terrible_word_crunch (%d) on \"%s\"\n",
548 BOOL8 ok_dict_word) {
553 BOOL8 word_crunchable;
554 int poor_indicator_count = 0;
556 word_crunchable = !crunch_leave_accept_strings ||
558 (acceptable_word_string(*word->
uch_set,
563 if (adjusted_len > 10)
567 if (rating_per_ch > crunch_pot_poor_rate) {
568 if (crunch_debug > 2) {
569 tprintf(
"Potential poor rating on \"%s\"\n",
572 poor_indicator_count++;
575 if (word_crunchable &&
577 if (crunch_debug > 2) {
578 tprintf(
"Potential poor cert on \"%s\"\n",
581 poor_indicator_count++;
584 if (garbage_level !=
G_OK) {
585 if (crunch_debug > 2) {
586 tprintf(
"Potential garbage on \"%s\"\n",
589 poor_indicator_count++;
591 return poor_indicator_count >= crunch_pot_indicators;
599 inT16 debug_delete_mode;
601 inT16 x_debug_delete_mode;
605 while (page_res_it.
word() != NULL) {
606 word = page_res_it.
word();
608 delete_mode = word_deletable (word, debug_delete_mode);
611 if (crunch_debug > 0) {
612 tprintf (
"BOL CRUNCH DELETING(%d): \"%s\"\n",
617 deleting_from_bol =
TRUE;
619 if (marked_delete_point) {
621 x_delete_mode = word_deletable (copy_it.
word (),
622 x_debug_delete_mode);
623 if (crunch_debug > 0) {
624 tprintf (
"EOL CRUNCH DELETING(%d): \"%s\"\n",
632 if (crunch_debug > 0) {
633 tprintf (
"EOL CRUNCH DELETING(%d): \"%s\"\n",
638 deleting_from_bol =
FALSE;
639 marked_delete_point =
FALSE;
642 if (!marked_delete_point) {
643 copy_it = page_res_it;
644 marked_delete_point =
TRUE;
649 deleting_from_bol =
FALSE;
651 marked_delete_point =
FALSE;
657 if (!crunch_early_merge_tess_fails)
699 int isolated_digits = 0;
700 int isolated_alphas = 0;
701 int bad_char_count = 0;
706 int alpha_repetition_count = 0;
707 int longest_alpha_repetition_count = 0;
708 int longest_lower_run_len = 0;
709 int lower_string_count = 0;
710 int longest_upper_run_len = 0;
711 int upper_string_count = 0;
712 int total_alpha_count = 0;
713 int total_digit_count = 0;
715 for (; *str !=
'\0'; str += *(lengths++)) {
720 case SUBSEQUENT_UPPER:
722 state = SUBSEQUENT_UPPER;
723 upper_string_count++;
724 if (longest_upper_run_len < upper_string_count)
725 longest_upper_run_len = upper_string_count;
727 alpha_repetition_count++;
728 if (longest_alpha_repetition_count < alpha_repetition_count) {
729 longest_alpha_repetition_count = alpha_repetition_count;
734 alpha_repetition_count = 1;
742 alpha_repetition_count = 1;
743 upper_string_count = 1;
750 case SUBSEQUENT_LOWER:
752 state = SUBSEQUENT_LOWER;
753 lower_string_count++;
754 if (longest_lower_run_len < lower_string_count)
755 longest_lower_run_len = lower_string_count;
757 alpha_repetition_count++;
758 if (longest_alpha_repetition_count < alpha_repetition_count) {
759 longest_alpha_repetition_count = alpha_repetition_count;
764 alpha_repetition_count = 1;
772 alpha_repetition_count = 1;
773 lower_string_count = 1;
781 state = SUBSEQUENT_NUM;
793 if (*lengths == 1 && *str ==
' ')
822 if (crunch_include_numerals) {
823 total_alpha_count += total_digit_count - isolated_digits;
826 if (crunch_leave_ok_strings && len >= 4 &&
827 2 * (total_alpha_count - isolated_alphas) > len &&
828 longest_alpha_repetition_count < crunch_long_repetitions) {
829 if ((crunch_accept_ok &&
830 acceptable_word_string(*word->
uch_set, str, lengths) !=
832 longest_lower_run_len > crunch_leave_lc_strings ||
833 longest_upper_run_len > crunch_leave_uc_strings)
837 strpbrk(str,
" ") == NULL &&
842 acceptable_word_string(*word->
uch_set, str, lengths) !=
846 ok_chars = len - bad_char_count - isolated_digits -
847 isolated_alphas - tess_rejs;
849 if (crunch_debug > 3) {
850 tprintf(
"garbage_word: \"%s\"\n",
852 tprintf(
"LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
854 bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
856 if (bad_char_count == 0 &&
858 (len > isolated_digits + isolated_alphas || len <= 2))
861 if (tess_rejs > ok_chars ||
862 (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
866 dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
868 if (dodgy_chars > 5 || (dodgy_chars / (
float) len) > 0.5)
873 dodgy_chars = 2 * tess_rejs + bad_char_count;
874 if ((len == 4 && dodgy_chars > 2) ||
875 (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
928 if ((failure_count (word) * 1.5) > word_len) {
940 if (rating_per_ch > crunch_del_rating) {
974 for (; *str !=
'\0'; str++) {
984 inT16 outline_count = 0;
985 inT16 small_outline_count = 0;
987 float small_limit =
kBlnXHeight * crunch_small_outlines_size;
989 for (
int b = 0; b < word->
NumBlobs(); ++b) {
993 box = ol->bounding_box();
995 max_dimension = box.
height();
997 max_dimension = box.
width();
998 if (max_dimension < small_limit)
999 small_outline_count++;
1002 return small_outline_count >= outline_count;
void AcceptIfGoodQuality(int index)
void convert_bad_unlv_chs(WERD_RES *word_res)
UNICHAR_ID unichar_id(int index) const
void rej_word_block_rej()
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
WERD_CHOICE * best_choice
void unrej_good_chs(WERD_RES *word, ROW *row)
const STRING & unichar_lengths() const
ROW_RES * prev_row() const
void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
const char * string() const
const int kBlnBaselineOffset
inT16 word_blob_quality(WERD_RES *word, ROW *row)
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
BOOL8 potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word)
BOOL8 flag(WERD_FLAGS mask) const
WERD_RES * restart_page()
CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode)
inT32 whole_word_rej_count
bool get_isdigit(UNICHAR_ID unichar_id) const
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word)
BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
void tilde_crunch(PAGE_RES_IT &page_res_it)
inT16 accepted_match_count
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
CRUNCH_MODE unlv_crunch_mode
BOOL8 quality_recoverable_rejects()
void tilde_delete(PAGE_RES_IT &page_res_it)
void set_unichar_id(UNICHAR_ID unichar_id, int index)
const STRING & unichar_string() const
GenericVector< TBLOB * > blobs
tesseract::BoxWord * bln_boxes
POLY_BLOCK * poly_block() const
inT16 word_outline_errs(WERD_RES *word)
DocQualCallbacks(WERD_RES *word0)
inT16 count_outline_errs(char c, inT16 outline_count)
BOOL8 noise_outlines(TWERD *word)
uinT32 unsigned_size() const
bool get_isupper(UNICHAR_ID unichar_id) const
void CountMatchingBlobs(int index)
const UNICHARSET * uch_set
bool get_islower(UNICHAR_ID unichar_id) const
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
inT16 failure_count(WERD_RES *word)
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
void CountAcceptedBlobs(int index)
TBOX bounding_box() const
BLOCK_RES * block() const
void reject_whole_page(PAGE_RES_IT &page_res_it)