tesseract/a00062_source.html

 /******************************************************************
  * File:        docqual.cpp  (Formerly docqual.c)
  * Description: Document Quality Metrics
  * Author:    Phil Cheatle
  * Created:   Mon May  9 11:27:28 BST 1994
  *
  * (C) Copyright 1994, Hewlett-Packard Ltd.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  **********************************************************************/

 #ifdef _MSC_VER
 #pragma warning(disable:4244)  // Conversion warnings
 #endif

 #include          <ctype.h>
 #include          "docqual.h"
 #include          "reject.h"
 #include          "tesscallback.h"
 #include          "tessvars.h"
 #include          "globals.h"
 #include          "tesseractclass.h"

 namespace tesseract{

 // A little class to provide the callbacks as we have no pre-bound args.
 struct DocQualCallbacks {
   // Binds the callbacks to word0 and starts both counters at zero.
   explicit DocQualCallbacks(WERD_RES* word0)
       : word(word0), match_count(0), accepted_match_count(0) {
   }

   // Tallies one matched blob; the blob index itself is not needed.
   void CountMatchingBlobs(int /*index*/) {
     ++match_count;
   }

   // Tallies one matched blob and, separately, whether the reject map entry
   // at that index is currently accepted.
   void CountAcceptedBlobs(int index) {
     ++match_count;
     if (word->reject_map[index].accepted()) {
       ++accepted_match_count;
     }
   }

   // Marks the reject map entry at index as quality-accepted when the entry
   // reports accept_if_good_quality(); otherwise leaves it untouched.
   void AcceptIfGoodQuality(int index) {
     if (!word->reject_map[index].accept_if_good_quality()) {
       return;
     }
     word->reject_map[index].setrej_quality_accept();
   }

   WERD_RES* word;              // Word whose reject map is read/updated.
   inT16 match_count;           // Incremented by both Count* callbacks.
   inT16 accepted_match_count;  // Incremented by CountAcceptedBlobs only.
 };

 /*************************************************************************
  * word_blob_quality()
  * How many blobs in the box_word are identical to those of the inword?
  * ASSUME blobs in both initial word and box_word are in ascending order of
  * left hand blob edge.
  *************************************************************************/
 inT16 Tesseract::word_blob_quality(WERD_RES *word, ROW *row) {
   // Returns the number of times ProcessMatchedBlobs invoked the counting
   // callback, or 0 when the word has no box word or no rebuilt blobs.
   // The row argument is not used by this function.
   const bool have_boxes = word->bln_boxes != NULL;
   const bool have_blobs =
       word->rebuild_word != NULL && !word->rebuild_word->blobs.empty();
   if (!(have_boxes && have_blobs))
     return 0;

   DocQualCallbacks counter(word);
   word->bln_boxes->ProcessMatchedBlobs(
       *word->rebuild_word,
       NewPermanentTessCallback(&counter,
                                &DocQualCallbacks::CountMatchingBlobs));
   return counter.match_count;
 }

 // Sums count_outline_errs() over every blob of the rebuilt word, pairing
 // blob b with the character at position b of the best choice's
 // unichar_string. Returns 0 when there is no rebuilt word.
 // NOTE(review): the pairing indexes the string by blob number, which
 // presumably assumes one byte per blob - confirm for multi-byte unichars.
 inT16 Tesseract::word_outline_errs(WERD_RES *word) {
   if (word->rebuild_word == NULL)
     return 0;

   inT16 total_errs = 0;
   for (int blob_idx = 0; blob_idx < word->rebuild_word->NumBlobs();
        ++blob_idx) {
     TBLOB* blob = word->rebuild_word->blobs[blob_idx];
     total_errs += count_outline_errs(
         word->best_choice->unichar_string()[blob_idx],
         blob->NumOutlines());
   }
   return total_errs;
 }

 /*************************************************************************
  * word_char_quality()
  * Combination of blob quality and outline quality - how many good chars are
  * there? - I.e chars which pass the blob AND outline tests.
  *************************************************************************/
 void Tesseract::word_char_quality(WERD_RES *word,
                                   ROW *row,
                                   inT16 *match_count,
                                   inT16 *accepted_match_count) {
   // Both outputs are zero unless the word has a box word and a non-empty
   // rebuilt word to compare it with. The row argument is not used.
   *match_count = 0;
   *accepted_match_count = 0;
   if (word->bln_boxes != NULL && word->rebuild_word != NULL &&
       !word->rebuild_word->blobs.empty()) {
     DocQualCallbacks counters(word);
     word->bln_boxes->ProcessMatchedBlobs(
         *word->rebuild_word,
         NewPermanentTessCallback(&counters,
                                  &DocQualCallbacks::CountAcceptedBlobs));
     // match_count: matched blobs; accepted_match_count: matched blobs whose
     // reject map entry is accepted.
     *match_count = counters.match_count;
     *accepted_match_count = counters.accepted_match_count;
   }
 }

 /*************************************************************************
  * unrej_good_chs()
  * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
  *************************************************************************/
 void Tesseract::unrej_good_chs(WERD_RES *word, ROW *row) {
   // Only words with a box word and a non-empty rebuilt word can be matched.
   // The row argument is not used by this function.
   const bool can_match =
       word->bln_boxes != NULL &&
       word->rebuild_word != NULL && !word->rebuild_word->blobs.empty();
   if (can_match) {
     DocQualCallbacks acceptor(word);
     // For every matched blob, accept its reject map entry if the entry
     // reports accept_if_good_quality().
     word->bln_boxes->ProcessMatchedBlobs(
         *word->rebuild_word,
         NewPermanentTessCallback(&acceptor,
                                  &DocQualCallbacks::AcceptIfGoodQuality));
   }
 }

 // Returns how far outline_count is from the number of outlines expected for
 // character c: 2 for characters listed in outlines_2, 1 otherwise.
 // Characters listed in outlines_odd are never scored and always yield 0.
 inT16 Tesseract::count_outline_errs(char c, inT16 outline_count) {
   if (STRING (outlines_odd).contains (c))
     return 0;  // Don't use this char

   const int expected_outlines = STRING (outlines_2).contains (c) ? 2 : 1;
   return abs (outline_count - expected_outlines);
 }

 // Runs the quality-driven rejection passes over the page in a fixed order:
 // optional unrejection of good-quality words, then document/block/row
 // rejection, then (if enabled) the tilde crunch and delete passes.
 void Tesseract::quality_based_rejection(PAGE_RES_IT &page_res_it,
                                         BOOL8 good_quality_doc) {
   // Unrejection only applies to good-quality documents, and only when the
   // tessedit_good_quality_unrej setting allows it.
   if (tessedit_good_quality_unrej && good_quality_doc) {
     unrej_good_quality_words(page_res_it);
   }

   doc_and_block_rejection(page_res_it, good_quality_doc);

   if (!unlv_tilde_crunching) {
     return;
   }
   tilde_crunch(page_res_it);
   tilde_delete(page_res_it);
 }

 /*************************************************************************
  * unrej_good_quality_words()
  * Accept potential rejects in words which pass the following checks:
  *    - Contains a potential reject
  *    - Word looks like a sensible alpha word.
  *    - Word segmentation is the same as the original image
  *    - All characters have the expected number of outlines
  * NOTE - the rejection counts are recalculated after unrejection
  *      - CAN'T do it in a single pass without a bit of fiddling
  *    - keep it simple but inefficient
  *************************************************************************/
 void Tesseract::unrej_good_quality_words(  //unreject potential
                                          PAGE_RES_IT &page_res_it) {
   // ---- Pass 1: unreject recoverable characters, one word at a time. ----
   page_res_it.restart_page ();
   while (page_res_it.word () != NULL) {
     check_debug_pt (page_res_it.word (), 100);

     if (bland_unrej) {
       // Unconditional mode: accept every entry that reports
       // accept_if_good_quality(), without any word or row checks.
       WERD_RES *bland_word = page_res_it.word ();
       for (int pos = 0; pos < bland_word->reject_map.length (); ++pos) {
         if (bland_word->reject_map[pos].accept_if_good_quality ()) {
           bland_word->reject_map[pos].setrej_quality_accept ();
         }
       }
       page_res_it.forward ();
     } else if (page_res_it.row ()->char_count > 0 &&
                page_res_it.row ()->rej_count /
                    static_cast<float>(page_res_it.row ()->char_count) <=
                    quality_rowrej_pc) {
       // The row's reject fraction is within quality_rowrej_pc: try the word
       // if it has recoverable rejects and either any word is allowed or its
       // text is an acceptable word string.
       WERD_RES *candidate = page_res_it.word ();
       if (candidate->reject_map.quality_recoverable_rejects() &&
           (tessedit_unrej_any_wd ||
            acceptable_word_string(
                *candidate->uch_set,
                candidate->best_choice->unichar_string().string(),
                candidate->best_choice->unichar_lengths().string())
                != AC_UNACCEPTABLE)) {
         unrej_good_chs(candidate, page_res_it.row ()->row);
       }
       page_res_it.forward ();
     } else {
       // Row has no characters or too many rejects: skip the rest of it.
       ROW_RES *dodgy_row = page_res_it.row ();
       while (page_res_it.word () != NULL &&
              page_res_it.row () == dodgy_row) {
         page_res_it.forward ();
       }
     }

     check_debug_pt (page_res_it.word (), 110);
   }

   // ---- Pass 2: rebuild the page/block/row reject statistics. ----
   page_res_it.restart_page ();
   page_res_it.page_res->char_count = 0;
   page_res_it.page_res->rej_count = 0;

   BLOCK_RES *seen_block = NULL;
   ROW_RES *seen_row = NULL;
   for (; page_res_it.word () != NULL; page_res_it.forward ()) {
     // Zero a block's counters the first time one of its words is reached.
     if (seen_block != page_res_it.block ()) {
       seen_block = page_res_it.block ();
       seen_block->char_count = 0;
       seen_block->rej_count = 0;
     }
     // Likewise for each row.
     if (seen_row != page_res_it.row ()) {
       seen_row = page_res_it.row ();
       seen_row->char_count = 0;
       seen_row->rej_count = 0;
       seen_row->whole_word_rej_count = 0;
     }
     page_res_it.rej_stat_word ();
   }
 }


 /*************************************************************************
  * doc_and_block_rejection()
  *
  * If the page has too many rejects - reject all of it.
  * If any block has too many rejects - reject all words in the block
  *************************************************************************/

 // Applies rejection at three granularities using the reject statistics
 // stored in the page results:
 //   1. If the page reject percentage exceeds tessedit_reject_doc_percent,
 //      the whole page is rejected via reject_whole_page().
 //   2. Otherwise each block whose reject percentage exceeds
 //      tessedit_reject_block_percent has its words block-rejected.
 //   3. In the remaining blocks, each row that exceeds the row thresholds
 //      has its words row-rejected.
 // page_res_it:      iterator over the page results; it is restarted and
 //                   advanced by this function.
 // good_quality_doc: when TRUE (and tessedit_row_rej_good_docs is off),
 //                   words in a rejected row are only rejected if their own
 //                   reject fraction is high enough.
 void Tesseract::doc_and_block_rejection(  //reject big chunks
                                         PAGE_RES_IT &page_res_it,
                                         BOOL8 good_quality_doc) {
   inT16 block_no = 0;
   inT16 row_no = 0;
   BLOCK_RES *current_block;
   ROW_RES *current_row;

   BOOL8 rej_word;
   BOOL8 prev_word_rejected;
   inT16 char_quality = 0;
   // Filled in by word_char_quality() but not otherwise read here.
   inT16 accepted_char_quality;

   // NOTE(review): unlike the block and row tests below, char_count is not
   // checked for zero here. The division is done in floating point (100.0),
   // so a zero count presumably yields inf/NaN rather than trapping - confirm
   // that this is the intended behaviour for an empty page.
   if (page_res_it.page_res->rej_count * 100.0 /
       page_res_it.page_res->char_count > tessedit_reject_doc_percent) {
     reject_whole_page(page_res_it);
     if (tessedit_debug_doc_rejection) {
       tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
               page_res_it.page_res->char_count,
               page_res_it.page_res->rej_count);
     }
   } else {
     if (tessedit_debug_doc_rejection) {
       tprintf("NO PAGE REJECTION #chars: %d  # Rejects: %d; \n",
               page_res_it.page_res->char_count,
               page_res_it.page_res->rej_count);
     }

     /* Walk blocks testing for block rejection */

     page_res_it.restart_page();
     WERD_RES* word;
     // Each pass of this loop consumes one whole block: both branches below
     // advance the iterator until the block changes or the page ends.
     while ((word = page_res_it.word()) != NULL) {
       current_block = page_res_it.block();
       block_no = current_block->block->index();
       if (current_block->char_count > 0 &&
           (current_block->rej_count * 100.0 / current_block->char_count) >
            tessedit_reject_block_percent) {
         if (tessedit_debug_block_rejection) {
           tprintf("REJECTING BLOCK %d  #chars: %d;  #Rejects: %d\n",
                   block_no, current_block->char_count,
                   current_block->rej_count);
         }
         prev_word_rejected = FALSE;
         while ((word = page_res_it.word()) != NULL &&
                (page_res_it.block() == current_block)) {
           if (tessedit_preserve_blk_rej_perfect_wds) {
             // Candidate for rejection if it has any rejects or is shorter
             // than tessedit_preserve_min_wd_len.
             rej_word = word->reject_map.reject_count() > 0 ||
                 word->reject_map.length () < tessedit_preserve_min_wd_len;
             // Second chance: a long enough, acceptable word string is kept
             // when every character position is counted as a blob match.
             if (rej_word && tessedit_dont_blkrej_good_wds &&
                 word->reject_map.length() >= tessedit_preserve_min_wd_len &&
                 acceptable_word_string(
                     *word->uch_set,
                     word->best_choice->unichar_string().string(),
                     word->best_choice->unichar_lengths().string()) !=
                 AC_UNACCEPTABLE) {
               word_char_quality(word, page_res_it.row()->row,
                                 &char_quality,
                                 &accepted_char_quality);
               rej_word = char_quality !=  word->reject_map.length();
             }
           } else {
             // No preservation of perfect words: reject everything.
             rej_word = TRUE;
           }
           if (rej_word) {
             /*
               Reject spacing if both current and prev words are rejected.
               NOTE - this is NOT restricted to FUZZY spaces. - When tried this
               generated more space errors.
             */
             if (tessedit_use_reject_spaces &&
                 prev_word_rejected &&
                 page_res_it.prev_row() == page_res_it.row() &&
                 word->word->space() == 1)
               word->reject_spaces = TRUE;
             word->reject_map.rej_word_block_rej();
           }
           prev_word_rejected = rej_word;
           page_res_it.forward();
         }
       } else {
         if (tessedit_debug_block_rejection) {
           tprintf("NOT REJECTING BLOCK %d #chars: %d  # Rejects: %d; \n",
                   block_no, page_res_it.block()->char_count,
                   page_res_it.block()->rej_count);
         }

         /* Walk rows in block testing for row rejection */
         // row_no is a 1-based counter within the block, used only in the
         // debug output.
         row_no = 0;
         while (page_res_it.word() != NULL &&
                page_res_it.block() == current_block) {
           current_row = page_res_it.row();
           row_no++;
           /* Reject whole row if:
             fraction of chars on row which are rejected exceed a limit AND
             fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
             limit
           */
           if (current_row->char_count > 0 &&
               (current_row->rej_count * 100.0 / current_row->char_count) >
               tessedit_reject_row_percent &&
               (current_row->whole_word_rej_count * 100.0 /
                   current_row->rej_count) <
               tessedit_whole_wd_rej_row_percent) {
             if (tessedit_debug_block_rejection) {
               tprintf("REJECTING ROW %d  #chars: %d;  #Rejects: %d\n",
                       row_no, current_row->char_count,
                       current_row->rej_count);
             }
             prev_word_rejected = FALSE;
             while ((word = page_res_it.word()) != NULL &&
                    page_res_it.row () == current_row) {
               /* Preserve words on good docs unless they are mostly rejected*/
               if (!tessedit_row_rej_good_docs && good_quality_doc) {
                 // Reject only if the word's own reject fraction exceeds
                 // tessedit_good_doc_still_rowrej_wd.
                 rej_word = word->reject_map.reject_count() /
                     static_cast<float>(word->reject_map.length()) >
                     tessedit_good_doc_still_rowrej_wd;
               } else if (tessedit_preserve_row_rej_perfect_wds) {
                 /* Preserve perfect words anyway */
                 rej_word = word->reject_map.reject_count() > 0 ||
                     word->reject_map.length () < tessedit_preserve_min_wd_len;
                 // Same second chance as in the block case, controlled by
                 // tessedit_dont_rowrej_good_wds.
                 if (rej_word && tessedit_dont_rowrej_good_wds &&
                     word->reject_map.length() >= tessedit_preserve_min_wd_len &&
                     acceptable_word_string(*word->uch_set,
                         word->best_choice->unichar_string().string(),
                         word->best_choice->unichar_lengths().string()) !=
                             AC_UNACCEPTABLE) {
                   word_char_quality(word, page_res_it.row()->row,
                                     &char_quality,
                                     &accepted_char_quality);
                   rej_word = char_quality != word->reject_map.length();
                 }
               } else {
                 rej_word = TRUE;
               }
               if (rej_word) {
                 /*
                   Reject spacing if both current and prev words are rejected.
                   NOTE - this is NOT restricted to FUZZY spaces. - When tried
                   this generated more space errors.
                 */
                 if (tessedit_use_reject_spaces &&
                     prev_word_rejected &&
                     page_res_it.prev_row() == page_res_it.row() &&
                     word->word->space () == 1)
                   word->reject_spaces = TRUE;
                 word->reject_map.rej_word_row_rej();
               }
               prev_word_rejected = rej_word;
               page_res_it.forward();
             }
           } else {
             if (tessedit_debug_block_rejection) {
               tprintf("NOT REJECTING ROW %d #chars: %d  # Rejects: %d; \n",
                       row_no, current_row->char_count, current_row->rej_count);
             }
             // Row is kept: just step past all of its words.
             while (page_res_it.word() != NULL &&
                    page_res_it.row() == current_row)
               page_res_it.forward();
           }
         }
       }
     }
   }
 }

 }  // namespace tesseract

 /*************************************************************************
  * reject_whole_page()
  * Don't believe any of it - set the reject map to 00..00 in all words
  *
  *************************************************************************/

 void reject_whole_page(PAGE_RES_IT &page_res_it) {
   // Document-reject every word on the page, starting from the first one.
   for (page_res_it.restart_page (); page_res_it.word () != NULL;
        page_res_it.forward ()) {
     page_res_it.word ()->reject_map.rej_word_doc_rej ();
   }
   // Record on the page result itself that the whole page is rejected.
   page_res_it.page_res->rejected = TRUE;
 }

 namespace tesseract {
 // Walks every word on the page and marks garbage words for crunching by
 // setting unlv_crunch_mode to CR_KEEP_SPACE.
 // Words with at least one accepted character are never crunched here. Other
 // words are graded with garbage_word() and then classified as:
 //   - terrible: crunched immediately, together with any run of "potential"
 //     words remembered just before it;
 //   - potential: crunched if it follows a terrible word, otherwise its
 //     position is remembered in case a terrible word follows;
 //   - neither: clears both pieces of state.
 // Words in blocks whose poly block is not text are skipped entirely.
 void Tesseract::tilde_crunch(PAGE_RES_IT &page_res_it) {
   WERD_RES *word;
   GARBAGE_LEVEL garbage_level;
   // Saved iterator position of the first word of the current run of
   // potential crunches; only meaningful while prev_potential_marked is TRUE.
   PAGE_RES_IT copy_it;
   BOOL8 prev_potential_marked = FALSE;
   BOOL8 found_terrible_word = FALSE;
   BOOL8 ok_dict_word;

   page_res_it.restart_page();
   while (page_res_it.word() != NULL) {
     // Skip words that belong to a non-text poly block.
     POLY_BLOCK* pb = page_res_it.block()->block->poly_block();
     if (pb != NULL && !pb->IsText()) {
       page_res_it.forward();
       continue;
     }
     word = page_res_it.word();

     if (crunch_early_convert_bad_unlv_chs)
       convert_bad_unlv_chs(word);

     if (crunch_early_merge_tess_fails)
       word->merge_tess_fails();

     // A word with any accepted character breaks both kinds of run.
     if (word->reject_map.accept_count () != 0) {
       found_terrible_word = FALSE;
                                  //Forget earlier potential crunches
       prev_potential_marked = FALSE;
     }
     else {
       ok_dict_word = safe_dict_word(word);
       garbage_level = garbage_word (word, ok_dict_word);

       if ((garbage_level != G_NEVER_CRUNCH) &&
       (terrible_word_crunch (word, garbage_level))) {
         if (crunch_debug > 0) {
           tprintf ("T CRUNCHING: \"%s\"\n",
             word->best_choice->unichar_string().string());
         }
         word->unlv_crunch_mode = CR_KEEP_SPACE;
         if (prev_potential_marked) {
           // Crunch every remembered potential word from the saved position
           // up to (but not including) the current word.
           while (copy_it.word () != word) {
             if (crunch_debug > 0) {
               tprintf ("P1 CRUNCHING: \"%s\"\n",
                 copy_it.word()->best_choice->unichar_string().string());
             }
             copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
             copy_it.forward ();
           }
           prev_potential_marked = FALSE;
         }
         found_terrible_word = TRUE;
       }
       else if ((garbage_level != G_NEVER_CRUNCH) &&
         (potential_word_crunch (word,
       garbage_level, ok_dict_word))) {
         if (found_terrible_word) {
           // Potential word directly after a terrible run: crunch it too.
           if (crunch_debug > 0) {
             tprintf ("P2 CRUNCHING: \"%s\"\n",
               word->best_choice->unichar_string().string());
           }
           word->unlv_crunch_mode = CR_KEEP_SPACE;
         }
         else if (!prev_potential_marked) {
           // First potential word of a run: remember where it starts. It is
           // not crunched yet, only reported at a higher debug level.
           copy_it = page_res_it;
           prev_potential_marked = TRUE;
           if (crunch_debug > 1) {
             tprintf ("P3 CRUNCHING: \"%s\"\n",
               word->best_choice->unichar_string().string());
           }
         }
       }
       else {
         found_terrible_word = FALSE;
                                  //Forget earlier potential crunches
         prev_potential_marked = FALSE;
         if (crunch_debug > 2) {
           tprintf ("NO CRUNCH: \"%s\"\n",
             word->best_choice->unichar_string().string());
         }
       }
     }
     page_res_it.forward ();
   }
 }


 // Decides whether a word is bad enough to be crunched outright.
 // Returns TRUE when any of the following holds:
 //   1. the best choice text is empty or consists only of spaces;
 //   2. rating per character exceeds crunch_terrible_rating;
 //   3. crunch_terrible_garbage is set and garbage_level is G_TERRIBLE;
 //   4. certainty is below crunch_poor_garbage_cert and the word is not G_OK;
 //   5. rating per character exceeds crunch_poor_garbage_rate and the word is
 //      not G_OK.
 // The rule number is only reported in the debug trace.
 BOOL8 Tesseract::terrible_word_crunch(WERD_RES *word,
                                       GARBAGE_LEVEL garbage_level) {
   int rule = 0;

   const bool blank_text =
       (word->best_choice->unichar_string().length () == 0) ||
       (strspn (word->best_choice->unichar_string().string(), " ") ==
        word->best_choice->unichar_string().unsigned_size ());

   if (blank_text) {
     rule = 1;
   } else {
     // Rating is averaged over the reject map length, capped at
     // crunch_rating_max.
     int capped_len = word->reject_map.length ();
     if (capped_len > crunch_rating_max)
       capped_len = crunch_rating_max;
     const float per_char_rating = word->best_choice->rating () / capped_len;

     if (per_char_rating > crunch_terrible_rating) {
       rule = 2;
     } else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE)) {
       rule = 3;
     } else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
                (garbage_level != G_OK)) {
       rule = 4;
     } else if ((per_char_rating > crunch_poor_garbage_rate) &&
                (garbage_level != G_OK)) {
       rule = 5;
     }
   }

   if (rule == 0)
     return FALSE;

   if (crunch_debug > 2) {
     tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
       rule, word->best_choice->unichar_string().string());
   }
   return TRUE;
 }

 // Decides whether a word is a *potential* crunch by counting up to three
 // independent indicators of poor quality:
 //   - rating per character (length capped at 10) above crunch_pot_poor_rate;
 //   - certainty below crunch_pot_poor_cert, counted only for words that are
 //     considered crunchable;
 //   - a garbage_level other than G_OK.
 // Returns TRUE when the count reaches crunch_pot_indicators.
 BOOL8 Tesseract::potential_word_crunch(WERD_RES *word,
                                        GARBAGE_LEVEL garbage_level,
                                        BOOL8 ok_dict_word) {
   const char *text = word->best_choice->unichar_string().string();
   const char *text_lengths = word->best_choice->unichar_lengths().string();

   // A word is crunchable unless acceptable strings are being protected and
   // this one is at least 3 long and is acceptable or a safe dictionary word.
   const BOOL8 crunchable =
       !crunch_leave_accept_strings ||
       word->reject_map.length() < 3 ||
       (acceptable_word_string(*word->uch_set,
                               text, text_lengths) == AC_UNACCEPTABLE &&
        !ok_dict_word);

   int capped_len = word->reject_map.length();
   if (capped_len > 10) {
     capped_len = 10;
   }
   const float per_char_rating = word->best_choice->rating() / capped_len;

   int indicators = 0;

   if (per_char_rating > crunch_pot_poor_rate) {
     if (crunch_debug > 2) {
       tprintf("Potential poor rating on \"%s\"\n",
               word->best_choice->unichar_string().string());
     }
     ++indicators;
   }

   if (crunchable &&
       word->best_choice->certainty() < crunch_pot_poor_cert) {
     if (crunch_debug > 2) {
       tprintf("Potential poor cert on \"%s\"\n",
               word->best_choice->unichar_string().string());
     }
     ++indicators;
   }

   if (garbage_level != G_OK) {
     if (crunch_debug > 2) {
       tprintf("Potential garbage on \"%s\"\n",
               word->best_choice->unichar_string().string());
     }
     ++indicators;
   }

   return indicators >= crunch_pot_indicators;
 }

 // Walks every word on the page and applies the crunch mode returned by
 // word_deletable() to runs of deletable words that touch a row boundary:
 //   - a run starting at a beginning-of-line word is marked word by word;
 //   - a run ending at an end-of-line word is marked retrospectively, from
 //     the remembered start of the run up to the end-of-line word.
 // A deletable run in the middle of a row is only remembered, not marked.
 // Any non-deletable word ends the current run.
 void Tesseract::tilde_delete(PAGE_RES_IT &page_res_it) {
   WERD_RES *word;
   // Saved iterator position of the first word of the current mid-row run;
   // only meaningful while marked_delete_point is TRUE.
   PAGE_RES_IT copy_it;
   BOOL8 deleting_from_bol = FALSE;
   BOOL8 marked_delete_point = FALSE;
   // Rule numbers reported by word_deletable(); used for debug output only.
   inT16 debug_delete_mode;
   CRUNCH_MODE delete_mode;
   inT16 x_debug_delete_mode;
   CRUNCH_MODE x_delete_mode;

   page_res_it.restart_page();
   while (page_res_it.word() != NULL) {
     word = page_res_it.word();

     delete_mode = word_deletable (word, debug_delete_mode);
     if (delete_mode != CR_NONE) {
       if (word->word->flag (W_BOL) || deleting_from_bol) {
         // At, or continuing from, the beginning of a line: mark now.
         if (crunch_debug > 0) {
           tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
             debug_delete_mode,
             word->best_choice->unichar_string().string());
         }
         word->unlv_crunch_mode = delete_mode;
         deleting_from_bol = TRUE;
       } else if (word->word->flag(W_EOL)) {
         if (marked_delete_point) {
           // Mark every remembered word from the saved position up to (but
           // not including) this end-of-line word, re-evaluating each one.
           while (copy_it.word() != word) {
             x_delete_mode = word_deletable (copy_it.word (),
               x_debug_delete_mode);
             if (crunch_debug > 0) {
               tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
                 x_debug_delete_mode,
                 copy_it.word()->best_choice->unichar_string().string());
             }
             copy_it.word ()->unlv_crunch_mode = x_delete_mode;
             copy_it.forward ();
           }
         }
         if (crunch_debug > 0) {
           tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
             debug_delete_mode,
             word->best_choice->unichar_string().string());
         }
         word->unlv_crunch_mode = delete_mode;
         deleting_from_bol = FALSE;
         marked_delete_point = FALSE;
       }
       else {
         // Deletable word in mid-row: remember where the run starts in case
         // it extends to the end of the line.
         if (!marked_delete_point) {
           copy_it = page_res_it;
           marked_delete_point = TRUE;
         }
       }
     }
     else {
       deleting_from_bol = FALSE;
                                  //Forget earlier potential crunches
       marked_delete_point = FALSE;
     }
     /*
       The following step has been left till now as the tess fails are used to
       determine if the word is deletable.
     */
     if (!crunch_early_merge_tess_fails)
       word->merge_tess_fails();
     page_res_it.forward ();
   }
 }


 // Rewrites two characters in the word's best choice: every "~" becomes "-"
 // and every "^" becomes " ". Whenever a character is rewritten and its
 // reject map entry was accepted, that entry is marked with
 // setrej_unlv_rej().
 void Tesseract::convert_bad_unlv_chs(WERD_RES *word_res) {
   const UNICHAR_ID dash_id = word_res->uch_set->unichar_to_id("-");
   const UNICHAR_ID space_id = word_res->uch_set->unichar_to_id(" ");
   const UNICHAR_ID tilde_id = word_res->uch_set->unichar_to_id("~");
   const UNICHAR_ID caret_id = word_res->uch_set->unichar_to_id("^");

   for (int pos = 0; pos < word_res->reject_map.length(); ++pos) {
     // The two tests are deliberately independent (not else-if): the caret
     // test sees the result of the tilde replacement.
     if (word_res->best_choice->unichar_id(pos) == tilde_id) {
       word_res->best_choice->set_unichar_id(dash_id, pos);
       if (word_res->reject_map[pos].accepted ()) {
         word_res->reject_map[pos].setrej_unlv_rej ();
       }
     }
     if (word_res->best_choice->unichar_id(pos) == caret_id) {
       word_res->best_choice->set_unichar_id(space_id, pos);
       if (word_res->reject_map[pos].accepted ()) {
         word_res->reject_map[pos].setrej_unlv_rej ();
       }
     }
   }
 }

 // Grades the word's best-choice text as G_NEVER_CRUNCH, G_OK, G_DODGY or
 // G_TERRIBLE.
 //
 // The text is scanned one unichar at a time (using the per-unichar byte
 // lengths) with a small state machine that tracks runs of upper-case,
 // lower-case and digit characters. From the scan it derives:
 //   - len:              number of unichars;
 //   - isolated_digits:  single digits not followed by another digit;
 //   - isolated_alphas:  one-character upper/lower runs that are followed by
 //                       a digit, a non-alphanumeric character or the end;
 //   - tess_rejs:        single-byte ' ' characters;
 //   - bad_char_count:   other characters that are not upper/lower/digit;
 //   - longest run lengths and the longest repetition of one letter.
 // These counts, the permuter of the best choice, ok_dict_word and
 // acceptable_word_string() then select the returned level.
 //
 // word:         word to grade; its best_choice, uch_set and reject_map are
 //               read.
 // ok_dict_word: caller's verdict that the word is a safe dictionary word.
 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, BOOL8 ok_dict_word) {
   enum STATES
   {
     JUNK,
     FIRST_UPPER,
     FIRST_LOWER,
     FIRST_NUM,
     SUBSEQUENT_UPPER,
     SUBSEQUENT_LOWER,
     SUBSEQUENT_NUM
   };
   // Start of the text and of its per-unichar byte lengths. These are NOT
   // advanced by the scan below: they are passed, after the scan, to
   // acceptable_word_string() and strpbrk(), which must see the whole word.
   // (Scanning with str/lengths themselves would leave them on the
   // terminator, making those later checks operate on an empty string.)
   const char *str = word->best_choice->unichar_string().string();
   const char *lengths = word->best_choice->unichar_lengths().string();
   STATES state = JUNK;
   int len = 0;
   int isolated_digits = 0;
   int isolated_alphas = 0;
   int bad_char_count = 0;
   int tess_rejs = 0;
   int dodgy_chars = 0;
   int ok_chars;
   UNICHAR_ID last_char = -1;
   int alpha_repetition_count = 0;
   int longest_alpha_repetition_count = 0;
   int longest_lower_run_len = 0;
   int lower_string_count = 0;
   int longest_upper_run_len = 0;
   int upper_string_count = 0;
   int total_alpha_count = 0;
   int total_digit_count = 0;

   // Scan cursors: ch walks the text, ch_len walks the matching byte lengths.
   const char *ch = str;
   const char *ch_len = lengths;
   for (; *ch != '\0'; ch += *(ch_len++)) {
     len++;
     if (word->uch_set->get_isupper (ch, *ch_len)) {
       total_alpha_count++;
       switch (state) {
         case SUBSEQUENT_UPPER:
         case FIRST_UPPER:
           // Continuing an upper-case run.
           state = SUBSEQUENT_UPPER;
           upper_string_count++;
           if (longest_upper_run_len < upper_string_count)
             longest_upper_run_len = upper_string_count;
           if (last_char == word->uch_set->unichar_to_id(ch, *ch_len)) {
             alpha_repetition_count++;
             if (longest_alpha_repetition_count < alpha_repetition_count) {
               longest_alpha_repetition_count = alpha_repetition_count;
             }
           }
           else {
             last_char = word->uch_set->unichar_to_id(ch, *ch_len);
             alpha_repetition_count = 1;
           }
           break;
         case FIRST_NUM:
           isolated_digits++;
           // fall through
         default:
           // Starting a new upper-case run.
           state = FIRST_UPPER;
           last_char = word->uch_set->unichar_to_id(ch, *ch_len);
           alpha_repetition_count = 1;
           upper_string_count = 1;
           break;
       }
     }
     else if (word->uch_set->get_islower (ch, *ch_len)) {
       total_alpha_count++;
       switch (state) {
         case SUBSEQUENT_LOWER:
         case FIRST_LOWER:
           // Continuing a lower-case run.
           state = SUBSEQUENT_LOWER;
           lower_string_count++;
           if (longest_lower_run_len < lower_string_count)
             longest_lower_run_len = lower_string_count;
           if (last_char == word->uch_set->unichar_to_id(ch, *ch_len)) {
             alpha_repetition_count++;
             if (longest_alpha_repetition_count < alpha_repetition_count) {
               longest_alpha_repetition_count = alpha_repetition_count;
             }
           }
           else {
             last_char = word->uch_set->unichar_to_id(ch, *ch_len);
             alpha_repetition_count = 1;
           }
           break;
         case FIRST_NUM:
           isolated_digits++;
           // fall through
         default:
           // Starting a new lower-case run.
           state = FIRST_LOWER;
           last_char = word->uch_set->unichar_to_id(ch, *ch_len);
           alpha_repetition_count = 1;
           lower_string_count = 1;
           break;
       }
     }
     else if (word->uch_set->get_isdigit (ch, *ch_len)) {
       total_digit_count++;
       switch (state) {
         case FIRST_NUM:
           state = SUBSEQUENT_NUM;
           // fall through
         case SUBSEQUENT_NUM:
           break;
         case FIRST_UPPER:
         case FIRST_LOWER:
           isolated_alphas++;
           // fall through
         default:
           state = FIRST_NUM;
           break;
       }
     }
     else {
       // Neither letter nor digit: a single-byte space counts as a tess
       // reject, anything else as a bad character.
       if (*ch_len == 1 && *ch == ' ')
         tess_rejs++;
       else
         bad_char_count++;
       switch (state) {
         case FIRST_NUM:
           isolated_digits++;
           break;
         case FIRST_UPPER:
         case FIRST_LOWER:
           isolated_alphas++;
           // fall through
         default:
           break;
       }
       state = JUNK;
     }
   }

   // Account for a one-character run that ends the word.
   switch (state) {
     case FIRST_NUM:
       isolated_digits++;
       break;
     case FIRST_UPPER:
     case FIRST_LOWER:
       isolated_alphas++;
       // fall through
     default:
       break;
   }

   if (crunch_include_numerals) {
     total_alpha_count += total_digit_count - isolated_digits;
   }

   // Mostly-alphabetic words of 4+ characters without long repetitions are
   // protected outright if they are acceptable or contain a long enough
   // single-case run.
   if (crunch_leave_ok_strings && len >= 4 &&
       2 * (total_alpha_count - isolated_alphas) > len &&
       longest_alpha_repetition_count < crunch_long_repetitions) {
     if ((crunch_accept_ok &&
          acceptable_word_string(*word->uch_set, str, lengths) !=
              AC_UNACCEPTABLE) ||
         longest_lower_run_len > crunch_leave_lc_strings ||
         longest_upper_run_len > crunch_leave_uc_strings)
       return G_NEVER_CRUNCH;
   }
   // Multi-character words with no space in the text that come from a
   // dictionary/number permuter, or that are otherwise acceptable, are OK.
   if (word->reject_map.length() > 1 &&
       strpbrk(str, " ") == NULL &&
       (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
        word->best_choice->permuter() == FREQ_DAWG_PERM ||
        word->best_choice->permuter() == USER_DAWG_PERM ||
        word->best_choice->permuter() == NUMBER_PERM ||
        acceptable_word_string(*word->uch_set, str, lengths) !=
            AC_UNACCEPTABLE || ok_dict_word))
     return G_OK;

   ok_chars = len - bad_char_count - isolated_digits -
     isolated_alphas - tess_rejs;

   if (crunch_debug > 3) {
     tprintf("garbage_word: \"%s\"\n",
             word->best_choice->unichar_string().string());
     tprintf("LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\n",
             len,
             bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
   }
   if (bad_char_count == 0 &&
       tess_rejs == 0 &&
       (len > isolated_digits + isolated_alphas || len <= 2))
     return G_OK;

   if (tess_rejs > ok_chars ||
       (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
     return G_TERRIBLE;

   // Tess rejects weigh double in the dodgy-character score. Longer words
   // also count isolated digits/letters; short words use fixed thresholds.
   if (len > 4) {
     dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
         isolated_alphas;
     if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5)
       return G_DODGY;
     else
       return G_OK;
   } else {
     dodgy_chars = 2 * tess_rejs + bad_char_count;
     if ((len == 4 && dodgy_chars > 2) ||
         (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
       return G_DODGY;
     else
       return G_OK;
   }
 }


 /*************************************************************************
  * word_deletable()
  *     DELETE WERDS AT ENDS OF ROWS IF
  *        Word is crunched &&
  *        ( string length = 0                                          OR
  *          > 50% of chars are "|" (before merging)                    OR
  *          certainty < -10                                            OR
  *          rating /char > 60                                          OR
  *          TOP of word is more than 0.5 xht BELOW baseline            OR
  *          BOTTOM of word is more than 0.5 xht ABOVE xht              OR
  *          length of word < 3xht                                      OR
  *          height of word < 0.7 xht                                   OR
  *          height of word > 3.0 xht                                   OR
  *          >75% of the outline BBs have longest dimension < 0.5xht )
  *************************************************************************/

 // Returns the crunch mode to apply if this word is deleted, or CR_NONE if
 // it should be kept. delete_mode receives the number of the rule that fired
 // (0 when none did); the rules are tested in the order written below.
 CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, inT16 &delete_mode) {
   // Words that were never crunched are not candidates.
   if (word->unlv_crunch_mode == CR_NONE) {
     delete_mode = 0;
     return CR_NONE;
   }

   const int char_total = word->reject_map.length ();
   if (char_total == 0) {
     delete_mode = 1;
     return CR_DELETE;
   }

   // NOTE(review): box stays default-constructed when rebuild_word is NULL,
   // yet the geometric rules 9, 10, 11 and 3 below still read it - confirm
   // that a default TBOX gives the intended result there.
   TBOX box;                       //BB of word
   if (word->rebuild_word != NULL) {
     // Cube leaves rebuild_word NULL.
     box = word->rebuild_word->bounding_box();
     if (box.height () < crunch_del_min_ht * kBlnXHeight) {
       delete_mode = 4;
       return CR_DELETE;
     }
     if (noise_outlines(word->rebuild_word)) {
       delete_mode = 5;
       return CR_DELETE;
     }
   }

   // Remaining rules all yield CR_LOOSE_SPACE; the first that matches wins.
   inT16 loose_rule = 0;
   if ((failure_count (word) * 1.5) > char_total) {
     loose_rule = 2;   // too many spaces in the text
   } else if (word->best_choice->certainty () < crunch_del_cert) {
     loose_rule = 7;   // certainty too low
   } else if (word->best_choice->rating () / char_total > crunch_del_rating) {
     loose_rule = 8;   // rating per character too high
   } else if (box.top () <
              kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
     loose_rule = 9;   // word sits too low
   } else if (box.bottom () >
              kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
     loose_rule = 10;  // word sits too high
   } else if (box.height () > crunch_del_max_ht * kBlnXHeight) {
     loose_rule = 11;  // word too tall
   } else if (box.width () < crunch_del_min_width * kBlnXHeight) {
     loose_rule = 3;   // word too narrow
   }

   delete_mode = loose_rule;
   return loose_rule != 0 ? CR_LOOSE_SPACE : CR_NONE;
 }

 // Returns the number of ' ' bytes in the word's best-choice text.
 inT16 Tesseract::failure_count(WERD_RES *word) {
   int space_total = 0;
   const char *cursor = word->best_choice->unichar_string().string();
   while (*cursor != '\0') {
     if (*cursor == ' ') {
       ++space_total;
     }
     ++cursor;
   }
   return space_total;
 }


 // Returns TRUE when every outline of every blob in the word is "small",
 // i.e. the longer side of its bounding box is below
 // kBlnXHeight * crunch_small_outlines_size. A word with no outlines at all
 // also returns TRUE, since both counts are then zero.
 BOOL8 Tesseract::noise_outlines(TWERD *word) {
   const float small_limit = kBlnXHeight * crunch_small_outlines_size;
   inT16 total_outlines = 0;
   inT16 small_outlines = 0;

   for (int blob_idx = 0; blob_idx < word->NumBlobs(); ++blob_idx) {
     TBLOB* blob = word->blobs[blob_idx];
     TESSLINE* outline = blob->outlines;
     while (outline != NULL) {
       ++total_outlines;
       TBOX box = outline->bounding_box();   // BB of outline
       const inT16 longest_side =
           box.height() > box.width() ? box.height() : box.width();
       if (longest_side < small_limit) {
         ++small_outlines;
       }
       outline = outline->next;
     }
   }
   return small_outlines >= total_outlines;
 }

 }  // namespace tesseract
tesseract::DocQualCallbacks
Definition: docqual.cpp:35

PAGE_RES_IT
Definition: pageres.h:659

BLOCK_RES
Definition: pageres.h:97

TESSLINE::next
TESSLINE * next
Definition: blobs.h:258

ROW_RES::rej_count
inT32 rej_count
Definition: pageres.h:129

tesseract::DocQualCallbacks::AcceptIfGoodQuality
void AcceptIfGoodQuality(int index)
Definition: docqual.cpp:49

PAGE_RES::char_count
inT32 char_count
Definition: pageres.h:60

G_TERRIBLE
Definition: docqual.h:30

tesseract::Tesseract::convert_bad_unlv_chs
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:664

SYSTEM_DAWG_PERM
Definition: ratngs.h:249

AC_UNACCEPTABLE
Unacceptable word.
Definition: control.h:36

TRUE
#define TRUE
Definition: capi.h:45

WERD_CHOICE::unichar_id
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313

REJMAP::rej_word_block_rej
void rej_word_block_rej()
Definition: rejctmap.cpp:503

NewPermanentTessCallback
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116

PDBLK::index
int index() const
Definition: pdblock.h:67

UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:33

tesseract::BoxWord::ProcessMatchedBlobs
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:193

WERD_RES::best_choice
WERD_CHOICE * best_choice
Definition: pageres.h:219

tesseract::Tesseract::unrej_good_chs
void unrej_good_chs(WERD_RES *word, ROW *row)
Definition: docqual.cpp:120

WERD_CHOICE::unichar_lengths
const STRING & unichar_lengths() const
Definition: ratngs.h:546

PAGE_RES_IT::prev_row
ROW_RES * prev_row() const
Definition: pageres.h:730

tesseract::DocQualCallbacks::word
WERD_RES * word
Definition: docqual.cpp:54

tesseractclass.h

W_EOL
Definition: werd.h:36

TBLOB::outlines
TESSLINE * outlines
Definition: blobs.h:377

kBlnXHeight
const int kBlnXHeight
Definition: normalis.h:28

PAGE_RES::rej_count
inT32 rej_count
Definition: pageres.h:61

WERD_RES::rebuild_word
TWERD * rebuild_word
Definition: pageres.h:244

tprintf
#define tprintf(...)
Definition: tprintf.h:31

tesseract::Tesseract::quality_based_rejection
void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:143

TBLOB::NumOutlines
int NumOutlines() const
Definition: blobs.cpp:469

BLOCK_RES::block
BLOCK * block
Definition: pageres.h:99

PAGE_RES_IT::rej_stat_word
void rej_stat_word()
Definition: pageres.cpp:1675

STRING::string
const char * string() const
Definition: strngs.cpp:198

kBlnBaselineOffset
const int kBlnBaselineOffset
Definition: normalis.h:29

CR_LOOSE_SPACE
Definition: pageres.h:149

ROW_RES::row
ROW * row
Definition: pageres.h:127

GenericVector::empty
bool empty() const
Definition: genericvector.h:90

POLY_BLOCK
Definition: polyblk.h:28

tessvars.h

STRING::length
inT32 length() const
Definition: strngs.cpp:193

tesseract::Tesseract::word_blob_quality
inT16 word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:65

tesseract::Tesseract::doc_and_block_rejection
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:237

tesseract
Definition: baseapi.cpp:82

tesscallback.h

inT16
int16_t inT16
Definition: host.h:36

tesseract::Tesseract::potential_word_crunch
BOOL8 potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word)
Definition: docqual.cpp:546

PAGE_RES_IT::row
ROW_RES * row() const
Definition: pageres.h:739

WERD::flag
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128

REJMAP::accept_count
inT16 accept_count()
Definition: rejctmap.cpp:329

WERD_RES::reject_spaces
BOOL8 reject_spaces
Definition: pageres.h:320

TWERD
Definition: blobs.h:395

REJMAP::length
inT32 length() const
Definition: rejctmap.h:235

WERD_CHOICE::permuter
uinT8 permuter() const
Definition: ratngs.h:344

PAGE_RES_IT::restart_page
WERD_RES * restart_page()
Definition: pageres.h:683

POLY_BLOCK::IsText
bool IsText() const
Definition: polyblk.h:52

tesseract::Tesseract::word_deletable
CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode)
Definition: docqual.cpp:899

ROW_RES::whole_word_rej_count
inT32 whole_word_rej_count
Definition: pageres.h:130

PAGE_RES_IT::page_res
PAGE_RES * page_res
Definition: pageres.h:661

CRUNCH_MODE
CRUNCH_MODE
Definition: pageres.h:145

PAGE_RES_IT::forward
WERD_RES * forward()
Definition: pageres.h:716

UNICHARSET::get_isdigit
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472

REJMAP::reject_count
inT16 reject_count()
Definition: rejctmap.h:241

BLOCK_RES::rej_count
inT32 rej_count
Definition: pageres.h:101

PAGE_RES::rejected
BOOL8 rejected
Definition: pageres.h:63

BOOL8
unsigned char BOOL8
Definition: host.h:44

STRING
Definition: strngs.h:45

FALSE
#define FALSE
Definition: capi.h:46

G_OK
Definition: docqual.h:28

CR_DELETE
Definition: pageres.h:150

ROW_RES::char_count
inT32 char_count
Definition: pageres.h:128

CR_NONE
Definition: pageres.h:147

tesseract::Tesseract::garbage_word
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word)
Definition: docqual.cpp:684

tesseract::Tesseract::terrible_word_crunch
BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:508

tesseract::Tesseract::tilde_crunch
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:422

tesseract::DocQualCallbacks::accepted_match_count
inT16 accepted_match_count
Definition: docqual.cpp:56

WERD_RES
Definition: pageres.h:155

WERD_CHOICE::certainty
float certainty() const
Definition: ratngs.h:328

tesseract::Tesseract::word_char_quality
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:97

W_BOL
Definition: werd.h:35

reject.h

WERD_RES::unlv_crunch_mode
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294

TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:425

TBOX::top
inT16 top() const
Definition: rect.h:54

REJMAP::quality_recoverable_rejects
BOOL8 quality_recoverable_rejects()
Definition: rejctmap.cpp:352

tesseract::DocQualCallbacks::match_count
inT16 match_count
Definition: docqual.cpp:55

tesseract::Tesseract::tilde_delete
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:594

WERD_CHOICE::set_unichar_id
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:357

WERD_CHOICE::unichar_string
const STRING & unichar_string() const
Definition: ratngs.h:539

TBOX
Definition: rect.h:30

TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:436

WERD_RES::bln_boxes
tesseract::BoxWord * bln_boxes
Definition: pageres.h:184

NUMBER_PERM
Definition: ratngs.h:247

PDBLK::poly_block
POLY_BLOCK * poly_block() const
Definition: pdblock.h:55

tesseract::Tesseract::word_outline_errs
inT16 word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:77

TBLOB
Definition: blobs.h:261

WERD_RES::merge_tess_fails
void merge_tess_fails()
Definition: pageres.cpp:1062

TBOX::height
inT16 height() const
Definition: rect.h:104

WERD_RES::word
WERD * word
Definition: pageres.h:175

G_DODGY
Definition: docqual.h:29

CR_KEEP_SPACE
Definition: pageres.h:148

TBOX::width
inT16 width() const
Definition: rect.h:111

tesseract::DocQualCallbacks::DocQualCallbacks
DocQualCallbacks(WERD_RES *word0)
Definition: docqual.cpp:36

tesseract::Tesseract::count_outline_errs
inT16 count_outline_errs(char c, inT16 outline_count)
Definition: docqual.cpp:131

globals.h

tesseract::Tesseract::noise_outlines
BOOL8 noise_outlines(TWERD *word)
Definition: docqual.cpp:982

PAGE_RES_IT::word
WERD_RES * word() const
Definition: pageres.h:736

TBOX::bottom
inT16 bottom() const
Definition: rect.h:61

USER_DAWG_PERM
Definition: ratngs.h:251

STRING::unsigned_size
uinT32 unsigned_size() const
Definition: strngs.h:71

TESSLINE
Definition: blobs.h:180

docqual.h

UNICHARSET::get_isupper
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:465

tesseract::DocQualCallbacks::CountMatchingBlobs
void CountMatchingBlobs(int index)
Definition: docqual.cpp:39

WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:192

UNICHARSET::get_islower
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:458

REJMAP::rej_word_row_rej
void rej_word_row_rej()
Definition: rejctmap.cpp:512

UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

BLOCK_RES::char_count
inT32 char_count
Definition: pageres.h:100

ROW_RES
Definition: pageres.h:125

REJMAP::rej_word_doc_rej
void rej_word_doc_rej()
Definition: rejctmap.cpp:494

FREQ_DAWG_PERM
Definition: ratngs.h:252

WERD_RES::reject_map
REJMAP reject_map
Definition: pageres.h:271

GARBAGE_LEVEL
GARBAGE_LEVEL
Definition: docqual.h:25

WERD::space
uinT8 space()
Definition: werd.h:104

ROW
Definition: ocrrow.h:32

tesseract::Tesseract::failure_count
inT16 failure_count(WERD_RES *word)
Definition: docqual.cpp:970

tesseract::Tesseract::unrej_good_quality_words
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:165

tesseract::DocQualCallbacks::CountAcceptedBlobs
void CountAcceptedBlobs(int index)
Definition: docqual.cpp:43

G_NEVER_CRUNCH
Definition: docqual.h:27

TWERD::bounding_box
TBOX bounding_box() const
Definition: blobs.cpp:879

PAGE_RES_IT::block
BLOCK_RES * block() const
Definition: pageres.h:742

WERD_CHOICE::rating
float rating() const
Definition: ratngs.h:325

reject_whole_page
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:411