tesseract/a00053_source.html

 /**********************************************************************
  * File:        applybox.cpp  (Formerly applybox.c)
  * Description: Re segment rows according to box file data
  * Author:      Phil Cheatle
  * Created:     Wed Nov 24 09:11:23 GMT 1993
  *
  * (C) Copyright 1993, Hewlett-Packard Ltd.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  **********************************************************************/

 #ifdef _MSC_VER
 #pragma warning(disable:4244)  // Conversion warnings
 #endif

 #include <ctype.h>
 #include <string.h>
 #ifdef __UNIX__
 #include <assert.h>
 #include <errno.h>
 #endif
 #include "allheaders.h"
 #include "boxread.h"
 #include "chopper.h"
 #include "pageres.h"
 #include "unichar.h"
 #include "unicharset.h"
 #include "tesseractclass.h"
 #include "genericvector.h"

 // Largest number of consecutive blobs that FindSegmentation classifies
 // together as a single piece when building its table of choices.
 const int kMaxGroupSize = 4;
 // Fraction of the median x-height by which a row's x-height may differ from
 // the median before PreenXHeights overwrites it with the median.
 const double kMaxXHeightDeviationFraction = 0.125;

 namespace tesseract {

 // Sets the text of every word in every row of every block in block_list
 // to the empty string.
 static void clear_any_old_text(BLOCK_LIST *block_list) {
   BLOCK_IT blocks(block_list);
   blocks.mark_cycle_pt();
   while (!blocks.cycled_list()) {
     ROW_IT rows(blocks.data()->row_list());
     rows.mark_cycle_pt();
     while (!rows.cycled_list()) {
       WERD_IT words(rows.data()->word_list());
       words.mark_cycle_pt();
       while (!words.cycled_list()) {
         words.data()->set_text("");
         words.forward();
       }
       rows.forward();
     }
     blocks.forward();
   }
 }

 // Reads the box file that goes with the image name fname and resegments the
 // words of block_list (the page) to agree with it. Two modes exist:
 // blob-mode (find_segmentation false): each line of the box file describes
 // one blob, and the existing words are used as input. No classifier is
 // needed, but the chopper can still be used to separate touching characters
 // with the help of the input boxes.
 // word/line-mode (find_segmentation true): each line of the box file
 // describes one word, with one blob per space-delimited unit after the #.
 // The classifier is used to re-segment the words/lines to match the
 // space-delimited truth string of each box, so an input box may cover a word
 // or even a whole text line, and the output words will contain multiple
 // blobs corresponding to the space-delimited input string.
 // Returns NULL if the boxes cannot be read. Otherwise the WERD_RES in the
 // returned PAGE_RES are set up as normal classification would leave them,
 // ie. with a word, chopped_word, rebuild_word, seam_array, denorm, box_word,
 // and best_state, but NO best_choice or raw_choice, as they would require a
 // UNICHARSET, which we aim to avoid. Instead the correct_text member of
 // WERD_RES is set, which CorrectClassifyWords can later convert to a
 // best_choice. CorrectClassifyWords is not required before calling
 // ApplyBoxTraining.
 PAGE_RES* Tesseract::ApplyBoxes(const STRING& fname,
                                 bool find_segmentation,
                                 BLOCK_LIST *block_list) {
   GenericVector<TBOX> boxes;
   GenericVector<STRING> texts, full_texts;
   const bool boxes_read = ReadAllBoxes(applybox_page, true, fname, &boxes,
                                        &texts, &full_texts, NULL);
   if (!boxes_read)
     return NULL;  // Nothing to apply.

   const int num_boxes_read = boxes.size();
   int num_failed = 0;
   // Terminate each vector with an empty entry, so that every real box has a
   // "next" entry to be compared against.
   boxes.push_back(TBOX());
   texts.push_back(STRING());
   full_texts.push_back(STRING());

   // Blob mode maximally chops the existing words up front. Word mode leaves
   // page_res NULL for now and makes a word for each box instead.
   PAGE_RES* page_res = NULL;
   if (!find_segmentation)
     page_res = SetupApplyBoxes(boxes, block_list);
   clear_any_old_text(block_list);

   for (int b = 0; b + 1 < boxes.size(); ++b) {
     bool matched = false;
     if (page_res == NULL) {
       matched = ResegmentWordBox(block_list, boxes[b], boxes[b + 1],
                                  texts[b].string());
     } else {
       // The first box has no predecessor.
       const TBOX* prev_box = (b == 0) ? NULL : &boxes[b - 1];
       matched = ResegmentCharBox(page_res, prev_box, boxes[b], boxes[b + 1],
                                  full_texts[b].string());
     }
     if (matched)
       continue;
     ++num_failed;
     ReportFailedBox(b, boxes[b], texts[b].string(),
                     "FAILURE! Couldn't find a matching blob");
   }

   if (page_res == NULL) {
     // Word/line mode: now that the words exist, maximally chop them all and
     // resegment them with the classifier.
     page_res = SetupApplyBoxes(boxes, block_list);
     ReSegmentByClassification(page_res);
   }
   if (applybox_debug > 0) {
     tprintf("APPLY_BOXES:\n");
     tprintf("   Boxes read from boxfile:  %6d\n", num_boxes_read);
     if (num_failed > 0)
       tprintf("   Boxes failed resegmentation:  %6d\n", num_failed);
   }
   TidyUp(page_res);
   return page_res;
 }

 // Helper computes the median x-height over all the rows of all the blocks
 // in block_list. The histogram used to find the median is sized from the
 // height of the bounding box of the first block in the list.
 // Returns 0.0 if block_list contains no blocks.
 static double MedianXHeight(BLOCK_LIST *block_list) {
   // With no blocks there is no first block to size the histogram from, so
   // return early rather than dereferencing the iterator's current element.
   if (block_list->empty())
     return 0.0;
   BLOCK_IT block_it(block_list);
   STATS xheights(0, block_it.data()->bounding_box().height());
   for (block_it.mark_cycle_pt();
        !block_it.cycled_list(); block_it.forward()) {
     ROW_IT row_it(block_it.data()->row_list());
     for (row_it.mark_cycle_pt(); !row_it.cycled_list(); row_it.forward()) {
       // Each row contributes its rounded x-height once.
       xheights.add(IntCastRounded(row_it.data()->x_height()), 1);
     }
   }
   return xheights.median();
 }

 // Forces outlying row x-heights back to the median: any row whose x-height
 // differs from the median x-height of block_list by more than
 // kMaxXHeightDeviationFraction of that median has its x-height replaced by
 // the median.
 void Tesseract::PreenXHeights(BLOCK_LIST *block_list) {
   const double median = MedianXHeight(block_list);
   const double tolerance = kMaxXHeightDeviationFraction * median;
   BLOCK_IT blocks(block_list);
   for (blocks.mark_cycle_pt(); !blocks.cycled_list(); blocks.forward()) {
     ROW_IT rows(blocks.data()->row_list());
     for (rows.mark_cycle_pt(); !rows.cycled_list(); rows.forward()) {
       ROW* row = rows.data();
       const float deviation = fabs(row->x_height() - median);
       if (!(deviation > tolerance))
         continue;  // Close enough to the median; leave the row alone.
       if (applybox_debug) {
         tprintf("row xheight=%g, but median xheight = %g\n",
                 row->x_height(), median);
       }
       row->set_x_height(static_cast<float>(median));
     }
   }
 }

 // Prepares block_list for box application and returns a newly allocated
 // PAGE_RES built from it. Row x-heights are preened, words without any
 // blobs are deleted, the fuzzy space flags are cleared on the remaining
 // words, and every word of the new PAGE_RES is maximally chopped.
 PAGE_RES* Tesseract::SetupApplyBoxes(const GenericVector<TBOX>& boxes,
                                      BLOCK_LIST *block_list) {
   PreenXHeights(block_list);
   // Drop empty words and strip all fuzzy space markers to simplify the
   // PAGE_RES.
   BLOCK_IT blocks(block_list);
   for (blocks.mark_cycle_pt(); !blocks.cycled_list(); blocks.forward()) {
     ROW_IT rows(blocks.data()->row_list());
     for (rows.mark_cycle_pt(); !rows.cycled_list(); rows.forward()) {
       WERD_IT words(rows.data()->word_list());
       for (words.mark_cycle_pt(); !words.cycled_list(); words.forward()) {
         WERD* word = words.data();
         if (!word->cblob_list()->empty()) {
           word->set_flag(W_FUZZY_SP, false);
           word->set_flag(W_FUZZY_NON, false);
           continue;
         }
         delete words.extract();
       }
     }
   }
   PAGE_RES* page_res = new PAGE_RES(false, block_list, NULL);
   PAGE_RES_IT pr_it(page_res);
   for (WERD_RES* word_res = pr_it.word(); word_res != NULL;
        pr_it.forward(), word_res = pr_it.word()) {
     MaximallyChopWord(boxes, pr_it.block()->block,
                       pr_it.row()->row, word_res);
   }
   return page_res;
 }

 void Tesseract::MaximallyChopWord(const GenericVector<TBOX>& boxes,
                                   BLOCK* block, ROW* row,
                                   WERD_RES* word_res) {
   if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
                                      tessedit_ocr_engine_mode, NULL,
                                      classify_bln_numeric_mode,
                                      textord_use_cjk_fp_model,
                                      poly_allow_detailed_fx,
                                      row, block)) {
     word_res->CloneChoppedToRebuild();
     return;
   }
   if (chop_debug) {
     tprintf("Maximally chopping word at:");
     word_res->word->bounding_box().print();
   }
   GenericVector<BLOB_CHOICE*> blob_choices;
   ASSERT_HOST(!word_res->chopped_word->blobs.empty());
   float rating = static_cast<float>(MAX_INT8);
   for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
     // The rating and certainty are not quite arbitrary. Since
     // select_blob_to_chop uses the worst certainty to choose, they all have
     // to be different, so starting with MAX_INT8, subtract 1/8 for each blob
     // in here, and then divide by e each time they are chopped, which
     // should guarantee a set of unequal values for the whole tree of blobs
     // produced, however much chopping is required. The chops are thus only
     // limited by the ability of the chopper to find suitable chop points,
     // and not by the value of the certainties.
     BLOB_CHOICE* choice =
         new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
     blob_choices.push_back(choice);
     rating -= 0.125f;
   }
   const double e = exp(1.0);  // The base of natural logs.
   int blob_number;
   int right_chop_index = 0;
   if (!assume_fixed_pitch_char_segment) {
     // We only chop if the language is not fixed pitch like CJK.
     SEAM* seam = NULL;
     while ((seam = chop_one_blob(boxes, blob_choices, word_res,
                                  &blob_number)) != NULL) {
       word_res->InsertSeam(blob_number, seam);
       BLOB_CHOICE* left_choice = blob_choices[blob_number];
       rating = left_choice->rating() / e;
       left_choice->set_rating(rating);
       left_choice->set_certainty(-rating);
       // combine confidence w/ serial #
       BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index,
                                                   rating - 0.125f, -rating, -1,
                                                   0.0f, 0.0f, 0.0f, BCC_FAKE);
       blob_choices.insert(right_choice, blob_number + 1);
     }
   }
   word_res->CloneChoppedToRebuild();
   word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
 }

 // Returns a measure of how poorly box1 and box2 match: the fraction of
 // box1's area lying outside their intersection multiplied by the fraction
 // of box2's area lying outside their intersection. The result is 0 when the
 // intersection covers all of either box, and grows as the overlap shrinks.
 // If either box has no positive area the fractions are undefined, so the
 // worst value, 1.0, is returned instead of dividing by zero. That can occur
 // for the empty box that ApplyBoxes appends after the last real box.
 static double BoxMissMetric(const TBOX& box1, const TBOX& box2) {
   const int area1 = box1.area();
   const int area2 = box2.area();
   if (area1 <= 0 || area2 <= 0)
     return 1.0;  // A box without area cannot match anything.
   const int overlap_area = box1.intersection(box2).area();
   double miss_metric = area1 - overlap_area;
   miss_metric /= area1;
   miss_metric *= area2 - overlap_area;
   miss_metric /= area2;
   return miss_metric;
 }

 // Tries to find, in page_res, a run of consecutive blobs within a single
 // word that corresponds to box, and labels the run with correct_text.
 // Only words whose box_word bounding box has a major overlap with box are
 // examined. Within such a word, a run starting at blob i is extended while
 // each blob has a major overlap with box, has not already been given a
 // correct_text, and is not a better match (by BoxMissMetric) for next_box.
 // On finding a non-empty run, the run's boxes are merged in box_word, the
 // run length is stored in best_state[i], correct_text is stored in
 // correct_text[i], the entries of the other blobs in the run are removed,
 // and true is returned.
 // prev_box may be NULL, as it is for the first box in the file.
 // Returns false if no run is found, or if the run found does not almost
 // equal box while box has an x_gap of less than -3 with next_box or with
 // prev_box.
 bool Tesseract::ResegmentCharBox(PAGE_RES* page_res, const TBOX *prev_box,
                                  const TBOX& box, const TBOX& next_box,
                                  const char* correct_text) {
   if (applybox_debug > 1) {
     tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
   }
   PAGE_RES_IT page_res_it(page_res);
   WERD_RES* word_res;
   for (word_res = page_res_it.word(); word_res != NULL;
        word_res = page_res_it.forward()) {
     if (!word_res->box_word->bounding_box().major_overlap(box))
       continue;
     if (applybox_debug > 1) {
       tprintf("Checking word box:");
       word_res->box_word->bounding_box().print();
     }
     int word_len = word_res->box_word->length();
     for (int i = 0; i < word_len; ++i) {
       // char_box accumulates the union of the blob boxes in the run that
       // starts at blob i.
       TBOX char_box = TBOX();
       int blob_count = 0;
       for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
         TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
         if (!blob_box.major_overlap(box))
           break;
         if (word_res->correct_text[i + blob_count].length() > 0)
           break;  // Blob is claimed already.
         double current_box_miss_metric = BoxMissMetric(blob_box, box);
         double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
         if (applybox_debug > 2) {
           tprintf("Checking blob:");
           blob_box.print();
           tprintf("Current miss metric = %g, next = %g\n",
                   current_box_miss_metric, next_box_miss_metric);
         }
         if (current_box_miss_metric > next_box_miss_metric)
           break;  // Blob is a better match for next box.
         char_box += blob_box;
       }
       if (blob_count > 0) {
         if (applybox_debug > 1) {
           tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
         }
         // Give up on this box if the blobs found do not reproduce it to
         // within a tolerance of 3 and the box has an x_gap below -3 with
         // one of its neighbours (presumably meaning that they overlap
         // horizontally - TODO confirm against TBOX::x_gap).
         if (!char_box.almost_equal(box, 3) &&
             (box.x_gap(next_box) < -3 ||
              (prev_box != NULL && prev_box->x_gap(box) < -3))) {
           return false;
         }
         // We refine just the box_word, best_state and correct_text here.
         // The rebuild_word is made in TidyUp.
         // blob_count blobs are put together to match the box. Merge the
         // box_word boxes, save the blob_count in the state and the text.
         word_res->box_word->MergeBoxes(i, i + blob_count);
         word_res->best_state[i] = blob_count;
         word_res->correct_text[i] = correct_text;
         if (applybox_debug > 2) {
           tprintf("%d Blobs match: blob box:", blob_count);
           word_res->box_word->BlobBox(i).print();
           tprintf("Matches box:");
           box.print();
           tprintf("With next box:");
           next_box.print();
         }
         // Eliminate the best_state and correct_text entries for the
         // consumed blobs. Index i + 1 is removed each time, as every
         // removal shifts the remaining entries down by one.
         for (int j = 1; j < blob_count; ++j) {
           word_res->best_state.remove(i + 1);
           word_res->correct_text.remove(i + 1);
         }
         // Assume that no box spans multiple source words, so we are done with
         // this box.
         if (applybox_debug > 1) {
           tprintf("Best state = ");
           for (int j = 0; j < word_res->best_state.size(); ++j) {
             tprintf("%d ", word_res->best_state[j]);
           }
           tprintf("\n");
           tprintf("Correct text = [[ ");
           for (int j = 0; j < word_res->correct_text.size(); ++j) {
             tprintf("%s ", word_res->correct_text[j].string());
           }
           tprintf("]]\n");
         }
         return true;
       }
     }
   }
   if (applybox_debug > 0) {
     tprintf("FAIL!\n");
   }
   return false;  // Failure.
 }

 // Gathers the blobs in block_list that correspond to box into a single new
 // word whose text is correct_text.
 // Blocks, rows and words that do not have a major overlap with box are
 // skipped, as are words that already have a non-empty text. In the
 // remaining words, each blob that has a major overlap with box, and is not
 // a better match (by BoxMissMetric) for next_box, is moved out of its word
 // and into the new word. The new word is made on the first matching blob,
 // as a shallow copy of the word holding that blob, and is added to the end
 // of the word list being iterated at that point.
 // Returns true if a new word was made, ie. if at least one blob matched.
 bool Tesseract::ResegmentWordBox(BLOCK_LIST *block_list,
                                  const TBOX& box, const TBOX& next_box,
                                  const char* correct_text) {
   if (applybox_debug > 1) {
     tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
   }
   // Stays NULL until the first matching blob is found.
   WERD* new_word = NULL;
   BLOCK_IT b_it(block_list);
   for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
     BLOCK* block = b_it.data();
     if (!box.major_overlap(block->bounding_box()))
       continue;
     ROW_IT r_it(block->row_list());
     for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
       ROW* row = r_it.data();
       if (!box.major_overlap(row->bounding_box()))
         continue;
       WERD_IT w_it(row->word_list());
       for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
         WERD* word = w_it.data();
         if (applybox_debug > 2) {
           tprintf("Checking word:");
           word->bounding_box().print();
         }
         if (word->text() != NULL && word->text()[0] != '\0')
           continue;  // Ignore words that are already done.
         if (!box.major_overlap(word->bounding_box()))
           continue;
         C_BLOB_IT blob_it(word->cblob_list());
         for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
              blob_it.forward()) {
           C_BLOB* blob = blob_it.data();
           TBOX blob_box = blob->bounding_box();
           if (!blob_box.major_overlap(box))
             continue;
           double current_box_miss_metric = BoxMissMetric(blob_box, box);
           double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
           if (applybox_debug > 2) {
             tprintf("Checking blob:");
             blob_box.print();
             tprintf("Current miss metric = %g, next = %g\n",
                     current_box_miss_metric, next_box_miss_metric);
           }
           if (current_box_miss_metric > next_box_miss_metric)
             continue;  // Blob is a better match for next box.
           if (applybox_debug > 2) {
             tprintf("Blob match: blob:");
             blob_box.print();
             tprintf("Matches box:");
             box.print();
             tprintf("With next box:");
             next_box.print();
           }
           if (new_word == NULL) {
             // Make a new word with a single blob.
             new_word = word->shallow_copy();
             new_word->set_text(correct_text);
             w_it.add_to_end(new_word);
           }
           // Move the blob out of its current word and onto the end of the
           // new word's blob list.
           C_BLOB_IT new_blob_it(new_word->cblob_list());
           new_blob_it.add_to_end(blob_it.extract());
         }
       }
     }
   }
   if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n");
   return new_word != NULL;
 }

 // For every word in page_res that has a non-empty text, converts the text
 // to unichar ids and looks for a segmentation of the word that matches
 // them. A word for which either step fails is reported and deleted from
 // page_res. Words without text are left untouched.
 void Tesseract::ReSegmentByClassification(PAGE_RES* page_res) {
   PAGE_RES_IT pr_it(page_res);
   for (WERD_RES* word_res = pr_it.word(); word_res != NULL;
        pr_it.forward(), word_res = pr_it.word()) {
     WERD* word = word_res->word;
     const bool has_text = word->text() != NULL && word->text()[0] != '\0';
     if (!has_text)
       continue;  // Nothing to match against.
     // The truth text as a vector of UNICHAR_ID.
     GenericVector<UNICHAR_ID> target_ids;
     if (!ConvertStringToUnichars(word->text(), &target_ids)) {
       tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
               word->text());
       pr_it.DeleteCurrentWord();
     } else if (!FindSegmentation(target_ids, word_res)) {
       tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
               word->text());
       pr_it.DeleteCurrentWord();
     }
   }
 }

 bool Tesseract::ConvertStringToUnichars(const char* utf8,
                                         GenericVector<UNICHAR_ID>* class_ids) {
   for (int step = 0; *utf8 != '\0'; utf8 += step) {
     const char* next_space = strchr(utf8, ' ');
     if (next_space == NULL)
       next_space = utf8 + strlen(utf8);
     step = next_space - utf8;
     UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
     if (class_id == INVALID_UNICHAR_ID) {
       return false;
     }
     while (utf8[step] == ' ')
       ++step;
     class_ids->push_back(class_id);
   }
   return true;
 }

 bool Tesseract::FindSegmentation(const GenericVector<UNICHAR_ID>& target_text,
                                  WERD_RES* word_res) {
   // Classify all required combinations of blobs and save results in choices.
   int word_length = word_res->box_word->length();
   GenericVector<BLOB_CHOICE_LIST*>* choices =
       new GenericVector<BLOB_CHOICE_LIST*>[word_length];
   for (int i = 0; i < word_length; ++i) {
     for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
       BLOB_CHOICE_LIST* match_result = classify_piece(
           word_res->seam_array, i, i + j - 1, "Applybox",
           word_res->chopped_word, word_res->blamer_bundle);
       if (applybox_debug > 2) {
         tprintf("%d+%d:", i, j);
         print_ratings_list("Segment:", match_result, unicharset);
       }
       choices[i].push_back(match_result);
     }
   }
   // Search the segmentation graph for the target text. Must be an exact
   // match. Using wildcards makes it difficult to find the correct
   // segmentation even when it is there.
   word_res->best_state.clear();
   GenericVector<int> search_segmentation;
   float best_rating = 0.0f;
   SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
                 &search_segmentation, &best_rating, &word_res->best_state);
   for (int i = 0; i < word_length; ++i)
     choices[i].delete_data_pointers();
   delete [] choices;
   if (word_res->best_state.empty()) {
     // Build the original segmentation and if it is the same length as the
     // truth, assume it will do.
     int blob_count = 1;
     for (int s = 0; s < word_res->seam_array.size(); ++s) {
       SEAM* seam = word_res->seam_array[s];
       if (!seam->HasAnySplits()) {
         word_res->best_state.push_back(blob_count);
         blob_count = 1;
       } else {
         ++blob_count;
       }
     }
     word_res->best_state.push_back(blob_count);
     if (word_res->best_state.size() != target_text.size()) {
       word_res->best_state.clear();  // No good. Original segmentation bad size.
       return false;
     }
   }
   word_res->correct_text.clear();
   for (int i = 0; i < target_text.size(); ++i) {
     word_res->correct_text.push_back(
         STRING(unicharset.id_to_unichar(target_text[i])));
   }
   return true;
 }

 // Recursive search for the lowest-rated way of grouping the blobs from
 // choices_pos onwards so that they spell target_text from text_index
 // onwards.
 // choices[p][len - 1] holds the classifier results for the group of len
 // blobs that starts at blob p, and choices_length is the total number of
 // blobs. rating is the sum of the ratings of the groups already chosen,
 // and segmentation holds their lengths.
 // A group matches target_text[text_index] if one of its BLOB_CHOICEs has
 // that unichar id, or has a 1-1 entry in the dangerous ambigs table with
 // that id as its correct_ngram_id.
 // Whenever the blobs and the target text are used up together, the total
 // rating is compared with *best_rating, and *best_segmentation and
 // *best_rating are updated if best_segmentation is empty or the new total
 // is lower.
 // segmentation has the same contents on return as it had on entry.
 void Tesseract::SearchForText(const GenericVector<BLOB_CHOICE_LIST*>* choices,
                               int choices_pos, int choices_length,
                               const GenericVector<UNICHAR_ID>& target_text,
                               int text_index,
                               float rating, GenericVector<int>* segmentation,
                               float* best_rating,
                               GenericVector<int>* best_segmentation) {
   const UnicharAmbigsVector& table = getDict().getUnicharAmbigs().dang_ambigs();
   // Try each available group length starting at choices_pos.
   for (int length = 1; length <= choices[choices_pos].size(); ++length) {
     // Rating of matching choice or worst choice if no match.
     float choice_rating = 0.0f;
     // Find the corresponding best BLOB_CHOICE.
     BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
     for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
          choice_it.forward()) {
       BLOB_CHOICE* choice = choice_it.data();
       choice_rating = choice->rating();
       UNICHAR_ID class_id = choice->unichar_id();
       if (class_id == target_text[text_index]) {
         break;
       }
       // Search ambigs table.
       if (class_id < table.size() && table[class_id] != NULL) {
         AmbigSpec_IT spec_it(table[class_id]);
         for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
              spec_it.forward()) {
           const AmbigSpec *ambig_spec = spec_it.data();
           // We'll only do 1-1.
           if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
               ambig_spec->correct_ngram_id == target_text[text_index])
             break;
         }
         // Leaving the loop above before it cycled means that a spec matched.
         if (!spec_it.cycled_list())
           break;  // Found an ambig.
       }
     }
     // If the whole list of choices was scanned, nothing in it matched.
     if (choice_it.cycled_list())
       continue;  // No match.
     // Tentatively add this group; it is taken off again at the end of the
     // loop body.
     segmentation->push_back(length);
     if (choices_pos + length == choices_length &&
         text_index + 1 == target_text.size()) {
       // This is a complete match. If the rating is good record a new best.
       if (applybox_debug > 2) {
         tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
                 rating + choice_rating, *best_rating, segmentation->size(),
                 best_segmentation->size());
       }
       if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
         *best_segmentation = *segmentation;
         *best_rating = rating + choice_rating;
       }
     } else if (choices_pos + length < choices_length &&
                text_index + 1 < target_text.size()) {
       // Both blobs and target text remain, so carry on from the next blob
       // and the next target id.
       if (applybox_debug > 3) {
         tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
                 target_text[text_index],
                 unicharset.id_to_unichar(target_text[text_index]),
                 choice_it.data()->unichar_id() == target_text[text_index]
                      ? "Match" : "Ambig",
                 choices_pos, length);
       }
       SearchForText(choices, choices_pos + length, choices_length, target_text,
                     text_index + 1, rating + choice_rating, segmentation,
                     best_rating, best_segmentation);
       if (applybox_debug > 3) {
         tprintf("End recursion for %d=%s\n", target_text[text_index],
                 unicharset.id_to_unichar(target_text[text_index]));
       }
     }
     // Take this group off again before trying the next length.
     segmentation->truncate(segmentation->size() - 1);
   }
 }

 // Finishes off page_res after the boxes have been applied.
 // First pass: each word with at least one non-empty correct_text entry is
 // given a fake WERD_CHOICE, with one entry per correct_text entry, as both
 // its raw choice and its cooked choice. Words without any labelled blob
 // are deleted from page_res.
 // Second pass: for each remaining word the best state and box word are
 // rebuilt, and the W_BOL and W_EOL flags are set according to whether the
 // previous and next rows differ from the word's own row.
 // If applybox_debug > 0, a summary of the counts is printed.
 void Tesseract::TidyUp(PAGE_RES* page_res) {
   int ok_blob_count = 0;
   int bad_blob_count = 0;
   int ok_word_count = 0;
   int unlabelled_words = 0;
   PAGE_RES_IT pr_it(page_res);
   WERD_RES* word_res;
   for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
     int ok_in_word = 0;
     int blob_count = word_res->correct_text.size();
     WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
     word_choice->set_permuter(TOP_CHOICE_PERM);
     for (int c = 0; c < blob_count; ++c) {
       if (word_res->correct_text[c].length() > 0) {
         ++ok_in_word;
       }
       // Since we only need a fake word_res->best_choice, the actual
       // unichar_ids do not matter. Which is fortunate, since TidyUp()
       // can be called while training Tesseract, at the stage where
       // unicharset is not meaningful yet.
       word_choice->append_unichar_id_space_allocated(
           INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
     }
     if (ok_in_word > 0) {
       // Count the word as kept, so that the summary printed at the end
       // reports the real number of words instead of always reporting 0.
       ++ok_word_count;
       ok_blob_count += ok_in_word;
       bad_blob_count += word_res->correct_text.size() - ok_in_word;
       word_res->LogNewRawChoice(word_choice);
       word_res->LogNewCookedChoice(1, false, word_choice);
     } else {
       ++unlabelled_words;
       if (applybox_debug > 0) {
         tprintf("APPLY_BOXES: Unlabelled word at :");
         word_res->word->bounding_box().print();
       }
       pr_it.DeleteCurrentWord();
       // The choice was never handed over to the word, so free it here.
       delete word_choice;
     }
   }
   pr_it.restart_page();
   for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
     // Denormalize back to a BoxWord.
     word_res->RebuildBestState();
     word_res->SetupBoxWord();
     word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
     word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
   }
   if (applybox_debug > 0) {
     tprintf("   Found %d good blobs.\n", ok_blob_count);
     if (bad_blob_count > 0) {
       tprintf("   Leaving %d unlabelled blobs in %d words.\n",
               bad_blob_count, ok_word_count);
     }
     if (unlabelled_words > 0)
       tprintf("   %d remaining unlabelled words deleted.\n", unlabelled_words);
   }
 }

 // Prints err_msg for a box that could not be applied, together with its
 // line number in the box file (boxfile_lineno is 0-based and is printed
 // 1-based), its text box_ch, and the coordinates of box.
 void Tesseract::ReportFailedBox(int boxfile_lineno, TBOX box,
                                 const char *box_ch, const char *err_msg) {
   const int printed_lineno = boxfile_lineno + 1;
   const int left = box.left();
   const int bottom = box.bottom();
   const int right = box.right();
   const int top = box.top();
   tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
           printed_lineno, box_ch, left, bottom, right, top, err_msg);
 }

 // Gives every word in page_res a WERD_CHOICE made from its correct_text,
 // replacing any word choices that the word already had. The new choice is
 // logged as both the raw choice and the cooked choice. Each entry in the
 // choice takes its unichar id from the corresponding correct_text entry and
 // its blob count from the corresponding best_state entry.
 void Tesseract::CorrectClassifyWords(PAGE_RES* page_res) {
   PAGE_RES_IT pr_it(page_res);
   for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
        word_res = pr_it.forward()) {
     WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set,
                                           word_res->correct_text.size());
     for (int i = 0; i < word_res->correct_text.size(); ++i) {
       // The part before the first space is the real ground truth, and the
       // rest is the bounding box location and page number.
       GenericVector<STRING> tokens;
       word_res->correct_text[i].split(' ', &tokens);
       // An entry may yield no tokens at all, as TidyUp keeps words in which
       // some blobs are unlabelled. Such an entry gets INVALID_UNICHAR_ID,
       // as TidyUp uses for its fake choice, instead of indexing tokens[0].
       UNICHAR_ID char_id = INVALID_UNICHAR_ID;
       if (!tokens.empty())
         char_id = unicharset.unichar_to_id(tokens[0].string());
       choice->append_unichar_id_space_allocated(char_id,
                                                 word_res->best_state[i],
                                                 0.0f, 0.0f);
     }
     word_res->ClearWordChoices();
     word_res->LogNewRawChoice(choice);
     word_res->LogNewCookedChoice(1, false, choice);
   }
 }

 // Calls LearnWord with fontname on every word in page_res, then prints the
 // number of words processed.
 void Tesseract::ApplyBoxTraining(const STRING& fontname, PAGE_RES* page_res) {
   int num_words = 0;
   PAGE_RES_IT pr_it(page_res);
   WERD_RES* word_res = pr_it.word();
   while (word_res != NULL) {
     LearnWord(fontname.string(), word_res);
     ++num_words;
     word_res = pr_it.forward();
   }
   tprintf("Generated training data for %d words\n", num_words);
 }


 }  // namespace tesseract
tesseract::Tesseract::poly_allow_detailed_fx
bool poly_allow_detailed_fx
Definition: tesseractclass.h:1079

tesseract::AmbigSpec
Definition: ambigs.h:114

PAGE_RES_IT
Definition: pageres.h:659

tesseract::Tesseract::ReSegmentByClassification
void ReSegmentByClassification(PAGE_RES *page_res)
Definition: applybox.cpp:509

C_BLOB
Definition: stepblob.h:30

TBOX::intersection
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87

tesseract::BoxWord::BlobBox
const TBOX & BlobBox(int index) const
Definition: boxword.h:86

WERD_RES::RebuildBestState
void RebuildBestState()
Definition: pageres.cpp:800

pageres.h

tesseract::Classify::classify_bln_numeric_mode
bool classify_bln_numeric_mode
Definition: classify.h:499

genericvector.h

UNICHAR_ID
int UNICHAR_ID
Definition: unichar.h:33

WERD_RES::LogNewCookedChoice
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:612

WERD_RES::best_state
GenericVector< int > best_state
Definition: pageres.h:255

tesseract::Tesseract::SearchForText
void SearchForText(const GenericVector< BLOB_CHOICE_LIST *> *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
Definition: applybox.cpp:629

PDBLK::bounding_box
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59

boxread.h

tesseract::Classify::getDict
Dict & getDict()
Definition: classify.h:65

tesseract::Tesseract::ApplyBoxTraining
void ApplyBoxTraining(const STRING &fontname, PAGE_RES *page_res)
Definition: applybox.cpp:796

TBOX::area
inT32 area() const
Definition: rect.h:118

PAGE_RES_IT::prev_row
ROW_RES * prev_row() const
Definition: pageres.h:730

WERD_RES::blamer_bundle
BlamerBundle * blamer_bundle
Definition: pageres.h:230

tesseract::BoxWord::MergeBoxes
void MergeBoxes(int start, int end)
Definition: boxword.cpp:134

W_FUZZY_NON
Definition: werd.h:43

tesseractclass.h

GenericVector::remove
void remove(int index)
Definition: genericvector.h:754

unicharset.h

W_EOL
Definition: werd.h:36

WERD_RES::correct_text
GenericVector< STRING > correct_text
Definition: pageres.h:259

GenericVector::push_back
int push_back(T object)
Definition: genericvector.h:787

BLOB_CHOICE::rating
float rating() const
Definition: ratngs.h:79

WERD::set_flag
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129

tprintf
#define tprintf(...)
Definition: tprintf.h:31

tesseract::Tesseract::tessedit_ocr_engine_mode
int tessedit_ocr_engine_mode
Definition: tesseractclass.h:808

tesseract::BoxWord::bounding_box
const TBOX & bounding_box() const
Definition: boxword.h:82

BLOCK_RES::block
BLOCK * block
Definition: pageres.h:99

STRING::string
const char * string() const
Definition: strngs.cpp:198

WERD::set_text
void set_text(const char *new_text)
Definition: werd.h:126

ROW_RES::row
ROW * row
Definition: pageres.h:127

ROW::word_list
WERD_LIST * word_list()
Definition: ocrrow.h:52

GenericVector::empty
bool empty() const
Definition: genericvector.h:90

GenericVector::truncate
void truncate(int size)
Definition: genericvector.h:136

ROW::x_height
float x_height() const
Definition: ocrrow.h:61

tesseract::Tesseract::applybox_page
int applybox_page
Definition: tesseractclass.h:831

tesseract::Tesseract::textord_use_cjk_fp_model
bool textord_use_cjk_fp_model
Definition: tesseractclass.h:1077

IntCastRounded
int IntCastRounded(double x)
Definition: helpers.h:179

WERD_RES::CloneChoppedToRebuild
void CloneChoppedToRebuild()
Definition: pageres.cpp:828

kMaxGroupSize
const int kMaxGroupSize
Definition: applybox.cpp:40

GenericVector::size
int size() const
Definition: genericvector.h:72

tesseract
Definition: baseapi.cpp:82

WERD::bounding_box
TBOX bounding_box() const
Definition: werd.cpp:160

WERD_RES::box_word
tesseract::BoxWord * box_word
Definition: pageres.h:250

PAGE_RES_IT::row
ROW_RES * row() const
Definition: pageres.h:739

tesseract::Tesseract::TidyUp
void TidyUp(PAGE_RES *page_res)
Definition: applybox.cpp:706

ASSERT_HOST
#define ASSERT_HOST(x)
Definition: errcode.h:84

UNICHARSET::id_to_unichar
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266

TBOX::left
inT16 left() const
Definition: rect.h:68

PAGE_RES_IT::next_row
ROW_RES * next_row() const
Definition: pageres.h:748

GenericVector< TBOX >

BLOB_CHOICE
Definition: ratngs.h:48

kMaxXHeightDeviationFraction
const double kMaxXHeightDeviationFraction
Definition: applybox.cpp:43

PAGE_RES_IT::restart_page
WERD_RES * restart_page()
Definition: pageres.h:683

tesseract::Wordrec::assume_fixed_pitch_char_segment
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:161

WERD_RES::InsertSeam
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:410

WERD_RES::FakeClassifyWord
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:872

W_FUZZY_SP
Definition: werd.h:42

GenericVector::insert
void insert(T t, int index)
Definition: genericvector.h:740

PAGE_RES_IT::forward
WERD_RES * forward()
Definition: pageres.h:716

tesseract::UnicharAmbigs::dang_ambigs
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:154

tesseract::Tesseract::ReportFailedBox
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
Definition: applybox.cpp:764

SEAM
Definition: seam.h:44

STRING
Definition: strngs.h:45

tesseract::Tesseract::BestPix
Pix * BestPix() const
Definition: tesseractclass.h:216

BLOB_CHOICE::set_certainty
void set_certainty(float newrat)
Definition: ratngs.h:150

unichar.h

TOP_CHOICE_PERM
Definition: ratngs.h:243

WERD::text
const char * text() const
Definition: werd.h:125

tesseract::Tesseract::applybox_debug
int applybox_debug
Definition: tesseractclass.h:830

tesseract::Tesseract::ResegmentWordBox
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text)
Definition: applybox.cpp:438

tesseract::AmbigSpec::wrong_ngram
UNICHAR_ID wrong_ngram[MAX_AMBIG_SIZE+1]
Definition: ambigs.h:133

WERD_RES
Definition: pageres.h:155

tesseract::CCUtil::unicharset
UNICHARSET unicharset
Definition: ccutil.h:68

TBOX::major_overlap
bool major_overlap(const TBOX &box) const
Definition: rect.h:358

tesseract::Dict::getUnicharAmbigs
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:103

PAGE_RES_IT::DeleteCurrentWord
void DeleteCurrentWord()
Definition: pageres.cpp:1451

W_BOL
Definition: werd.h:35

ROW::bounding_box
TBOX bounding_box() const
Definition: ocrrow.h:85

GenericVector::length
int length() const
Definition: genericvector.h:85

STATS::add
void add(inT32 value, inT32 count)
Definition: statistc.cpp:101

TWERD::NumBlobs
int NumBlobs() const
Definition: blobs.h:425

GenericVector::clear
void clear()
Definition: genericvector.h:856

TBOX::top
inT16 top() const
Definition: rect.h:54

WERD_RES::SetupBoxWord
void SetupBoxWord()
Definition: pageres.cpp:843

TBOX::almost_equal
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:258

WERD::shallow_copy
WERD * shallow_copy()
Definition: werd.cpp:352

TBOX
Definition: rect.h:30

TWERD::blobs
GenericVector< TBLOB * > blobs
Definition: blobs.h:436

WERD_CHOICE::append_unichar_id_space_allocated
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:450

ReadAllBoxes
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:50

C_BLOB::bounding_box
TBOX bounding_box() const
Definition: stepblob.cpp:250

WERD::cblob_list
C_BLOB_LIST * cblob_list()
Definition: werd.h:100

WERD_RES::word
WERD * word
Definition: pageres.h:175

TBOX::right
inT16 right() const
Definition: rect.h:75

tesseract::Wordrec::classify_piece
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM *> &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:56

tesseract::Wordrec::chop_one_blob
SEAM * chop_one_blob(const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE *> &blob_choices, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:373

tesseract::Tesseract::PreenXHeights
void PreenXHeights(BLOCK_LIST *block_list)
Definition: applybox.cpp:193

BLOB_CHOICE::unichar_id
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76

chopper.h

STATS
Definition: statistc.h:33

PAGE_RES
Definition: pageres.h:58

TBOX::print
void print() const
Definition: rect.h:270

WERD_RES::SetupForRecognition
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:294

MAX_INT8
#define MAX_INT8
Definition: host.h:60

tesseract::Tesseract::SetupApplyBoxes
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:217

PAGE_RES_IT::word
WERD_RES * word() const
Definition: pageres.h:736

TBOX::bottom
inT16 bottom() const
Definition: rect.h:61

tesseract::Tesseract::ApplyBoxes
PAGE_RES * ApplyBoxes(const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
Definition: applybox.cpp:117

ROW::set_x_height
void set_x_height(float new_xheight)
Definition: ocrrow.h:64

WERD_CHOICE
Definition: ratngs.h:271

tesseract::Tesseract::ResegmentCharBox
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX &next_box, const char *correct_text)
Definition: applybox.cpp:340

BLOB_CHOICE::set_rating
void set_rating(float newrat)
Definition: ratngs.h:147

WERD_RES::LogNewRawChoice
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:596

tesseract::AmbigSpec::correct_ngram_id
UNICHAR_ID correct_ngram_id
Definition: ambigs.h:135

print_ratings_list
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:819

WERD_RES::uch_set
const UNICHARSET * uch_set
Definition: pageres.h:192

tesseract::Classify::LearnWord
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:244

UNICHARSET::unichar_to_id
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

tesseract::Tesseract::MaximallyChopWord
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: applybox.cpp:253

WERD
Definition: werd.h:60

WERD_RES::chopped_word
TWERD * chopped_word
Definition: pageres.h:201

tesseract::Tesseract::CorrectClassifyWords
void CorrectClassifyWords(PAGE_RES *page_res)
Definition: applybox.cpp:772

WERD_CHOICE::set_permuter
void set_permuter(uinT8 perm)
Definition: ratngs.h:373

ROW
Definition: ocrrow.h:32

BLOCK::row_list
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:120

BLOCK
Definition: ocrblock.h:30

tesseract::Tesseract::FindSegmentation
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
Definition: applybox.cpp:559

BCC_FAKE
Definition: ratngs.h:45

tesseract::Tesseract::ConvertStringToUnichars
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
Definition: applybox.cpp:535

tesseract::Wordrec::chop_debug
int chop_debug
Definition: wordrec.h:139

PAGE_RES_IT::block
BLOCK_RES * block() const
Definition: pageres.h:742

tesseract::BoxWord::length
int length() const
Definition: boxword.h:85

WERD_RES::seam_array
GenericVector< SEAM * > seam_array
Definition: pageres.h:203

TBOX::x_gap
int x_gap(const TBOX &box) const
Definition: rect.h:217

SEAM::HasAnySplits
bool HasAnySplits() const
Definition: seam.h:67