tesseract/a00695_source.html

 // Copyright 2011 Google Inc. All Rights Reserved.
 // Author: rays@google.com (Ray Smith)
 //
 // Licensed under the Apache License, Version 2.0 (the "License");
 // you may not use this file except in compliance with the License.
 // You may obtain a copy of the License at
 // http://www.apache.org/licenses/LICENSE-2.0
 // Unless required by applicable law or agreed to in writing, software
 // distributed under the License is distributed on an "AS IS" BASIS,
 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 // See the License for the specific language governing permissions and
 // limitations under the License.
 //

 #ifndef THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_
 #define THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_

 #include "genericvector.h"
 #include "matrix.h"
 #include "statistc.h"

 // Forward declarations at global scope (outside namespace tesseract).
 // Pix is only referred to through pointers in this header
 // (GenericVector<Pix*>), so its definition is not required here.
 struct Pix;
 // NOTE(review): UnicityTable is not referenced in the visible part of this
 // header - confirm whether this forward declaration is still needed.
 template <typename T> class UnicityTable;

 namespace tesseract {

 // Forward declarations of tesseract types. In this header they appear only
 // behind pointers or references (FontInfoTable, SampleIterator,
 // ShapeClassifier, TrainingSample) or as a template argument of a vector
 // passed by const reference (UnicharRating).
 // NOTE(review): FontInfo is not referenced in the visible part of this
 // header - confirm whether this forward declaration is still needed.
 struct FontInfo;
 class FontInfoTable;
 class SampleIterator;
 class ShapeClassifier;
 class TrainingSample;
 struct UnicharRating;

 // Enumeration of the different types of error count.
 // Error counts work as follows:
 //
 // Ground truth is a valid unichar-id / font-id pair:
 //        Number of classifier answers?
 //          0                       >0
 //     CT_REJECT          unichar-id matches top shape?
 //     __________             yes!                      no
 //                   CT_UNICHAR_TOP_OK           CT_UNICHAR_TOP1_ERR
 //      Top shape-id has multiple unichars?   2nd shape unichar id matches?
 //            yes!              no              yes!              no
 //      CT_OK_MULTI_UNICHAR     |              _____    CT_UNICHAR_TOP2_ERR
 //             Font attributes match?                 Any unichar-id matches?
 //              yes!              no                  yes!        no
 //      CT_FONT_ATTR_OK   CT_FONT_ATTR_ERR          ______  CT_UNICHAR_TOPN_ERR
 //                |       __________________                 _________________
 //      Top shape-id has multiple font attrs?
 //            yes!              no
 //      CT_OK_MULTI_FONT
 //      _____________________________
 //
 // Note that multiple counts may be activated for a single sample!
 //
 // Ground truth is for a fragment/n-gram that is NOT in the unicharset.
 // This is called junk and is expected to be rejected:
 //        Number of classifier answers?
 //          0                       >0
 //     CT_REJECTED_JUNK     CT_ACCEPTED_JUNK
 //
 // Also, CT_NUM_RESULTS stores the mean number of results, and CT_RANK stores
 // the mean rank of the correct result, counting from 0, and with an error
 // receiving the number of answers as the correct rank.
 //
 // Keep in sync with the ReportString function.
 enum CountTypes {
   // Top shape contains correct unichar id.
   CT_UNICHAR_TOP_OK = 0,

   // The rank of the results in TOP1, TOP2, TOPN is determined by a gap of
   // kRatingEpsilon from the first result in each group. The real top choice
   // is measured using TOPTOP.

   // Top shape does not contain correct unichar id.
   CT_UNICHAR_TOP1_ERR,
   // Top 2 shapes don't contain correct unichar id.
   CT_UNICHAR_TOP2_ERR,
   // No output shape contains correct unichar id.
   CT_UNICHAR_TOPN_ERR,
   // Very top choice not correct.
   CT_UNICHAR_TOPTOP_ERR,

   // Top shape id has correct unichar id, and others.
   CT_OK_MULTI_UNICHAR,
   // Top shape id is correct but marked joined.
   CT_OK_JOINED,
   // Top shape id is correct but marked broken.
   CT_OK_BROKEN,

   // Classifier hates this: it produced no answers for a valid sample.
   CT_REJECT,

   // Top unichar OK, but font attributes incorrect.
   CT_FONT_ATTR_ERR,
   // Font attributes matched, but the top shape has multiple font attrs.
   CT_OK_MULTI_FONT,

   // Number of answers produced.
   CT_NUM_RESULTS,
   // Rank of correct answer.
   CT_RANK,

   // Junk that was correctly rejected.
   CT_REJECTED_JUNK,
   // Junk that was incorrectly classified otherwise.
   CT_ACCEPTED_JUNK,

   // Number of types for array sizing. Must remain the last enumerator.
   CT_SIZE
 };

 // Class to encapsulate all the functionality and sub-structures required
 // to count errors for an isolated character classifier (ShapeClassifier).
 // The public interface consists solely of the static functions
 // ComputeErrorRate and DebugNewErrors; the constructor is private.
 class ErrorCounter {
  public:
   // Computes and returns the unweighted boosting_mode error rate of the given
   // classifier. Can be used for testing, or inside an iterative training
   // system, including one that uses boosting.
   // report_levels:
   // 0 = no output.
   // 1 = bottom-line error rate.
   // 2 = bottom-line error rate + time.
   // 3 = font-level error rate + time.
   // 4 = list of all errors + short classifier debug output on 16 errors.
   // 5 = list of all errors + short classifier debug output on 25 errors.
   // * The boosting_mode determines which error type is used for computing the
   //   scaled_error output, and setting the is_error flag in the samples.
   // * The fontinfo_table is used to get string font names for the debug
   //   output, and also to count font attributes errors.
   // * The page_images vector may contain a Pix* (which may be NULL) for each
   //   page index assigned to the samples.
   // * The it provides encapsulated iteration over some sample set.
   // * The outputs unichar_error, scaled_error and fonts_report are all
   //   optional.
   // * If not NULL, unichar_error gets the top1 unichar error rate.
   // * If not NULL, scaled_error gets the error chosen by boosting_mode
   //   weighted by the weights on the samples.
   // * If not NULL, fonts_report gets a string summarizing the error rates for
   //   each font in both human-readable form and as a tab-separated list of
   //   error counts. The human-readable form is all before the first tab.
   // * The return value is the un-weighted version of the scaled_error.
   static double ComputeErrorRate(ShapeClassifier* classifier,
                                  int report_level, CountTypes boosting_mode,
                                  const FontInfoTable& fontinfo_table,
                                  const GenericVector<Pix*>& page_images,
                                  SampleIterator* it,
                                  double* unichar_error,
                                  double* scaled_error,
                                  STRING* fonts_report);
   // Tests a pair of classifiers, debugging errors of the new against the old.
   // See ComputeErrorRate above for a description of the shared arguments
   // (boosting_mode, fontinfo_table, page_images, it).
   // Iterates over the samples, calling the classifiers in normal/silent mode.
   // If the new_classifier makes a boosting_mode error that the old_classifier
   // does not, it will then call the new_classifier again with a debug flag
   // and a keep_this argument to find out what is going on.
   static void DebugNewErrors(ShapeClassifier* new_classifier,
                              ShapeClassifier* old_classifier,
                              CountTypes boosting_mode,
                              const FontInfoTable& fontinfo_table,
                              const GenericVector<Pix*>& page_images,
                              SampleIterator* it);

  private:
   // Simple struct to hold an array of counts, with one slot for each
   // CountTypes value (the array is sized by CT_SIZE).
   struct Counts {
     Counts();
     // Adds other into this for computing totals.
     void operator+=(const Counts& other);

     // The counts themselves; the array has CT_SIZE entries.
     int n[CT_SIZE];
   };

   // Constructor is private. Only anticipated use of ErrorCounter is via
   // the static ComputeErrorRate.
   // The unicharset is held by reference in unicharset_ (see below).
   // NOTE(review): fontsize is presumably the number of fonts used to size
   // font_counts_ - confirm against errorcounter.cpp.
   ErrorCounter(const UNICHARSET& unicharset, int fontsize);
   ~ErrorCounter();

   // Accumulates the errors from the classifier results on a single sample.
   // Returns true if debug is true and a CT_UNICHAR_TOPN_ERR error occurred.
   // boosting_mode selects the type of error to be used for boosting and the
   // is_error_ member of sample is set according to whether the required type
   // of error occurred. The font_table provides access to font properties
   // for error counting.
   // NOTE(review): earlier wording of this comment also described a
   // shape_table argument relating unichar_ids to shape_ids in the results,
   // but no such parameter exists in this signature.
   bool AccumulateErrors(bool debug, CountTypes boosting_mode,
                         const FontInfoTable& font_table,
                         const GenericVector<UnicharRating>& results,
                         TrainingSample* sample);

   // Accumulates counts for junk. Counts only whether the junk was correctly
   // rejected or not.
   // NOTE(review): the meaning of the bool return value is not documented
   // here - confirm against errorcounter.cpp.
   bool AccumulateJunk(bool debug, const GenericVector<UnicharRating>& results,
                       TrainingSample* sample);

   // Creates a report of the error rate. The report_level controls the detail
   // that is reported to stderr via tprintf:
   // 0   -> no output.
   // >=1 -> bottom-line error rate.
   // >=3 -> font-level error rate.
   // boosting_mode determines the return value. It selects which (un-weighted)
   // error rate to return.
   // The fontinfo_table from MasterTrainer provides the names of fonts.
   // The it determines the current subset of the training samples.
   // If not NULL, the top-choice unichar error rate is saved in unichar_error.
   // If not NULL, the report string is saved in fonts_report.
   // (Ignoring report_level).
   double ReportErrors(int report_level, CountTypes boosting_mode,
                       const FontInfoTable& fontinfo_table,
                       const SampleIterator& it,
                       double* unichar_error,
                       STRING* fonts_report);

   // Sets the report string to a combined human and machine-readable report
   // string of the error rates.
   // Returns false if there is no data, leaving report unchanged, unless
   // even_if_empty is true.
   // The layout of the report must be kept in sync with the CountTypes enum.
   static bool ReportString(bool even_if_empty, const Counts& counts,
                            STRING* report);

   // Computes the error rates and returns in rates which is an array of size
   // CT_SIZE. Returns false if there is no data, leaving rates unchanged.
   static bool ComputeRates(const Counts& counts, double rates[CT_SIZE]);


   // Total scaled error used by boosting algorithms.
   double scaled_error_;
   // Difference in result rating to be thought of as an "equal" choice.
   double rating_epsilon_;
   // Vector indexed by font_id from the samples of error accumulators.
   GenericVector<Counts> font_counts_;
   // Counts of the results that map each unichar_id (from samples) to an
   // incorrect shape_id.
   GENERIC_2D_ARRAY<int> unichar_counts_;
   // Count of the number of times each shape_id occurs, is correct, and multi-
   // unichar.
   GenericVector<int> multi_unichar_counts_;
   // Histogram of scores (as percent) for correct answers.
   STATS ok_score_hist_;
   // Histogram of scores (as percent) for incorrect answers.
   STATS bad_score_hist_;
   // Unicharset for printing character ids in results. Stored as a reference,
   // so the referenced UNICHARSET must outlive this ErrorCounter.
   const UNICHARSET& unicharset_;
 };

 }  // namespace tesseract.

 #endif /* THIRD_PARTY_TESSERACT_CLASSIFY_ERRORCOUNTER_H_ */
tesseract::ErrorCounter::ComputeErrorRate
static double ComputeErrorRate(ShapeClassifier *classifier, int report_level, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix *> &page_images, SampleIterator *it, double *unichar_error, double *scaled_error, STRING *fonts_report)
Definition: errorcounter.cpp:39

tesseract::CT_UNICHAR_TOPTOP_ERR
Definition: errorcounter.h:77

UNICHARSET
Definition: unicharset.h:139

tesseract::CT_UNICHAR_TOP1_ERR
Definition: errorcounter.h:74

tesseract::CT_UNICHAR_TOP2_ERR
Definition: errorcounter.h:75

tesseract::CT_RANK
Definition: errorcounter.h:85

tesseract::CT_SIZE
Definition: errorcounter.h:89

genericvector.h

tesseract::CT_UNICHAR_TOPN_ERR
Definition: errorcounter.h:76

tesseract::CT_OK_BROKEN
Definition: errorcounter.h:80

tesseract::CT_ACCEPTED_JUNK
Definition: errorcounter.h:87

tesseract::ErrorCounter::DebugNewErrors
static void DebugNewErrors(ShapeClassifier *new_classifier, ShapeClassifier *old_classifier, CountTypes boosting_mode, const FontInfoTable &fontinfo_table, const GenericVector< Pix *> &page_images, SampleIterator *it)
Definition: errorcounter.cpp:106

tesseract::CountTypes
CountTypes
Definition: errorcounter.h:69

tesseract::ShapeClassifier
Definition: shapeclassifier.h:43

tesseract
Definition: baseapi.cpp:82

GenericVector
Definition: baseapi.h:42

tesseract::CT_REJECT
Definition: errorcounter.h:81

tesseract::CT_OK_MULTI_FONT
Definition: errorcounter.h:83

STRING
Definition: strngs.h:45

tesseract::CT_OK_MULTI_UNICHAR
Definition: errorcounter.h:78

GENERIC_2D_ARRAY< int >

operator+=
ICOORD & operator+=(ICOORD &op1, const ICOORD &op2)
Definition: ipoints.h:86

UnicityTable
Definition: fontinfo.h:28

tesseract::SampleIterator
Definition: sampleiterator.h:92

tesseract::CT_NUM_RESULTS
Definition: errorcounter.h:84

tesseract::CT_OK_JOINED
Definition: errorcounter.h:79

tesseract::FontInfoTable
Definition: fontinfo.h:146

tesseract::CT_UNICHAR_TOP_OK
Definition: errorcounter.h:70

STATS
Definition: statistc.h:33

statistc.h

tesseract::CT_REJECTED_JUNK
Definition: errorcounter.h:86

tesseract::ErrorCounter
Definition: errorcounter.h:94

sample
Definition: cluster.h:32

matrix.h

tesseract::CT_FONT_ATTR_ERR
Definition: errorcounter.h:82

tesseract::TrainingSample
Definition: trainingsample.h:53