tesseract/a00128_source.html

 /**********************************************************************
  * File:        paragraphs_internal.h
  * Description: Paragraph Detection internal data structures.
  * Author:      David Eger
  * Created:     11 March 2011
  *
  * (C) Copyright 2011, Google Inc.
  ** Licensed under the Apache License, Version 2.0 (the "License");
  ** you may not use this file except in compliance with the License.
  ** You may obtain a copy of the License at
  ** http://www.apache.org/licenses/LICENSE-2.0
  ** Unless required by applicable law or agreed to in writing, software
  ** distributed under the License is distributed on an "AS IS" BASIS,
  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  ** See the License for the specific language governing permissions and
  ** limitations under the License.
  *
  **********************************************************************/

 #ifndef TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
 #define TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_

 #include "paragraphs.h"
 #ifdef _MSC_VER
 #include <string>
 #else
 #include "strings.h"
 #endif

 // NO CODE OUTSIDE OF paragraphs.cpp AND TESTS SHOULD NEED TO ACCESS
 // DATA STRUCTURES OR FUNCTIONS IN THIS FILE.

 class WERD_CHOICE;

 namespace tesseract {

 // Return whether the given word is likely to be a list item start word.
 //   word: text of the candidate word.
 // NOTE(review): the name suggests only ASCII list markers are considered;
 // confirm against the definition in paragraphs.cpp.
 bool AsciiLikelyListItem(const STRING &word);

 // Return the first Unicode Codepoint from werd[pos].
 //   u:    unicharset supplied alongside werd (presumably the one werd's
 //         unichar ids refer to -- confirm in paragraphs.cpp).
 //   werd: word choice to read from.
 //   pos:  index into werd of the position of interest.
 int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos);

 // Set right word attributes given either a unicharset and werd or a utf8
 // string.
 //   unicharset, werd: one possible source of the word's text.
 //   utf8:             the other possible source of the word's text.
 //   is_list, starts_idea, ends_idea: flags passed by pointer; per the
 //                     summary above these are the attributes being set.
 // NOTE(review): which source is used when both are supplied is decided in
 // paragraphs.cpp and cannot be told from this header.
 void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
                          const STRING &utf8,
                          bool *is_list, bool *starts_idea, bool *ends_idea);

 // Set left word attributes given either a unicharset and werd or a utf8 string.
 //   unicharset, werd: one possible source of the word's text.
 //   utf8:             the other possible source of the word's text.
 //   is_list, starts_idea, ends_idea: flags passed by pointer; per the
 //                     summary above these are the attributes being set.
 // NOTE(review): which source is used when both are supplied is decided in
 // paragraphs.cpp and cannot be told from this header.
 void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd,
                         const STRING &utf8,
                         bool *is_list, bool *starts_idea, bool *ends_idea);

 // Classification of a text line with respect to paragraph structure.
 // Each enumerator's value is a mnemonic ASCII letter.
 enum LineType {
   // First line of a paragraph.
   LT_START = 'S',
   // Continuation line of a paragraph.
   LT_BODY = 'C',
   // No clues.
   LT_UNKNOWN = 'U',
   // Matches for both LT_START and LT_BODY.
   LT_MULTIPLE = 'M'
 };

 // The first paragraph in a page of body text is often un-indented.
 // This is a typographic convention which is common to indicate either that:
 // (1) The paragraph is the continuation of a previous paragraph, or
 // (2) The paragraph is the first paragraph in a chapter.
 //
 // I refer to such paragraphs as "crown"s, and the output of the paragraph
 // detection algorithm attempts to give them the same paragraph model as
 // the rest of the body text.
 //
 // Nonetheless, while building hypotheses, it is useful to mark the lines
 // of crown paragraphs temporarily as crowns, either aligned left or right.
 //
 // The two pointers below are those temporary markers.  They are only
 // declared here; code in this header compares against them by pointer
 // identity.
 extern const ParagraphModel *kCrownLeft;
 extern const ParagraphModel *kCrownRight;

 // A model is "strong" when it is an actual paragraph model: neither absent
 // (NULL) nor one of the two crown markers (kCrownLeft / kCrownRight).
 inline bool StrongModel(const ParagraphModel *model) {
   if (model == NULL) return false;
   const bool is_crown_marker = (model == kCrownLeft) || (model == kCrownRight);
   return !is_crown_marker;
 }

 // One hypothesis about a text line: the role the line plays (ty) and the
 // paragraph model that role is attributed to (model).
 //
 // Copy construction and copy assignment are deliberately left to the
 // compiler.  A hand-written copy constructor that just copies both fields
 // adds nothing, and declaring one makes the implicitly generated
 // copy-assignment operator deprecated (-Wdeprecated-copy on newer compilers).
 struct LineHypothesis {
   // An unclassified line with no model attached.
   LineHypothesis() : ty(LT_UNKNOWN), model(NULL) {}
   // A line of type line_type attributed to model m (m may be NULL).
   LineHypothesis(LineType line_type, const ParagraphModel *m)
       : ty(line_type), model(m) {}

   // Two hypotheses are equal when both the line type and the model pointer
   // match; models are compared by pointer identity, not by value.
   bool operator==(const LineHypothesis &other) const {
     return ty == other.ty && model == other.model;
   }

   LineType ty;
   const ParagraphModel *model;  // Never deleted by this struct.
 };

 class ParagraphTheory;  // Forward Declaration

 // A collection of (non-owning, const) ParagraphModel pointers used as a set.
 // NOTE(review): uniqueness is presumably maintained by callers through
 // GenericVectorEqEq's equality-based helpers -- confirm in its definition.
 typedef GenericVectorEqEq<const ParagraphModel *> SetOfModels;

 // Row Scratch Registers are data generated by the paragraph detection
 // algorithm based on a RowInfo input.
 class RowScratchRegisters {
  public:
   // We presume row will outlive us.
   void Init(const RowInfo &row);

   LineType GetLineType() const;

   LineType GetLineType(const ParagraphModel *model) const;

   // Mark this as a start line type, sans model.  This is useful for the
   // initial marking of probable body lines or paragraph start lines.
   void SetStartLine();

   // Mark this as a body line type, sans model.  This is useful for the
   // initial marking of probably body lines or paragraph start lines.
   void SetBodyLine();

   // Record that this row fits as a paragraph start line in the given model,
   void AddStartLine(const ParagraphModel *model);
   // Record that this row fits as a paragraph body line in the given model,
   void AddBodyLine(const ParagraphModel *model);

   // Clear all hypotheses about this line.
   void SetUnknown() { hypotheses_.truncate(0); }

   // Append all hypotheses of strong models that match this row as a start.
   void StartHypotheses(SetOfModels *models) const;

   // Append all hypotheses of strong models matching this row.
   void StrongHypotheses(SetOfModels *models) const;

   // Append all hypotheses for this row.
   void NonNullHypotheses(SetOfModels *models) const;

   // Discard any hypotheses whose model is not in the given list.
   void DiscardNonMatchingHypotheses(const SetOfModels &models);

   // If we have only one hypothesis and that is that this line is a paragraph
   // start line of a certain model, return that model.  Else return NULL.
   const ParagraphModel *UniqueStartHypothesis() const;

   // If we have only one hypothesis and that is that this line is a paragraph
   // body line of a certain model, return that model.  Else return NULL.
   const ParagraphModel *UniqueBodyHypothesis() const;

   // Return the indentation for the side opposite of the aligned side.
   int OffsideIndent(tesseract::ParagraphJustification just) const {
     switch (just) {
       case tesseract::JUSTIFICATION_RIGHT: return lindent_;
       case tesseract::JUSTIFICATION_LEFT: return rindent_;
       default: return lindent_ > rindent_ ? lindent_ : rindent_;
     }
   }

   // Return the indentation for the side the text is aligned to.
   int AlignsideIndent(tesseract::ParagraphJustification just) const {
     switch (just) {
       case tesseract::JUSTIFICATION_RIGHT: return rindent_;
       case tesseract::JUSTIFICATION_LEFT: return lindent_;
       default: return lindent_ > rindent_ ? lindent_ : rindent_;
     }
   }

   // Append header fields to a vector of row headings.
   static void AppendDebugHeaderFields(GenericVector<STRING> *header);

   // Append data for this row to a vector of debug strings.
   void AppendDebugInfo(const ParagraphTheory &theory,
                        GenericVector<STRING> *dbg) const;

   const RowInfo *ri_;

   // These four constants form a horizontal box model for the white space
   // on the edges of each line.  At each point in the algorithm, the following
   // shall hold:
   //   ri_->pix_ldistance = lmargin_ + lindent_
   //   ri_->pix_rdistance = rindent_ + rmargin_
   int lmargin_;
   int lindent_;
   int rindent_;
   int rmargin_;

  private:
   // Hypotheses of either LT_START or LT_BODY
   GenericVectorEqEq<LineHypothesis> hypotheses_;
 };

 // A collection of convenience functions for wrapping the set of
 // Paragraph Models we believe correctly model the paragraphs in the image.
 class ParagraphTheory {
  public:
   // We presume models will outlive us, and that models will take ownership
   // of any ParagraphModel *'s we add.
   // models must not be NULL: the models() accessors dereference it.
   explicit ParagraphTheory(GenericVector<ParagraphModel *> *models)
       : models_(models) {}
   // Access the wrapped model vector (mutable and read-only flavours).
   GenericVector<ParagraphModel *> &models() { return *models_; }
   const GenericVector<ParagraphModel *> &models() const { return *models_; }

   // Return an existing model if one that is Comparable() can be found.
   // Else, allocate a new copy of model to save and return a pointer to it.
   const ParagraphModel *AddModel(const ParagraphModel &model);

   // Discard any models we've made that are not in the list of used models.
   void DiscardUnusedModels(const SetOfModels &used_models);

   // Output the set of all non-centered models through *models.
   void NonCenteredModels(SetOfModels *models);

   // If any of the non-centered paragraph models we know about fit
   // rows[start, end), return it.  Else NULL.
   const ParagraphModel *Fits(const GenericVector<RowScratchRegisters> *rows,
                              int start, int end) const;

   // NOTE(review): presumably the position of model within models(); the
   // value returned for an unknown model is not visible from this header.
   int IndexOf(const ParagraphModel *model) const;

  private:
   // The caller-provided model vector; not owned by this class (see the
   // constructor comment).
   GenericVector<ParagraphModel *> *models_;
   // NOTE(review): presumably the models created by AddModel(), which
   // DiscardUnusedModels() may later drop -- confirm in paragraphs.cpp.
   GenericVectorEqEq<ParagraphModel *> models_we_added_;
 };

 // Row-versus-model consistency checks.  rows points to the vector of row
 // scratch registers; row, a and b are passed as int positions.
 // NOTE(review): the exact criteria live in paragraphs.cpp and are not visible
 // from this header; the one-line summaries below are inferred from the names
 // and should be confirmed there.
 //
 // Presumably: is the given row acceptable as a first line under model?
 bool ValidFirstLine(const GenericVector<RowScratchRegisters> *rows,
                     int row, const ParagraphModel *model);
 // Presumably: is the given row acceptable as a body line under model?
 bool ValidBodyLine(const GenericVector<RowScratchRegisters> *rows,
                    int row, const ParagraphModel *model);
 // Presumably: can rows a and b coexist in a "crown" paragraph under model?
 bool CrownCompatible(const GenericVector<RowScratchRegisters> *rows,
                      int a, int b, const ParagraphModel *model);

 // A class for smearing Paragraph Model hypotheses to surrounding rows.
 // The idea here is that StrongEvidenceClassify first marks only exceedingly
 // obvious start and body rows and constructs models of them.  Thereafter,
 // we may have left over unmarked lines (mostly end-of-paragraph lines) which
 // were too short to have much confidence about, but which fit the models we've
 // constructed perfectly and which we ought to mark.  This class is used to
 // "smear" our models over the text.
 class ParagraphModelSmearer {
  public:
   // rows, row_start, row_end and theory correspond to the members rows_,
   // row_start_, row_end_ and theory_ below; both pointers are held as raw
   // pointers.
   ParagraphModelSmearer(GenericVector<RowScratchRegisters> *rows,
                         int row_start, int row_end,
                         ParagraphTheory *theory);

   // Smear forward paragraph models from existing row markings to subsequent
   // text lines if they fit, and mark any thereafter still unmodeled rows
   // with any model in the theory that fits them.
   void Smear();

  private:
   // Record in open_models_ for rows [row_start, row_end) the list of models
   // currently open at each row.
   // A model is still open in a row if some previous row has said model as a
   // start hypothesis, and all rows since (including this row) would fit as
   // either a body or start line in that model.
   void CalculateOpenModels(int row_start, int row_end);

   // Map a row index to its entry in open_models_.  The + 1 is there because
   // entry 0 belongs to row (row_start_ - 1), per the layout described on
   // open_models_ below.
   SetOfModels &OpenModels(int row) {
     return open_models_[row - row_start_ + 1];
   }

   ParagraphTheory *theory_;
   GenericVector<RowScratchRegisters> *rows_;
   int row_start_;
   int row_end_;

   // open_models_ corresponds to rows[row_start_ - 1, row_end_]
   //
   // open_models_:  Contains models which there was an active (open) paragraph
   //                as of the previous line and for which the left and right
   //                indents admit the possibility that this text line continues
   //                to fit the same model.
   // TODO(eger): Think about whether we can get rid of "Open" models and just
   //   use the current hypotheses on RowScratchRegisters.
   GenericVector<SetOfModels> open_models_;
 };

 // Clear all hypotheses about lines [start, end) and reset the margins to the
 // percentile (0..100) value of the left and right row edges for this run of
 // rows.
 //   rows:       row scratch registers, modified in place.
 //   start, end: half-open range of row indices to process.
 //   percentile: which percentile (0..100) of the row edges to use.
 void RecomputeMarginsAndClearHypotheses(
     GenericVector<RowScratchRegisters> *rows, int start, int end,
     int percentile);

 // Return the median inter-word space in rows[row_start, row_end).
 //   rows:               row scratch registers to examine (read-only).
 //   row_start, row_end: half-open range of row indices to consider.
 // NOTE(review): the unit of the result (presumably pixels) and the result for
 // an empty range are not visible from this header.
 int InterwordSpace(const GenericVector<RowScratchRegisters> &rows,
                    int row_start, int row_end);

 // Return whether the first word on the after line can fit in the space at
 // the end of the before line (knowing which way the text is aligned and read).
 //   before, after: the two rows being compared, in reading order.
 //   justification: the known alignment of the text.
 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
                            const RowScratchRegisters &after,
                            tesseract::ParagraphJustification justification);

 // Return whether the first word on the after line can fit in the space at
 // the end of the before line (not knowing the text alignment).
 //   before, after: the two rows being compared, in reading order.
 bool FirstWordWouldHaveFit(const RowScratchRegisters &before,
                            const RowScratchRegisters &after);

 // Do rows[start, end) form a single instance of the given paragraph model?
 //   rows:       row scratch registers to examine.
 //   start, end: half-open range of row indices forming the candidate
 //               paragraph.
 //   model:      the paragraph model to test the rows against.
 bool RowsFitModel(const GenericVector<RowScratchRegisters> *rows,
                   int start, int end, const ParagraphModel *model);

 // Do the text and geometry of two rows support a paragraph break between them?
 //   before, after: the two rows on either side of the candidate break.
 //   j:             the text justification to evaluate the rows under.
 bool LikelyParagraphStart(const RowScratchRegisters &before,
                           const RowScratchRegisters &after,
                           tesseract::ParagraphJustification j);

 // Given a set of row_owners pointing to PARAs or NULL (no paragraph known),
 // normalize each row_owner to point to an actual PARA, and output the
 // paragraphs in order onto paragraphs.
 //   row_owners: per-row paragraph pointers; modified in place so that no
 //               entry is left NULL.
 //   paragraphs: list that receives the resulting paragraphs.
 // NOTE(review): ownership of any PARA created here is not visible from this
 // header -- confirm in paragraphs.cpp.
 void CanonicalizeDetectionResults(
     GenericVector<PARA *> *row_owners,
     PARA_LIST *paragraphs);

 }  // namespace tesseract
 #endif  // TESSERACT_CCMAIN_PARAGRAPHS_INTERNAL_H_
u
double u[max]
Definition: dotproduct-main.cpp:5

tesseract::DiscardUnusedModels
void DiscardUnusedModels(const GenericVector< RowScratchRegisters > &rows, ParagraphTheory *theory)
Definition: paragraphs.cpp:1456

GenericVectorEqEq
Definition: resultiterator.h:29

tesseract::AsciiLikelyListItem
bool AsciiLikelyListItem(const STRING &word)
Definition: paragraphs.cpp:268

UNICHARSET
Definition: unicharset.h:139

tesseract::RowScratchRegisters::ri_
const RowInfo * ri_
Definition: paragraphs_internal.h:170

tesseract::StrongModel
bool StrongModel(const ParagraphModel *model)
Definition: paragraphs_internal.h:75

tesseract::kCrownRight
const ParagraphModel * kCrownRight
Definition: paragraphs.cpp:48

tesseract::ValidFirstLine
bool ValidFirstLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
Definition: paragraphs.cpp:1267

tesseract::JUSTIFICATION_LEFT
Definition: publictypes.h:241

tesseract::LikelyParagraphStart
bool LikelyParagraphStart(const RowScratchRegisters &before, const RowScratchRegisters &after)
Definition: paragraphs.cpp:1673

tesseract::RowScratchRegisters::SetUnknown
void SetUnknown()
Definition: paragraphs_internal.h:123

tesseract::RowsFitModel
bool RowsFitModel(const GenericVector< RowScratchRegisters > *rows, int start, int end, const ParagraphModel *model)
Definition: paragraphs.cpp:1809

tesseract::ParagraphTheory::models
GenericVector< ParagraphModel * > & models()
Definition: paragraphs_internal.h:195

tesseract::UnicodeFor
int UnicodeFor(const UNICHARSET *u, const WERD_CHOICE *werd, int pos)
Definition: paragraphs.cpp:275

tesseract::RightWordAttributes
void RightWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:442

tesseract::RecomputeMarginsAndClearHypotheses
void RecomputeMarginsAndClearHypotheses(GenericVector< RowScratchRegisters > *rows, int start, int end, int percentile)
Definition: paragraphs.cpp:1559

tesseract::LineHypothesis::ty
LineType ty
Definition: paragraphs_internal.h:90

tesseract::LineHypothesis::LineHypothesis
LineHypothesis(const LineHypothesis &other)
Definition: paragraphs_internal.h:83

tesseract::JUSTIFICATION_RIGHT
Definition: publictypes.h:243

tesseract
Definition: baseapi.cpp:82

tesseract::SetOfModels
GenericVectorEqEq< const ParagraphModel * > SetOfModels
Definition: paragraphs_internal.h:94

GenericVector< STRING >

tesseract::LineHypothesis::model
const ParagraphModel * model
Definition: paragraphs_internal.h:91

tesseract::ParagraphTheory::models
const GenericVector< ParagraphModel * > & models() const
Definition: paragraphs_internal.h:196

tesseract::ParagraphTheory
Definition: paragraphs_internal.h:189

ParagraphModel
Definition: ocrpara.h:114

tesseract::RowScratchRegisters::lindent_
int lindent_
Definition: paragraphs_internal.h:178

STRING
Definition: strngs.h:45

tesseract::ParagraphTheory::ParagraphTheory
ParagraphTheory(GenericVector< ParagraphModel *> *models)
Definition: paragraphs_internal.h:193

tesseract::LT_UNKNOWN
Definition: paragraphs_internal.h:57

tesseract::FirstWordWouldHaveFit
bool FirstWordWouldHaveFit(const RowScratchRegisters &before, const RowScratchRegisters &after, tesseract::ParagraphJustification justification)
Definition: paragraphs.cpp:1622

tesseract::LineType
LineType
Definition: paragraphs_internal.h:54

tesseract::LineHypothesis::operator==
bool operator==(const LineHypothesis &other) const
Definition: paragraphs_internal.h:86

tesseract::LT_BODY
Definition: paragraphs_internal.h:56

tesseract::InterwordSpace
int InterwordSpace(const GenericVector< RowScratchRegisters > &rows, int row_start, int row_end)
Definition: paragraphs.cpp:1599

tesseract::RowScratchRegisters::OffsideIndent
int OffsideIndent(tesseract::ParagraphJustification just) const
Definition: paragraphs_internal.h:146

paragraphs.h

tesseract::LT_MULTIPLE
Definition: paragraphs_internal.h:58

tesseract::RowInfo
Definition: paragraphs.h:40

tesseract::RowScratchRegisters::AlignsideIndent
int AlignsideIndent(tesseract::ParagraphJustification just) const
Definition: paragraphs_internal.h:155

tesseract::LT_START
Definition: paragraphs_internal.h:55

tesseract::RowScratchRegisters
Definition: paragraphs_internal.h:100

WERD_CHOICE
Definition: ratngs.h:271

tesseract::ValidBodyLine
bool ValidBodyLine(const GenericVector< RowScratchRegisters > *rows, int row, const ParagraphModel *model)
Definition: paragraphs.cpp:1278

tesseract::RowScratchRegisters::rmargin_
int rmargin_
Definition: paragraphs_internal.h:180

tesseract::ParagraphJustification
ParagraphJustification
Definition: publictypes.h:239

tesseract::LeftWordAttributes
void LeftWordAttributes(const UNICHARSET *unicharset, const WERD_CHOICE *werd, const STRING &utf8, bool *is_list, bool *starts_idea, bool *ends_idea)
Definition: paragraphs.cpp:395

tesseract::RowScratchRegisters::lmargin_
int lmargin_
Definition: paragraphs_internal.h:177

tesseract::ParagraphModelSmearer
Definition: paragraphs_internal.h:234

tesseract::LineHypothesis::LineHypothesis
LineHypothesis(LineType line_type, const ParagraphModel *m)
Definition: paragraphs_internal.h:81

tesseract::LineHypothesis::LineHypothesis
LineHypothesis()
Definition: paragraphs_internal.h:80

tesseract::LineHypothesis
Definition: paragraphs_internal.h:79

tesseract::RowScratchRegisters::rindent_
int rindent_
Definition: paragraphs_internal.h:179

tesseract::kCrownLeft
const ParagraphModel * kCrownLeft
Definition: paragraphs.cpp:46

tesseract::CrownCompatible
bool CrownCompatible(const GenericVector< RowScratchRegisters > *rows, int a, int b, const ParagraphModel *model)
Definition: paragraphs.cpp:1289

tesseract::CanonicalizeDetectionResults
void CanonicalizeDetectionResults(GenericVector< PARA *> *row_owners, PARA_LIST *paragraphs)
Definition: paragraphs.cpp:2234