#include <resultiterator.h>

Inheritance diagram for tesseract::ResultIterator:

Public Member Functions
virtual	~ResultIterator ()

virtual void	Begin ()

virtual bool	Next (PageIteratorLevel level)

virtual bool	IsAtBeginningOf (PageIteratorLevel level) const

virtual bool	IsAtFinalElement (PageIteratorLevel level, PageIteratorLevel element) const

virtual char *	GetUTF8Text (PageIteratorLevel level) const

bool	ParagraphIsLtr () const

Public Member Functions inherited from tesseract::LTRResultIterator
	LTRResultIterator (PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)

virtual	~LTRResultIterator ()

char *	GetUTF8Text (PageIteratorLevel level) const

void	SetLineSeparator (const char *new_line)

void	SetParagraphSeparator (const char *new_para)

float	Confidence (PageIteratorLevel level) const

void	RowAttributes (float *row_height, float *descenders, float *ascenders) const

const char *	WordFontAttributes (bool *is_bold, bool *is_italic, bool *is_underlined, bool *is_monospace, bool *is_serif, bool *is_smallcaps, int *pointsize, int *font_id) const

const char *	WordRecognitionLanguage () const

StrongScriptDirection	WordDirection () const

bool	WordIsFromDictionary () const

int	BlanksBeforeWord () const

bool	WordIsNumeric () const

bool	HasBlamerInfo () const

const void *	GetParamsTrainingBundle () const

const char *	GetBlamerDebug () const

const char *	GetBlamerMisadaptionDebug () const

bool	HasTruthString () const

bool	EquivalentToTruth (const char *str) const

char *	WordTruthUTF8Text () const

char *	WordNormedUTF8Text () const

const char *	WordLattice (int *lattice_size) const

bool	SymbolIsSuperscript () const

bool	SymbolIsSubscript () const

bool	SymbolIsDropcap () const

Public Member Functions inherited from tesseract::PageIterator
	PageIterator (PAGE_RES *page_res, Tesseract *tesseract, int scale, int scaled_yres, int rect_left, int rect_top, int rect_width, int rect_height)

virtual	~PageIterator ()

	PageIterator (const PageIterator &src)

const PageIterator &	operator= (const PageIterator &src)

bool	PositionedAtSameWord (const PAGE_RES_IT *other) const

virtual void	RestartParagraph ()

bool	IsWithinFirstTextlineOfParagraph () const

virtual void	RestartRow ()

int	Cmp (const PageIterator &other) const

void	SetBoundingBoxComponents (bool include_upper_dots, bool include_lower_dots)

bool	BoundingBox (PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const

bool	BoundingBox (PageIteratorLevel level, const int padding, int *left, int *top, int *right, int *bottom) const

bool	BoundingBoxInternal (PageIteratorLevel level, int *left, int *top, int *right, int *bottom) const

bool	Empty (PageIteratorLevel level) const

PolyBlockType	BlockType () const

Pta *	BlockPolygon () const

Pix *	GetBinaryImage (PageIteratorLevel level) const

Pix *	GetImage (PageIteratorLevel level, int padding, Pix *original_img, int *left, int *top) const

bool	Baseline (PageIteratorLevel level, int *x1, int *y1, int *x2, int *y2) const

void	Orientation (tesseract::Orientation *orientation, tesseract::WritingDirection *writing_direction, tesseract::TextlineOrder *textline_order, float *deskew_angle) const

void	ParagraphInfo (tesseract::ParagraphJustification *justification, bool *is_list_item, bool *is_crown, int *first_line_indent) const

bool	SetWordBlamerBundle (BlamerBundle *blamer_bundle)

Static Public Member Functions
static ResultIterator *	StartOfParagraph (const LTRResultIterator &resit)

static void	CalculateTextlineOrder (bool paragraph_is_ltr, const GenericVector< StrongScriptDirection > &word_dirs, GenericVectorEqEq< int > *reading_order)

Static Public Attributes
static const int	kMinorRunStart = -1

static const int	kMinorRunEnd = -2

static const int	kComplexWord = -3

Protected Member Functions
TESS_LOCAL	ResultIterator (const LTRResultIterator &resit)

Protected Member Functions inherited from tesseract::PageIterator
TESS_LOCAL void	BeginWord (int offset)

Additional Inherited Members
Protected Attributes inherited from tesseract::LTRResultIterator
const char *	line_separator_

const char *	paragraph_separator_

Protected Attributes inherited from tesseract::PageIterator
PAGE_RES *	page_res_

Tesseract *	tesseract_

PAGE_RES_IT *	it_

WERD *	word_

int	word_length_

int	blob_index_

C_BLOB_IT *	cblob_it_

bool	include_upper_dots_

bool	include_lower_dots_

int	scale_

int	scaled_yres_

int	rect_left_

int	rect_top_

int	rect_width_

int	rect_height_

Detailed Description

Definition at line 38 of file resultiterator.h.

Constructor & Destructor Documentation

◆ ~ResultIterator()

virtual tesseract::ResultIterator::~ResultIterator ( )

inline virtual

ResultIterator is copy constructible! The default copy constructor works just fine for us.

Definition at line 46 of file resultiterator.h.

46 {}

◆ ResultIterator()

tesseract::ResultIterator::ResultIterator ( const LTRResultIterator & resit )

explicit protected

We presume the data associated with the given iterator will outlive us. NB: This constructor is protected rather than public because it does something that is non-obvious: it resets to the beginning of the paragraph instead of staying wherever resit might have pointed.

Definition at line 33 of file resultiterator.cpp.

     : LTRResultIterator(resit) {
   // Reset the bidi state flags before the iterator is repositioned.
   in_minor_direction_ = false;
   at_beginning_of_minor_run_ = false;
   // Default value; replaced below if the parameter lookup succeeds.
   preserve_interword_spaces_ = false;
 
   // Look up the "preserve_interword_spaces" bool parameter, searching the
   // global bool params and this Tesseract instance's bool params.
   BoolParam *p = ParamUtils::FindParam<BoolParam>(
       "preserve_interword_spaces", GlobalParams()->bool_params,
       tesseract_->params()->bool_params);
   if (p != NULL) preserve_interword_spaces_ = (bool)(*p);
 
   // Cache the direction of the current paragraph, then reposition at the
   // logical start of the text line.
   current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
   MoveToLogicalStartOfTextline();
 }

Member Function Documentation

◆ Begin()

void tesseract::ResultIterator::Begin ( )

virtual

Moves the iterator to point to the start of the page to begin an iteration.

Reimplemented from tesseract::PageIterator.

Definition at line 413 of file resultiterator.cpp.

                            {
   // Let the base class rewind the iterator first.
   LTRResultIterator::Begin();
   // Refresh the cached paragraph direction and clear the minor-run flags
   // so that they describe the new position.
   current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
   in_minor_direction_ = false;
   at_beginning_of_minor_run_ = false;
   // Reposition at the logical start of the text line.
   MoveToLogicalStartOfTextline();
 }

◆ CalculateTextlineOrder()

void tesseract::ResultIterator::CalculateTextlineOrder	(	bool	paragraph_is_ltr,
		const GenericVector< StrongScriptDirection > &	word_dirs,
		GenericVectorEqEq< int > *	reading_order
	)

static

Yields the reading order as a sequence of indices and (optional) meta-marks for a set of words (given left-to-right). The meta-marks are passed as negative values:

- kMinorRunStart: start of minor-direction text.
- kMinorRunEnd: end of minor-direction text.
- kComplexWord: the next indexed word contains both left-to-right and right-to-left characters and was treated as neutral.

For example, suppose we have five words in a text line, indexed [0,1,2,3,4] from the leftmost side of the text line. The following are all believable reading_orders:

- Left-to-Right (in ltr paragraph): { 0, 1, 2, 3, 4 }
- Left-to-Right (in rtl paragraph): { kMinorRunStart, 0, 1, 2, 3, 4, kMinorRunEnd }
- Right-to-Left (in rtl paragraph): { 4, 3, 2, 1, 0 }
- Left-to-Right except for an RTL phrase in words 2, 3 in an ltr paragraph: { 0, 1, kMinorRunStart, 3, 2, kMinorRunEnd, 4 }

Definition at line 255 of file resultiterator.cpp.

                                            {
   // Always start from an empty result; an empty line yields an empty order.
   reading_order->truncate(0);
   if (word_dirs.size() == 0) return;
 
   // Take all of the runs of minor direction words and insert them
   // in reverse order.
   // start/end/major_step describe how the main loop walks word_dirs:
   // from index `start`, stepping by `major_step`, stopping when it
   // reaches `end` (one step past the last word in the major direction).
   int minor_direction, major_direction, major_step, start, end;
   if (paragraph_is_ltr) {
     start = 0;
     end = word_dirs.size();
     major_step = 1;
     major_direction = DIR_LEFT_TO_RIGHT;
     minor_direction = DIR_RIGHT_TO_LEFT;
   } else {
     start = word_dirs.size() - 1;
     end = -1;
     major_step = -1;
     major_direction = DIR_RIGHT_TO_LEFT;
     minor_direction = DIR_LEFT_TO_RIGHT;
     // Special rule: if there are neutral words at the right most side
     //   of a line adjacent to a left-to-right word in the middle of the
     //   line, we interpret the end of the line as a single LTR sequence.
     if (word_dirs[start] == DIR_NEUTRAL) {
       // Walk left over the trailing neutral words.
       int neutral_end = start;
       while (neutral_end > 0 && word_dirs[neutral_end] == DIR_NEUTRAL) {
         neutral_end--;
       }
       // NOTE(review): the loop above never takes neutral_end below 0, so
       // the `neutral_end >= 0` test is always true here.
       if (neutral_end >= 0 && word_dirs[neutral_end] == DIR_LEFT_TO_RIGHT) {
         // LTR followed by neutrals.
         // Scan for the beginning of the minor left-to-right run.
         int left = neutral_end;
         for (int i = left; i >= 0 && word_dirs[i] != DIR_RIGHT_TO_LEFT; i--) {
           if (word_dirs[i] == DIR_LEFT_TO_RIGHT) left = i;
         }
         // Emit [left .. last word] in ascending index order as one minor run.
         reading_order->push_back(kMinorRunStart);
         for (int i = left; i < word_dirs.size(); i++) {
           reading_order->push_back(i);
           // NOTE(review): kComplexWord is pushed after the index of the
           // mixed-direction word; confirm consumers expect it there.
           if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
         }
         reading_order->push_back(kMinorRunEnd);
         // The main loop below resumes just left of the run already emitted.
         start = left - 1;
       }
     }
   }
   for (int i = start; i != end;) {
     if (word_dirs[i] == minor_direction) {
       // Find the far end of this minor run: advance j until a major
       // direction word or the end of the line is reached...
       int j = i;
       while (j != end && word_dirs[j] != major_direction)
         j += major_step;
       if (j == end) j -= major_step;
       // ...then back up so that j rests on a minor direction word.
       while (j != i && word_dirs[j] != minor_direction)
         j -= major_step;
       //  [j..i] is a minor direction run.
       // Emit it from j back to i, i.e. opposite to the major step.
       reading_order->push_back(kMinorRunStart);
       for (int k = j; k != i; k -= major_step) {
         reading_order->push_back(k);
       }
       reading_order->push_back(i);
       reading_order->push_back(kMinorRunEnd);
       // Continue with the first word past the run.
       i = j + major_step;
     } else {
       // Major direction, neutral or mixed word: emit it in place.
       reading_order->push_back(i);
       if (word_dirs[i] == DIR_MIX) reading_order->push_back(kComplexWord);
       i += major_step;
     }
   }
 }

◆ GetUTF8Text()

char * tesseract::ResultIterator::GetUTF8Text ( PageIteratorLevel level ) const

virtual

Returns the null terminated UTF-8 encoded text string for the current object at the given level. Use delete [] to free after use.

Definition at line 556 of file resultiterator.cpp.

                                                                {
   if (it_->word() == NULL) return NULL;  // Already at the end!
   // Accumulates the text for the requested level; copied out at the end.
   STRING text;
   switch (level) {
     case RIL_BLOCK:
       {
         // Work on a copy so this (const) iterator is not moved. Append
         // paragraph after paragraph while the copy stays in the same block.
         ResultIterator pp(*this);
         do {
           pp.AppendUTF8ParagraphText(&text);
         } while (pp.Next(RIL_PARA) && pp.it_->block() == it_->block());
       }
       break;
     case RIL_PARA:
       AppendUTF8ParagraphText(&text);
       break;
     case RIL_TEXTLINE:
       {
         // Use a copy moved to the logical start of the line, then collect
         // the line's text from there.
         ResultIterator it(*this);
         it.MoveToLogicalStartOfTextline();
         it.IterateAndAppendUTF8TextlineText(&text);
       }
       break;
     case RIL_WORD:
       AppendUTF8WordText(&text);
       break;
     case RIL_SYMBOL:
       {
         // Effective direction: the paragraph direction, flipped while
         // inside a minor-direction run.
         bool reading_direction_is_ltr =
           current_paragraph_is_ltr_ ^ in_minor_direction_;
         if (at_beginning_of_minor_run_) {
           text += reading_direction_is_ltr ? kLRM : kRLM;
         }
         // NOTE(review): this assignment replaces whatever was appended to
         // text just above, so the directional mark does not reach the
         // result. Confirm whether `+=` was intended.
         text = it_->word()->BestUTF8(blob_index_, !reading_direction_is_ltr);
         if (IsAtFinalSymbolOfWord()) AppendSuffixMarks(&text);
       }
       break;
   }
   // Copy into a new[]-allocated buffer. length includes the terminating
   // NUL, so strncpy copies the terminator as well.
   int length = text.length() + 1;
   char* result = new char[length];
   strncpy(result, text.string(), length);
   return result;
 }

◆ IsAtBeginningOf()

bool tesseract::ResultIterator::IsAtBeginningOf ( PageIteratorLevel level ) const

virtual

IsAtBeginningOf() returns whether we're at the logical beginning of the given level (as opposed to PageIterator's left-to-right, top-to-bottom order). Otherwise, this acts the same as PageIterator::IsAtBeginningOf(). For a full description, see pageiterator.h.

Reimplemented from tesseract::PageIterator.

Definition at line 496 of file resultiterator.cpp.

                                                                   {
   if (it_->block() == NULL) return false;  // Already at the end!
   if (it_->word() == NULL) return true;  // In an image block.
   if (level == RIL_SYMBOL) return true;  // Always at beginning of a symbol.
 
   // Each coarser level below builds on the result for the finer one.
   bool at_word_start = IsAtFirstSymbolOfWord();
   if (level == RIL_WORD) return at_word_start;
 
   // Use a copy so this (const) iterator is not moved.
   ResultIterator line_start(*this);
   // move to the first word in the line...
   line_start.MoveToLogicalStartOfTextline();
 
   // At the line start if we are at a word start and the copy, moved to the
   // logical start of the line, compares equal to our own position.
   bool at_textline_start = at_word_start && *line_start.it_ == *it_;
   if (level == RIL_TEXTLINE) return at_textline_start;
 
   // now we move to the left-most word...
   line_start.RestartRow();
   // A block starts here if the previous block differs from this one.
   bool at_block_start = at_textline_start &&
       line_start.it_->block() != line_start.it_->prev_block();
   if (level == RIL_BLOCK) return at_block_start;
 
   // A paragraph starts at a block start, or at a line start whose row
   // belongs to a different paragraph than the previous row.
   // NOTE(review): prev_row() is dereferenced without a null check when
   // at_block_start is false and at_textline_start is true; presumably a
   // previous row always exists in that case -- confirm.
   bool at_para_start = at_block_start ||
       (at_textline_start &&
        line_start.it_->row()->row->para() !=
            line_start.it_->prev_row()->row->para());
   if (level == RIL_PARA) return at_para_start;
 
   ASSERT_HOST(false);  // shouldn't happen.
   return false;
 }

◆ IsAtFinalElement()

bool tesseract::ResultIterator::IsAtFinalElement	(	PageIteratorLevel	level,
		PageIteratorLevel	element
	)		const

virtual

Implement PageIterator's IsAtFinalElement correctly in a BiDi context. For instance, IsAtFinalElement(RIL_PARA, RIL_WORD) returns whether we point at the last word in a paragraph. See PageIterator for full comment.

NOTE! This is an exact copy of PageIterator::IsAtFinalElement with the change that the variable next is now a ResultIterator instead of a PageIterator.

Reimplemented from tesseract::PageIterator.

Definition at line 532 of file resultiterator.cpp.

                                                                        {
   if (Empty(element)) return true;  // Already at the end!
   // The result is true if we step forward by element and find we are
   // at the end of the page or at beginning of *all* levels in:
   // [level, element).
   // When there is more than one level difference between element and level,
   // we could for instance move forward one symbol and still be at the first
   // word on a line, so we also have to be at the first symbol in a word.
   // Step a copy forward so this (const) iterator is not moved.
   ResultIterator next(*this);
   next.Next(element);
   if (next.Empty(element)) return true;  // Reached the end of the page.
   // Decrement element one level at a time until it equals level, requiring
   // the advanced copy to be at the beginning of each level visited.
   while (element > level) {
     element = static_cast<PageIteratorLevel>(element - 1);
     if (!next.IsAtBeginningOf(element))
       return false;
   }
   return true;
 }

◆ Next()

bool tesseract::ResultIterator::Next ( PageIteratorLevel level )

virtual

Moves to the start of the next object at the given level in the page hierarchy in the appropriate reading order and returns false if the end of the page was reached. NOTE that RIL_SYMBOL will skip non-text blocks, but all other PageIteratorLevel level values will visit each non-text block once. Think of non-text blocks as containing a single para, with a single line, with a single imaginary word. Calls to Next with different levels may be freely intermixed. This function iterates words in right-to-left scripts correctly, if the appropriate language has been loaded into Tesseract.

Reimplemented from tesseract::PageIterator.

Definition at line 421 of file resultiterator.cpp.

                                                  {
   if (it_->block() == NULL) return false; // already at end!
   switch (level) {
     case RIL_BLOCK:  // explicit fall-through
     case RIL_PARA:   // explicit fall-through
     case RIL_TEXTLINE:
       // Line-level and coarser moves are delegated to PageIterator; the
       // bidi state is then refreshed for the new position.
       if (!PageIterator::Next(level)) return false;
       if (IsWithinFirstTextlineOfParagraph()) {
         // if we've advanced to a new paragraph,
         // recalculate current_paragraph_is_ltr_
         current_paragraph_is_ltr_ = CurrentParagraphIsLtr();
       }
       in_minor_direction_ = false;
       MoveToLogicalStartOfTextline();
       return it_->block() != NULL;
     case RIL_SYMBOL:
     {
       // Find the current blob's position in the computed blob order, then
       // step to the entry after it.
       GenericVector<int> blob_order;
       CalculateBlobOrder(&blob_order);
       int next_blob = 0;
       while (next_blob < blob_order.size() &&
              blob_index_ != blob_order[next_blob])
         next_blob++;
       next_blob++;
       if (next_blob < blob_order.size()) {
         // we're in the same word; simply advance one blob.
         BeginWord(blob_order[next_blob]);
         at_beginning_of_minor_run_ = false;
         return true;
       }
       level = RIL_WORD;  // we've fallen through to the next word.
     }
     case RIL_WORD:  // explicit fall-through.
     {
       // With no current word, advance by block instead.
       if (it_->word() == NULL) return Next(RIL_BLOCK);
       GenericVectorEqEq<int> word_indices;
       int this_word_index = LTRWordIndex();
       // NOTE(review): this calls an overload that takes the iterator itself
       // rather than a vector of word directions; it is not shown here.
       CalculateTextlineOrder(current_paragraph_is_ltr_,
                              *this,
                              &word_indices);
       // Skip trailing negative entries (meta-marks) to find the position of
       // the last real word index.
       int final_real_index = word_indices.size() - 1;
       while (final_real_index > 0 && word_indices[final_real_index] < 0)
         final_real_index--;
       // Only positions before final_real_index are searched, so the last
       // real word falls out of the loop and is handled as end of line.
       for (int i = 0; i < final_real_index; i++) {
         if (word_indices[i] == this_word_index) {
           // Step over any meta-marks after the current word, tracking
           // whether they enter or leave a minor-direction run.
           int j = i + 1;
           for (; j < final_real_index && word_indices[j] < 0; j++) {
             if (word_indices[j] == kMinorRunStart) in_minor_direction_ = true;
             if (word_indices[j] == kMinorRunEnd) in_minor_direction_ = false;
           }
           at_beginning_of_minor_run_ = (word_indices[j - 1] == kMinorRunStart);
           // awesome, we move to word_indices[j]
           if (BidiDebug(3)) {
             tprintf("Next(RIL_WORD): %d -> %d\n",
                     this_word_index, word_indices[j]);
           }
           // Reach the target by restarting the row and stepping forward
           // word_indices[j] words with the base-class iterator.
           PageIterator::RestartRow();
           for (int k = 0; k < word_indices[j]; k++) {
             PageIterator::Next(RIL_WORD);
           }
           MoveToLogicalStartOfWord();
           return true;
         }
       }
       if (BidiDebug(3)) {
         tprintf("Next(RIL_WORD): %d -> EOL\n", this_word_index);
       }
       // we're going off the end of the text line.
       return Next(RIL_TEXTLINE);
     }
   }
   ASSERT_HOST(false);  // shouldn't happen.
   return false;
 }

◆ ParagraphIsLtr()

bool tesseract::ResultIterator::ParagraphIsLtr ( ) const

Return whether the current paragraph's dominant reading direction is left-to-right (as opposed to right-to-left).

Definition at line 53 of file resultiterator.cpp.

                                           {
   // Returns the cached value; it is refreshed in the constructor, in
   // Begin(), and in Next() when a new paragraph is entered.
   return current_paragraph_is_ltr_;
 }

◆ StartOfParagraph()

ResultIterator * tesseract::ResultIterator::StartOfParagraph ( const LTRResultIterator & resit )

static

Definition at line 48 of file resultiterator.cpp.

                                     {
   // Builds a new heap-allocated ResultIterator from resit.
   // NOTE(review): presumably the caller takes ownership and must delete
   // the returned pointer -- confirm.
   return new ResultIterator(resit);
 }

Member Data Documentation

◆ kComplexWord

const int tesseract::ResultIterator::kComplexWord = -3

static

Definition at line 130 of file resultiterator.h.

◆ kMinorRunEnd

const int tesseract::ResultIterator::kMinorRunEnd = -2

static

Definition at line 129 of file resultiterator.h.

◆ kMinorRunStart

const int tesseract::ResultIterator::kMinorRunStart = -1

static

Definition at line 128 of file resultiterator.h.

The documentation for this class was generated from the following files:

/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/resultiterator.h
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/resultiterator.cpp

Public Member Functions

Static Public Member Functions

Static Public Attributes

Protected Member Functions

Additional Inherited Members

Detailed Description

Constructor & Destructor Documentation

◆ ~ResultIterator()

◆ ResultIterator()

Member Function Documentation

◆ Begin()

◆ CalculateTextlineOrder()

◆ GetUTF8Text()

◆ IsAtBeginningOf()

◆ IsAtFinalElement()

◆ Next()

◆ ParagraphIsLtr()

◆ StartOfParagraph()

Member Data Documentation

◆ kComplexWord

◆ kMinorRunEnd

◆ kMinorRunStart