#include <tesseractclass.h>

Inheritance diagram for tesseract::Tesseract:

Public Member Functions

Tesseract ()

~Tesseract ()

void Clear ()

void ResetAdaptiveClassifier ()

void ResetDocumentDictionary ()

void SetEquationDetect (EquationDetect *detector)

const FCOORD & reskew () const

Pix ** mutable_pix_binary ()

Pix * pix_binary () const

Pix * pix_grey () const

void set_pix_grey (Pix *grey_pix)

Pix * pix_original () const

void set_pix_original (Pix *original_pix)

Pix * BestPix () const

void set_pix_thresholds (Pix *thresholds)

int source_resolution () const

void set_source_resolution (int ppi)

int ImageWidth () const

int ImageHeight () const

Pix * scaled_color () const

int scaled_factor () const

void SetScaledColor (int factor, Pix *color)

const Textord & textord () const

Textord * mutable_textord ()

bool right_to_left () const

int num_sub_langs () const

Tesseract * get_sub_lang (int index) const

bool AnyTessLang () const

bool AnyLSTMLang () const

void SetBlackAndWhitelist ()

void PrepareForPageseg ()

void PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)

int SegmentPage (const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)

void SetupWordScripts (BLOCK_LIST *blocks)

int AutoPageSeg (PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)

ColumnFinder * SetupPageSegAndDetectOrientation (PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)

void PrerecAllWordsPar (const GenericVector< WordData > &words)

void TrainLineRecognizer (const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list)

void TrainFromBoxes (const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)

ImageData * GetLineData (const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)

ImageData * GetRectImage (const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const

void LSTMRecognizeWord (const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)

void SearchWords (PointerVector< WERD_RES > *words)

bool ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)

void SetupAllWordsPassN (int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)

void SetupWordPassN (int pass_n, WordData *word)

bool RecogAllWordsPassN (int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)

bool recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)

void rejection_passes (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)

void bigram_correction_pass (PAGE_RES *page_res)

void blamer_pass (PAGE_RES *page_res)

void script_pos_pass (PAGE_RES *page_res)

int RetryWithLanguage (const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)

bool ReassignDiacritics (int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)

void AssignDiacriticsToOverlappingBlobs (const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB *> *target_blobs)

void AssignDiacriticsToNewBlobs (const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB *> *target_blobs)

bool SelectGoodDiacriticOutlines (int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE *> &outlines, int num_outlines, GenericVector< bool > *ok_outlines)

float ClassifyBlobPlusOutlines (const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE *> &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)

float ClassifyBlobAsWord (int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)

void classify_word_and_language (int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)

void classify_word_pass1 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)

void recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box)

void fix_rep_char (PAGE_RES_IT *page_res_it)

ACCEPTABLE_WERD_TYPE acceptable_word_string (const UNICHARSET &char_set, const char *s, const char *lengths)

void match_word_pass_n (int pass_n, WERD_RES *word, ROW *row, BLOCK *block)

void classify_word_pass2 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)

void ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)

bool RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row)

bool TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row)

bool TestNewNormalization (int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)

BOOL8 recog_interactive (PAGE_RES_IT *pr_it)

void set_word_fonts (WERD_RES *word)

void font_recognition_pass (PAGE_RES *page_res)

void dictionary_correction_pass (PAGE_RES *page_res)

BOOL8 check_debug_pt (WERD_RES *word, int location)

bool SubAndSuperscriptFix (WERD_RES *word_res)

void GetSubAndSuperscriptCandidates (const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)

WERD_RES * TrySuperscriptSplits (int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)

bool BelievableSuperscript (bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const

void output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box)

void write_results (PAGE_RES_IT &page_res_it, char newline_type, BOOL8 force_eol)

void set_unlv_suspects (WERD_RES *word)

UNICHAR_ID get_rep_char (WERD_RES *word)

BOOL8 acceptable_number_string (const char *s, const char *lengths)

inT16 count_alphanums (const WERD_CHOICE &word)

inT16 count_alphas (const WERD_CHOICE &word)

void read_config_file (const char *filename, SetParamConstraint constraint)

int init_tesseract (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)

int init_tesseract (const char *datapath, const char *language, OcrEngineMode oem)

int init_tesseract_internal (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)

void SetupUniversalFontIds ()

int init_tesseract_lm (const char *arg0, const char *textbase, const char *language, TessdataManager *mgr)

void recognize_page (STRING &image_name)

void end_tesseract ()

bool init_tesseract_lang_data (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)

void ParseLanguageString (const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)

SVMenuNode * build_menu_new ()

void pgeditor_main (int width, int height, PAGE_RES *page_res)

void process_image_event (const SVEvent &event)

BOOL8 process_cmd_win_event (inT32 cmd_event, char *new_value)

void debug_word (PAGE_RES *page_res, const TBOX &selection_box)

void do_re_display (BOOL8(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))

BOOL8 word_display (PAGE_RES_IT *pr_it)

BOOL8 word_bln_display (PAGE_RES_IT *pr_it)

BOOL8 word_blank_and_set_display (PAGE_RES_IT *pr_its)

BOOL8 word_set_display (PAGE_RES_IT *pr_it)

BOOL8 word_dumper (PAGE_RES_IT *pr_it)

void blob_feature_display (PAGE_RES *page_res, const TBOX &selection_box)

void make_reject_map (WERD_RES *word, ROW *row, inT16 pass)

BOOL8 one_ell_conflict (WERD_RES *word_res, BOOL8 update_map)

inT16 first_alphanum_index (const char *word, const char *word_lengths)

inT16 first_alphanum_offset (const char *word, const char *word_lengths)

inT16 alpha_count (const char *word, const char *word_lengths)

BOOL8 word_contains_non_1_digit (const char *word, const char *word_lengths)

void dont_allow_1Il (WERD_RES *word)

inT16 count_alphanums (WERD_RES *word)

void flip_0O (WERD_RES *word)

BOOL8 non_0_digit (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)

BOOL8 non_O_upper (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)

BOOL8 repeated_nonalphanum_wd (WERD_RES *word, ROW *row)

void nn_match_word (WERD_RES *word, ROW *row)

void nn_recover_rejects (WERD_RES *word, ROW *row)

void set_done (WERD_RES *word, inT16 pass)

inT16 safe_dict_word (const WERD_RES *werd_res)

void flip_hyphens (WERD_RES *word)

void reject_I_1_L (WERD_RES *word)

void reject_edge_blobs (WERD_RES *word)

void reject_mostly_rejects (WERD_RES *word)

BOOL8 word_adaptable (WERD_RES *word, uinT16 mode)

void recog_word_recursive (WERD_RES *word)

void recog_word (WERD_RES *word)

void split_and_recog_word (WERD_RES *word)

void split_word (WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const

void join_words (WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const

void match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block)

inT16 fp_eval_word_spacing (WERD_RES_LIST &word_res_list)

void dump_words (WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)

BOOL8 fixspace_thinks_word_done (WERD_RES *word)

GARBAGE_LEVEL garbage_word (WERD_RES *word, BOOL8 ok_dict_word)

BOOL8 potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word)

void tilde_crunch (PAGE_RES_IT &page_res_it)

void unrej_good_quality_words (PAGE_RES_IT &page_res_it)

void doc_and_block_rejection (PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)

void quality_based_rejection (PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)

void convert_bad_unlv_chs (WERD_RES *word_res)

void tilde_delete (PAGE_RES_IT &page_res_it)

inT16 word_blob_quality (WERD_RES *word, ROW *row)

void word_char_quality (WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)

void unrej_good_chs (WERD_RES *word, ROW *row)

inT16 count_outline_errs (char c, inT16 outline_count)

inT16 word_outline_errs (WERD_RES *word)

BOOL8 terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level)

CRUNCH_MODE word_deletable (WERD_RES *word, inT16 &delete_mode)

inT16 failure_count (WERD_RES *word)

BOOL8 noise_outlines (TWERD *word)

void tess_segment_pass_n (int pass_n, WERD_RES *word)

PAGE_RES * ApplyBoxes (const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)

void PreenXHeights (BLOCK_LIST *block_list)

PAGE_RES * SetupApplyBoxes (const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)

void MaximallyChopWord (const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)

bool ResegmentCharBox (PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX &next_box, const char *correct_text)

bool ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text)

void ReSegmentByClassification (PAGE_RES *page_res)

bool ConvertStringToUnichars (const char *utf8, GenericVector< UNICHAR_ID > *class_ids)

bool FindSegmentation (const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)

void SearchForText (const GenericVector< BLOB_CHOICE_LIST *> *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)

void TidyUp (PAGE_RES *page_res)

void ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)

void CorrectClassifyWords (PAGE_RES *page_res)

void ApplyBoxTraining (const STRING &fontname, PAGE_RES *page_res)

int CountMisfitTops (WERD_RES *word_res)

float ComputeCompatibleXheight (WERD_RES *word_res, float *baseline_shift)

FILE * init_recog_training (const STRING &fname)

void recog_training_segmented (const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)

void ambigs_classify_and_output (const char *label, PAGE_RES_IT *pr_it, FILE *output_file)

eval_word_spacing()

The basic measure is the number of characters in contextually confirmed words (i.e. the word is done). If all words are contextually confirmed, the evaluation is deemed perfect.

Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred.

The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" character on the other side of the space.

Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 sides of a "1" joined.

The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.

BOOL8 digit_or_numeric_punct (WERD_RES *word, int char_position)

inT16 eval_word_spacing (WERD_RES_LIST &word_res_list)

fix_sp_fp_word()

Test the current word to see if it can be split by deleting noise blobs. If so, do the business. Return with the iterator pointing to the same place if the word is unchanged, or the last of the replacement words.

void fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)

void fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)

inT16 worst_noise_blob (WERD_RES *word_res, float *worst_noise_score)

float blob_noise_score (TBLOB *blob)

void break_noisiest_blob_word (WERD_RES_LIST &words)

fix_fuzzy_spaces()

Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.

Parameters

	monitor	progress monitor
	word_count	count of words in doc
[out]	page_res

void fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)

void fix_fuzzy_spaces (ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)

process_selected_words()

Walk the current block list applying the specified word processor function to each word that overlaps the selection_box.

void process_selected_words (PAGE_RES *page_res, TBOX &selection_box, BOOL8(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))

tess_add_doc_word

Add the given word to the document dictionary

void tess_add_doc_word (WERD_CHOICE *word_choice)

tess_acceptable_word

Returns: true if the word is regarded as "good enough".

Parameters

word_choice	after context
raw_choice	before context

bool tess_acceptable_word (WERD_RES *word)

Public Member Functions inherited from tesseract::Wordrec

Wordrec ()

virtual ~Wordrec ()

void SaveAltChoices (const LIST &best_choices, WERD_RES *word)

void FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)

void CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)

void SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)

void WordSearch (WERD_RES *word_res)

void InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)

void DoSegSearch (WERD_RES *word_res)

SEAM * attempt_blob_chop (TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)

SEAM * chop_numbered_blob (TWERD *word, inT32 blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)

SEAM * chop_overlapping_blob (const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)

void add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams)

void choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)

void combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)

SEAM * pick_good_seam (TBLOB *blob)

void try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)

void try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)

PRIORITY grade_split_length (register SPLIT *split)

PRIORITY grade_sharpness (register SPLIT *split)

bool near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)

virtual BLOB_CHOICE_LIST * classify_piece (const GenericVector< SEAM *> &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)

void merge_fragments (MATRIX *ratings, inT16 num_blobs)

void get_fragment_lists (inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)

void merge_and_put_fragment_lists (inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)

void fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)

void program_editup (const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)

void cc_recog (WERD_RES *word)

void program_editdown (inT32 elasped_time)

void set_pass1 ()

void set_pass2 ()

int end_recog ()

BLOB_CHOICE_LIST * call_matcher (TBLOB *blob)

int dict_word (const WERD_CHOICE &word)

BLOB_CHOICE_LIST * classify_blob (TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)

PRIORITY point_priority (EDGEPT *point)

void add_point_to_list (PointHeap *point_heap, EDGEPT *point)

bool is_inside_angle (EDGEPT *pt)

int angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)

EDGEPT * pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)

void prioritize_points (TESSLINE *outline, PointHeap *points)

void new_min_point (EDGEPT *local_min, PointHeap *points)

void new_max_point (EDGEPT *local_max, PointHeap *points)

void vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)

SEAM * improve_one_blob (const GenericVector< BLOB_CHOICE *> &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)

SEAM * chop_one_blob (const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE *> &blob_choices, WERD_RES *word_res, int *blob_number)

void chop_word_main (WERD_RES *word)

void improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)

int select_blob_to_split (const GenericVector< BLOB_CHOICE *> &blob_choices, float rating_ceiling, bool split_next_to_fragment)

int select_blob_to_split_from_fixpt (DANGERR *fixpt)

Public Member Functions inherited from tesseract::Classify

Classify ()

virtual ~Classify ()

Dict & getDict ()

const ShapeTable * shape_table () const

void SetStaticClassifier (ShapeClassifier *static_classifier)

void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)

bool LargeSpeckle (const TBLOB &blob)

ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)

int GetFontinfoId (ADAPT_CLASS Class, uinT8 ConfigId)

int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uinT8 *normalization_factors, const uinT16 *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)

void ReadNewCutoffs (TFile *fp, CLASS_CUTOFF_ARRAY Cutoffs)

void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)

void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)

ADAPT_TEMPLATES ReadAdaptedTemplates (TFile *File)

FLOAT32 ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, BOOL8 DebugMatch)

void FreeNormProtos ()

NORM_PROTOS * ReadNormProtos (TFile *fp)

void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)

INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)

void LearnWord (const char *fontname, WERD_RES *word)

void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)

void InitAdaptiveClassifier (TessdataManager *mgr)

void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)

void AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)

void MasterMatcher (INT_TEMPLATES templates, inT16 num_features, const INT_FEATURE_STRUCT *features, const uinT8 *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)

void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uinT8 *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)

double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uinT8 *cn_factors)

void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)

void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)

int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)

void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)

PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)

int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)

void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)

void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)

void RemoveExtraPuncs (ADAPT_RESULTS *Results)

void RemoveBadMatches (ADAPT_RESULTS *Results)

void SetAdaptiveThreshold (FLOAT32 Threshold)

void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)

STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const

int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const

int ShapeIDToClassID (int shape_id) const

UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)

int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)

int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)

UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)

void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)

void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold, ADAPT_TEMPLATES adaptive_templates)

void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)

bool AdaptableWord (WERD_RES *word)

void EndAdaptiveClassifier ()

void SettupPass1 ()

void SettupPass2 ()

void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)

void ClassifyAsNoise (ADAPT_RESULTS *Results)

void ResetAdaptiveClassifierInternal ()

void SwitchAdaptiveClassifier ()

void StartBackupAdaptiveClassifier ()

int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uinT8 *pruner_norm_array, uinT8 *char_norm_array)

void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uinT8 *char_norm_array, uinT8 *pruner_array)

bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)

void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)

bool AdaptiveClassifierIsFull () const

bool AdaptiveClassifierIsEmpty () const

bool LooksLikeGarbage (TBLOB *blob)

void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)

void ClearCharNormArray (uinT8 *char_norm_array)

void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uinT8 *char_norm_array)

void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)

INT_TEMPLATES ReadIntTemplates (TFile *fp)

void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)

CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)

void ShowMatchDisplay ()

UnicityTable< FontInfo > & get_fontinfo_table ()

const UnicityTable< FontInfo > & get_fontinfo_table () const

UnicityTable< FontSet > & get_fontset_table ()

void NormalizeOutlines (LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale)

FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)

FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)

FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)

FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)

void LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)

bool WriteTRFile (const STRING &filename)

Public Member Functions inherited from tesseract::CCStruct

CCStruct ()

~CCStruct ()

Public Member Functions inherited from tesseract::CUtil

CUtil ()

~CUtil ()

void read_variables (const char *filename, bool global_only)

Public Member Functions inherited from tesseract::CCUtil

CCUtil ()

virtual ~CCUtil ()

void main_setup (const char *argv0, const char *basename)

CCUtil::main_setup - set location of tessdata and name of image. More...

ParamsVectors * params ()

Public Attributes
bool	tessedit_resegment_from_boxes = false

bool	tessedit_resegment_from_line_boxes = false

bool	tessedit_train_from_boxes = false

bool	tessedit_make_boxes_from_boxes = false

bool	tessedit_train_line_recognizer = false

bool	tessedit_dump_pageseg_images = false

int	tessedit_pageseg_mode = PSM_SINGLE_BLOCK

int	tessedit_ocr_engine_mode = tesseract::OEM_DEFAULT

char *	tessedit_char_blacklist = ""

char *	tessedit_char_whitelist = ""

char *	tessedit_char_unblacklist = ""

bool	tessedit_ambigs_training = false

int	pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

int	ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

char *	tessedit_write_params_to_file = ""

bool	tessedit_adaption_debug = false

int	bidi_debug = 0

int	applybox_debug = 1

int	applybox_page = 0

char *	applybox_exposure_pattern = ".exp"

bool	applybox_learn_chars_and_char_frags_mode = false

bool	applybox_learn_ngrams_mode = false

bool	tessedit_display_outwords = false

bool	tessedit_dump_choices = false

bool	tessedit_timing_debug = false

bool	tessedit_fix_fuzzy_spaces = true

bool	tessedit_unrej_any_wd = false

bool	tessedit_fix_hyphens = true

bool	tessedit_redo_xheight = true

bool	tessedit_enable_doc_dict = true

bool	tessedit_debug_fonts = false

bool	tessedit_debug_block_rejection = false

bool	tessedit_enable_bigram_correction = true

bool	tessedit_enable_dict_correction = false

int	tessedit_bigram_debug = 0

bool	enable_noise_removal = true

int	debug_noise_removal = 0

double	noise_cert_basechar = -8.0

double	noise_cert_disjoint = -2.5

double	noise_cert_punc = -2.5

double	noise_cert_factor = 0.375

int	noise_maxperblob = 8

int	noise_maxperword = 16

int	debug_x_ht_level = 0

bool	debug_acceptable_wds = false

char *	chs_leading_punct = "('`\""

char *	chs_trailing_punct1 = ").,;:?!"

char *	chs_trailing_punct2 = ")'`\""

double	quality_rej_pc = 0.08

double	quality_blob_pc = 0.0

double	quality_outline_pc = 1.0

double	quality_char_pc = 0.95

int	quality_min_initial_alphas_reqd = 2

int	tessedit_tess_adaption_mode = 0x27

bool	tessedit_minimal_rej_pass1 = false

bool	tessedit_test_adaption = false

bool	tessedit_matcher_log = false

int	tessedit_test_adaption_mode = 3

bool	test_pt = false

double	test_pt_x = 99999.99

double	test_pt_y = 99999.99

int	multilang_debug_level = 0

int	paragraph_debug_level = 0

bool	paragraph_text_based = true

bool	lstm_use_matrix = 1

char *	outlines_odd = "%| "

char *	outlines_2 = "ij!?%\":;"

bool	docqual_excuse_outline_errs = false

bool	tessedit_good_quality_unrej = true

bool	tessedit_use_reject_spaces = true

double	tessedit_reject_doc_percent = 65.00

double	tessedit_reject_block_percent = 45.00

double	tessedit_reject_row_percent = 40.00

double	tessedit_whole_wd_rej_row_percent = 70.00

bool	tessedit_preserve_blk_rej_perfect_wds = true

bool	tessedit_preserve_row_rej_perfect_wds = true

bool	tessedit_dont_blkrej_good_wds = false

bool	tessedit_dont_rowrej_good_wds = false

int	tessedit_preserve_min_wd_len = 2

bool	tessedit_row_rej_good_docs = true

double	tessedit_good_doc_still_rowrej_wd = 1.1

bool	tessedit_reject_bad_qual_wds = true

bool	tessedit_debug_doc_rejection = false

bool	tessedit_debug_quality_metrics = false

bool	bland_unrej = false

double	quality_rowrej_pc = 1.1

bool	unlv_tilde_crunching = true

bool	hocr_font_info = false

bool	crunch_early_merge_tess_fails = true

bool	crunch_early_convert_bad_unlv_chs = false

double	crunch_terrible_rating = 80.0

bool	crunch_terrible_garbage = true

double	crunch_poor_garbage_cert = -9.0

double	crunch_poor_garbage_rate = 60

double	crunch_pot_poor_rate = 40

double	crunch_pot_poor_cert = -8.0

bool	crunch_pot_garbage = true

double	crunch_del_rating = 60

double	crunch_del_cert = -10.0

double	crunch_del_min_ht = 0.7

double	crunch_del_max_ht = 3.0

double	crunch_del_min_width = 3.0

double	crunch_del_high_word = 1.5

double	crunch_del_low_word = 0.5

double	crunch_small_outlines_size = 0.6

int	crunch_rating_max = 10

int	crunch_pot_indicators = 1

bool	crunch_leave_ok_strings = true

bool	crunch_accept_ok = true

bool	crunch_leave_accept_strings = false

bool	crunch_include_numerals = false

int	crunch_leave_lc_strings = 4

int	crunch_leave_uc_strings = 4

int	crunch_long_repetitions = 3

int	crunch_debug = 0

int	fixsp_non_noise_limit = 1

double	fixsp_small_outlines_size = 0.28

bool	tessedit_prefer_joined_punct = false

int	fixsp_done_mode = 1

int	debug_fix_space_level = 0

char *	numeric_punctuation = ".,"

int	x_ht_acceptance_tolerance = 8

int	x_ht_min_change = 8

int	superscript_debug = 0

double	superscript_worse_certainty = 2.0

double	superscript_bettered_certainty = 0.97

double	superscript_scaledown_ratio = 0.4

double	subscript_max_y_top = 0.5

double	superscript_min_y_bottom = 0.3

bool	tessedit_write_block_separators = false

bool	tessedit_write_rep_codes = false

bool	tessedit_write_unlv = false

bool	tessedit_create_txt = false

bool	tessedit_create_hocr = false

bool	tessedit_create_tsv = false

bool	tessedit_create_pdf = false

bool	textonly_pdf = false

char *	unrecognised_char = "|"

int	suspect_level = 99

int	suspect_space_level = 100

int	suspect_short_words = 2

bool	suspect_constrain_1Il = false

double	suspect_rating_per_ch = 999.9

double	suspect_accept_rating = -999.9

bool	tessedit_minimal_rejection = false

bool	tessedit_zero_rejection = false

bool	tessedit_word_for_word = false

bool	tessedit_zero_kelvin_rejection = false

bool	tessedit_consistent_reps = true

int	tessedit_reject_mode = 0

bool	tessedit_rejection_debug = false

bool	tessedit_flip_0O = true

double	tessedit_lower_flip_hyphen = 1.5

double	tessedit_upper_flip_hyphen = 1.8

bool	rej_trust_doc_dawg = false

bool	rej_1Il_use_dict_word = false

bool	rej_1Il_trust_permuter_type = true

bool	rej_use_tess_accepted = true

bool	rej_use_tess_blanks = true

bool	rej_use_good_perm = true

bool	rej_use_sensible_wd = false

bool	rej_alphas_in_number_perm = false

double	rej_whole_of_mostly_reject_word_fract = 0.85

int	tessedit_image_border = 2

char *	ok_repeated_ch_non_alphanum_wds = "-?*\075"

char *	conflict_set_I_l_1 = "Il1[]"

int	min_sane_x_ht_pixels = 8

bool	tessedit_create_boxfile = false

int	tessedit_page_number = -1

bool	tessedit_write_images = false

bool	interactive_display_mode = false

char *	file_type = ".tif"

bool	tessedit_override_permuter = true

char *	tessedit_load_sublangs = ""

bool	tessedit_use_primary_params_model = false

double	min_orientation_margin = 7.0

bool	textord_tabfind_show_vlines = false

bool	textord_use_cjk_fp_model = FALSE

bool	poly_allow_detailed_fx = false

bool	tessedit_init_config_only = false

bool	textord_equation_detect = false

bool	textord_tabfind_vertical_text = true

bool	textord_tabfind_force_vertical_text = false

double	textord_tabfind_vertical_text_ratio = 0.5

double	textord_tabfind_aligned_gap_fraction = 0.75

int	tessedit_parallelize = 0

bool	preserve_interword_spaces = false

bool	include_page_breaks = false

char *	page_separator = "\f"

bool	textord_tabfind_vertical_horizontal_mix = true

int	tessedit_ok_mode = 5

bool	load_fixed_length_dawgs = true

int	segment_debug = 0

bool	permute_debug = 0

double	bestrate_pruning_factor = 2.0

bool	permute_script_word = 0

bool	segment_segcost_rating = 0

double	segment_reward_script = 0.95

bool	permute_fixed_length_dawg = 0

bool	permute_chartype_word = 0

double	segment_reward_chartype = 0.97

double	segment_reward_ngram_best_choice = 0.99

bool	ngram_permuter_activated = false

bool	permute_only_top = false

int	language_model_fixed_length_choices_depth = 3

bool	use_new_state_cost = FALSE

double	heuristic_segcost_rating_base = 1.25

double	heuristic_weight_rating = 1

double	heuristic_weight_width = 1000.0

double	heuristic_weight_seamcut = 0

double	heuristic_max_char_wh_ratio = 2.0

bool	enable_new_segsearch = false

double	segsearch_max_fixed_pitch_char_wh_ratio = 2.0

Public Attributes inherited from tesseract::Wordrec
bool	merge_fragments_in_matrix = TRUE

bool	wordrec_no_block = FALSE

bool	wordrec_enable_assoc = TRUE

bool	force_word_assoc = FALSE

double	wordrec_worst_state = 1

bool	fragments_guide_chopper = FALSE

int	repair_unchopped_blobs = 1

double	tessedit_certainty_threshold = -2.25

int	chop_debug = 0

bool	chop_enable = 1

bool	chop_vertical_creep = 0

int	chop_split_length = 10000

int	chop_same_distance = 2

int	chop_min_outline_points = 6

int	chop_seam_pile_size = 150

bool	chop_new_seam_pile = 1

int	chop_inside_angle = -50

int	chop_min_outline_area = 2000

double	chop_split_dist_knob = 0.5

double	chop_overlap_knob = 0.9

double	chop_center_knob = 0.15

int	chop_centered_maxwidth = 90

double	chop_sharpness_knob = 0.06

double	chop_width_change_knob = 5.0

double	chop_ok_split = 100.0

double	chop_good_split = 50.0

int	chop_x_y_weight = 3

int	segment_adjust_debug = 0

bool	assume_fixed_pitch_char_segment = FALSE

int	wordrec_debug_level = 0

int	wordrec_max_join_chunks = 4

bool	wordrec_skip_no_truth_words = false

bool	wordrec_debug_blamer = false

bool	wordrec_run_blamer = false

int	segsearch_debug_level = 0

int	segsearch_max_pain_points = 2000

int	segsearch_max_futile_classifications = 10

double	segsearch_max_char_wh_ratio = 2.0

bool	save_alt_choices = true

LanguageModel *	language_model_

PRIORITY	pass2_ok_split

WERD_CHOICE *	prev_word_best_choice_

GenericVector< int >	blame_reasons_

void(Wordrec::*	fill_lattice_ )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)

Public Attributes inherited from tesseract::Classify
bool	allow_blob_division = true

bool	prioritize_division = FALSE

int	tessedit_single_match = FALSE

bool	classify_enable_learning = true

int	classify_debug_level = 0

int	classify_norm_method = character

double	classify_char_norm_range = 0.2

double	classify_min_norm_scale_x = 0.0

double	classify_max_norm_scale_x = 0.325

double	classify_min_norm_scale_y = 0.0

double	classify_max_norm_scale_y = 0.325

double	classify_max_rating_ratio = 1.5

double	classify_max_certainty_margin = 5.5

bool	tess_cn_matching = 0

bool	tess_bn_matching = 0

bool	classify_enable_adaptive_matcher = 1

bool	classify_use_pre_adapted_templates = 0

bool	classify_save_adapted_templates = 0

bool	classify_enable_adaptive_debugger = 0

bool	classify_nonlinear_norm = 0

int	matcher_debug_level = 0

int	matcher_debug_flags = 0

int	classify_learning_debug_level = 0

double	matcher_good_threshold = 0.125

double	matcher_reliable_adaptive_result = 0.0

double	matcher_perfect_threshold = 0.02

double	matcher_bad_match_pad = 0.15

double	matcher_rating_margin = 0.1

double	matcher_avg_noise_size = 12.0

int	matcher_permanent_classes_min = 1

int	matcher_min_examples_for_prototyping = 3

int	matcher_sufficient_examples_for_prototyping = 5

double	matcher_clustering_max_angle_delta = 0.015

double	classify_misfit_junk_penalty = 0.0

double	rating_scale = 1.5

double	certainty_scale = 20.0

double	tessedit_class_miss_scale = 0.00390625

double	classify_adapted_pruning_factor = 2.5

double	classify_adapted_pruning_threshold = -1.0

int	classify_adapt_proto_threshold = 230

int	classify_adapt_feature_threshold = 230

bool	disable_character_fragments = TRUE

double	classify_character_fragments_garbage_certainty_threshold = -3.0

bool	classify_debug_character_fragments = FALSE

bool	matcher_debug_separate_windows = FALSE

char *	classify_learn_debug_str = ""

int	classify_class_pruner_threshold = 229

int	classify_class_pruner_multiplier = 15

int	classify_cp_cutoff_strength = 7

int	classify_integer_matcher_multiplier = 10

INT_TEMPLATES	PreTrainedTemplates

ADAPT_TEMPLATES	AdaptedTemplates

ADAPT_TEMPLATES	BackupAdaptedTemplates

BIT_VECTOR	AllProtosOn

BIT_VECTOR	AllConfigsOn

BIT_VECTOR	AllConfigsOff

BIT_VECTOR	TempProtoMask

bool	EnableLearning

NORM_PROTOS *	NormProtos

UnicityTable< FontInfo >	fontinfo_table_

UnicityTable< FontSet >	fontset_table_

int	il1_adaption_test = 0

bool	classify_bln_numeric_mode = 0

double	speckle_large_max_size = 0.30

double	speckle_rating_penalty = 10.0

Public Attributes inherited from tesseract::CCUtil
STRING	datadir

STRING	imagebasename

STRING	lang

STRING	language_data_path_prefix

UNICHARSET	unicharset

UnicharAmbigs	unichar_ambigs

STRING	imagefile

STRING	directory

char *	m_data_sub_dir = "tessdata/"

int	ambigs_debug_level = 0

bool	use_definite_ambigs_for_classifier = 0

bool	use_ambigs_for_adaption = 0

Additional Inherited Members
Static Public Member Functions inherited from tesseract::Classify
static void	SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)

static void	ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)

Static Public Attributes inherited from tesseract::CCStruct
static const double	kDescenderFraction = 0.25

static const double	kXHeightFraction = 0.5

static const double	kAscenderFraction = 0.25

static const double	kXHeightCapRatio

Protected Member Functions inherited from tesseract::Wordrec
bool	SegSearchDone (int num_futile_classifications)

void	UpdateSegSearchNodes (float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)

void	ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)

void	ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)

void	InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)

Protected Attributes inherited from tesseract::Classify
IntegerMatcher	im_

FEATURE_DEFS_STRUCT	feature_defs_

ShapeTable *	shape_table_

Detailed Description

Definition at line 164 of file tesseractclass.h.

Constructor & Destructor Documentation

◆ Tesseract()

tesseract::Tesseract::Tesseract ( )

Definition at line 54 of file tesseractclass.cpp.

     : BOOL_MEMBER(tessedit_resegment_from_boxes, false,
                   "Take segmentation and labeling from box file",
                   this->params()),
       BOOL_MEMBER(tessedit_resegment_from_line_boxes, false,
                   "Conversion of word/line box file to char box file",
                   this->params()),
       BOOL_MEMBER(tessedit_train_from_boxes, false,
                   "Generate training data from boxed chars", this->params()),
       BOOL_MEMBER(tessedit_make_boxes_from_boxes, false,
                   "Generate more boxes from boxed chars", this->params()),
       BOOL_MEMBER(tessedit_train_line_recognizer, false,
                   "Break input into lines and remap boxes if present",
                   this->params()),
       BOOL_MEMBER(tessedit_dump_pageseg_images, false,
                   "Dump intermediate images made during page segmentation",
                   this->params()),
       // The default for pageseg_mode is the old behaviour, so as not to
       // upset anything that relies on that.
       INT_MEMBER(
           tessedit_pageseg_mode, PSM_SINGLE_BLOCK,
           "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
           " 5=line, 6=word, 7=char"
           " (Values from PageSegMode enum in publictypes.h)",
           this->params()),
       INT_INIT_MEMBER(tessedit_ocr_engine_mode, tesseract::OEM_DEFAULT,
                       "Which OCR engine(s) to run (Tesseract, LSTM, both)."
                       " Defaults to loading and running the most accurate"
                       " available.",
                       this->params()),
       STRING_MEMBER(tessedit_char_blacklist, "",
                     "Blacklist of chars not to recognize", this->params()),
       STRING_MEMBER(tessedit_char_whitelist, "",
                     "Whitelist of chars to recognize", this->params()),
       STRING_MEMBER(tessedit_char_unblacklist, "",
                     "List of chars to override tessedit_char_blacklist",
                     this->params()),
       BOOL_MEMBER(tessedit_ambigs_training, false,
                   "Perform training for ambiguities", this->params()),
       INT_MEMBER(pageseg_devanagari_split_strategy,
                  tesseract::ShiroRekhaSplitter::NO_SPLIT,
                  "Whether to use the top-line splitting process for Devanagari "
                  "documents while performing page-segmentation.",
                  this->params()),
       INT_MEMBER(ocr_devanagari_split_strategy,
                  tesseract::ShiroRekhaSplitter::NO_SPLIT,
                  "Whether to use the top-line splitting process for Devanagari "
                  "documents while performing ocr.",
                  this->params()),
       STRING_MEMBER(tessedit_write_params_to_file, "",
                     "Write all parameters to the given file.", this->params()),
       BOOL_MEMBER(tessedit_adaption_debug, false,
                   "Generate and print debug"
                   " information for adaption",
                   this->params()),
       INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()),
       INT_MEMBER(applybox_debug, 1, "Debug level", this->params()),
       INT_MEMBER(applybox_page, 0, "Page number to apply boxes from",
                  this->params()),
       STRING_MEMBER(applybox_exposure_pattern, ".exp",
                     "Exposure value follows"
                     " this pattern in the image filename. The name of the image"
                     " files are expected to be in the form"
                     " [lang].[fontname].exp[num].tif",
                     this->params()),
       BOOL_MEMBER(applybox_learn_chars_and_char_frags_mode, false,
                   "Learn both character fragments (as is done in the"
                   " special low exposure mode) as well as unfragmented"
                   " characters.",
                   this->params()),
       BOOL_MEMBER(applybox_learn_ngrams_mode, false,
                   "Each bounding box"
                   " is assumed to contain ngrams. Only learn the ngrams"
                   " whose outlines overlap horizontally.",
                   this->params()),
       BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words",
                   this->params()),
       BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices",
                   this->params()),
       BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats",
                   this->params()),
       BOOL_MEMBER(tessedit_fix_fuzzy_spaces, true,
                   "Try to improve fuzzy spaces", this->params()),
       BOOL_MEMBER(tessedit_unrej_any_wd, false,
                   "Don't bother with word plausibility", this->params()),
       BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?",
                   this->params()),
       BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height",
                   this->params()),
       BOOL_MEMBER(tessedit_enable_doc_dict, true,
                   "Add words to the document dictionary", this->params()),
       BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char",
                   this->params()),
       BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats",
                   this->params()),
       BOOL_MEMBER(tessedit_enable_bigram_correction, true,
                   "Enable correction based on the word bigram dictionary.",
                   this->params()),
       BOOL_MEMBER(tessedit_enable_dict_correction, false,
                   "Enable single word correction based on the dictionary.",
                   this->params()),
       INT_MEMBER(tessedit_bigram_debug, 0,
                  "Amount of debug output for bigram correction.",
                  this->params()),
       BOOL_MEMBER(enable_noise_removal, true,
                   "Remove and conditionally reassign small outlines when they"
                   " confuse layout analysis, determining diacritics vs noise",
                   this->params()),
       INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines",
                  this->params()),
       // Worst (min) certainty, for which a diacritic is allowed to make the
       // base
       // character worse and still be included.
       double_MEMBER(noise_cert_basechar, -8.0,
                     "Hingepoint for base char certainty", this->params()),
       // Worst (min) certainty, for which a non-overlapping diacritic is allowed
       // to make the base character worse and still be included.
       double_MEMBER(noise_cert_disjoint, -1.0,
                     "Hingepoint for disjoint certainty", this->params()),
       // Worst (min) certainty, for which a diacritic is allowed to make a new
       // stand-alone blob.
       double_MEMBER(noise_cert_punc, -3.0,
                     "Threshold for new punc char certainty", this->params()),
       // Factor of certainty margin for adding diacritics to not count as worse.
       double_MEMBER(noise_cert_factor, 0.375,
                     "Scaling on certainty diff from Hingepoint",
                     this->params()),
       INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob",
                  this->params()),
       INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word",
                  this->params()),
       INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()),
       BOOL_MEMBER(debug_acceptable_wds, false, "Dump word pass/fail chk",
                   this->params()),
       STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation",
                     this->params()),
       STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation",
                     this->params()),
       STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation",
                     this->params()),
       double_MEMBER(quality_rej_pc, 0.08,
                     "good_quality_doc lte rejection limit", this->params()),
       double_MEMBER(quality_blob_pc, 0.0,
                     "good_quality_doc gte good blobs limit", this->params()),
       double_MEMBER(quality_outline_pc, 1.0,
                     "good_quality_doc lte outline error limit", this->params()),
       double_MEMBER(quality_char_pc, 0.95,
                     "good_quality_doc gte good char limit", this->params()),
       INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word",
                  this->params()),
       INT_MEMBER(tessedit_tess_adaption_mode, 0x27,
                  "Adaptation decision algorithm for tess", this->params()),
       BOOL_MEMBER(tessedit_minimal_rej_pass1, false,
                   "Do minimal rejection on pass 1 output", this->params()),
       BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria",
                   this->params()),
       BOOL_MEMBER(tessedit_matcher_log, false, "Log matcher activity",
                   this->params()),
       INT_MEMBER(tessedit_test_adaption_mode, 3,
                  "Adaptation decision algorithm for tess", this->params()),
       BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
       double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
       double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
       INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.",
                  this->params()),
       INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
                  this->params()),
       BOOL_MEMBER(paragraph_text_based, true,
                   "Run paragraph detection on the post-text-recognition "
                   "(more accurate)",
                   this->params()),
       BOOL_MEMBER(lstm_use_matrix, 1,
                   "Use ratings matrix/beam search with lstm", this->params()),
       STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
                     this->params()),
       STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines",
                     this->params()),
       BOOL_MEMBER(docqual_excuse_outline_errs, false,
                   "Allow outline errs in unrejection?", this->params()),
       BOOL_MEMBER(tessedit_good_quality_unrej, true,
                   "Reduce rejection on good docs", this->params()),
       BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?",
                   this->params()),
       double_MEMBER(tessedit_reject_doc_percent, 65.00,
                     "%rej allowed before rej whole doc", this->params()),
       double_MEMBER(tessedit_reject_block_percent, 45.00,
                     "%rej allowed before rej whole block", this->params()),
       double_MEMBER(tessedit_reject_row_percent, 40.00,
                     "%rej allowed before rej whole row", this->params()),
       double_MEMBER(tessedit_whole_wd_rej_row_percent, 70.00,
                     "Number of row rejects in whole word rejects"
                     "which prevents whole row rejection",
                     this->params()),
       BOOL_MEMBER(tessedit_preserve_blk_rej_perfect_wds, true,
                   "Only rej partially rejected words in block rejection",
                   this->params()),
       BOOL_MEMBER(tessedit_preserve_row_rej_perfect_wds, true,
                   "Only rej partially rejected words in row rejection",
                   this->params()),
       BOOL_MEMBER(tessedit_dont_blkrej_good_wds, false,
                   "Use word segmentation quality metric", this->params()),
       BOOL_MEMBER(tessedit_dont_rowrej_good_wds, false,
                   "Use word segmentation quality metric", this->params()),
       INT_MEMBER(tessedit_preserve_min_wd_len, 2,
                  "Only preserve wds longer than this", this->params()),
       BOOL_MEMBER(tessedit_row_rej_good_docs, true,
                   "Apply row rejection to good docs", this->params()),
       double_MEMBER(tessedit_good_doc_still_rowrej_wd, 1.1,
                     "rej good doc wd if more than this fraction rejected",
                     this->params()),
       BOOL_MEMBER(tessedit_reject_bad_qual_wds, true,
                   "Reject all bad quality wds", this->params()),
       BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats",
                   this->params()),
       BOOL_MEMBER(tessedit_debug_quality_metrics, false,
                   "Output data to debug file", this->params()),
       BOOL_MEMBER(bland_unrej, false, "unrej potential with no checks",
                   this->params()),
       double_MEMBER(quality_rowrej_pc, 1.1,
                     "good_quality_doc gte good char limit", this->params()),
       BOOL_MEMBER(unlv_tilde_crunching, true,
                   "Mark v.bad words for tilde crunch", this->params()),
       BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
                   this->params()),
       BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
                   this->params()),
       BOOL_MEMBER(crunch_early_convert_bad_unlv_chs, false,
                   "Take out ~^ early?", this->params()),
       double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this",
                     this->params()),
       BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()),
       double_MEMBER(crunch_poor_garbage_cert, -9.0,
                     "crunch garbage cert lt this", this->params()),
       double_MEMBER(crunch_poor_garbage_rate, 60,
                     "crunch garbage rating lt this", this->params()),
       double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this",
                     this->params()),
       double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this",
                     this->params()),
       BOOL_MEMBER(crunch_pot_garbage, true, "POTENTIAL crunch garbage",
                   this->params()),
       double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this",
                     this->params()),
       double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this",
                     this->params()),
       double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this",
                     this->params()),
       double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this",
                     this->params()),
       double_MEMBER(crunch_del_min_width, 3.0,
                     "Del if word width lt xht x this", this->params()),
       double_MEMBER(crunch_del_high_word, 1.5,
                     "Del if word gt xht x this above bl", this->params()),
       double_MEMBER(crunch_del_low_word, 0.5,
                     "Del if word gt xht x this below bl", this->params()),
       double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this",
                     this->params()),
       INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch",
                  this->params()),
       INT_MEMBER(crunch_pot_indicators, 1,
                  "How many potential indicators needed", this->params()),
       BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings",
                   this->params()),
       BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring",
                   this->params()),
       BOOL_MEMBER(crunch_leave_accept_strings, false,
                   "Don't pot crunch sensible strings", this->params()),
       BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures",
                   this->params()),
       INT_MEMBER(crunch_leave_lc_strings, 4,
                  "Don't crunch words with long lower case strings",
                  this->params()),
       INT_MEMBER(crunch_leave_uc_strings, 4,
                  "Don't crunch words with long lower case strings",
                  this->params()),
       INT_MEMBER(crunch_long_repetitions, 3,
                  "Crunch words with long repetitions", this->params()),
       INT_MEMBER(crunch_debug, 0, "As it says", this->params()),
       INT_MEMBER(fixsp_non_noise_limit, 1,
                  "How many non-noise blbs either side?", this->params()),
       double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this",
                     this->params()),
       BOOL_MEMBER(tessedit_prefer_joined_punct, false,
                   "Reward punctation joins", this->params()),
       INT_MEMBER(fixsp_done_mode, 1, "What constitues done for spacing",
                  this->params()),
       INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug",
                  this->params()),
       STRING_MEMBER(numeric_punctuation, ".,",
                     "Punct. chs expected WITHIN numbers", this->params()),
       INT_MEMBER(x_ht_acceptance_tolerance, 8,
                  "Max allowed deviation of blob top outside of font data",
                  this->params()),
       INT_MEMBER(x_ht_min_change, 8,
                  "Min change in xht before actually trying it", this->params()),
       INT_MEMBER(superscript_debug, 0,
                  "Debug level for sub & superscript fixer", this->params()),
       double_MEMBER(
           superscript_worse_certainty, 2.0,
           "How many times worse "
           "certainty does a superscript position glyph need to be for "
           "us to try classifying it as a char with a different "
           "baseline?",
           this->params()),
       double_MEMBER(
           superscript_bettered_certainty, 0.97,
           "What reduction in "
           "badness do we think sufficient to choose a superscript "
           "over what we'd thought.  For example, a value of 0.6 means "
           "we want to reduce badness of certainty by at least 40%",
           this->params()),
       double_MEMBER(superscript_scaledown_ratio, 0.4,
                     "A superscript scaled down more than this is unbelievably "
                     "small.  For example, 0.3 means we expect the font size to "
                     "be no smaller than 30% of the text line font size.",
                     this->params()),
       double_MEMBER(subscript_max_y_top, 0.5,
                     "Maximum top of a character measured as a multiple of "
                     "x-height above the baseline for us to reconsider whether "
                     "it's a subscript.",
                     this->params()),
       double_MEMBER(superscript_min_y_bottom, 0.3,
                     "Minimum bottom of a character measured as a multiple of "
                     "x-height above the baseline for us to reconsider whether "
                     "it's a superscript.",
                     this->params()),
       BOOL_MEMBER(tessedit_write_block_separators, false,
                   "Write block separators in output", this->params()),
       BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code",
                   this->params()),
       BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file",
                   this->params()),
       BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file",
                   this->params()),
       BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
                   this->params()),
       BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
                   this->params()),
       BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
                   this->params()),
       BOOL_MEMBER(textonly_pdf, false,
                   "Create PDF with only one invisible text layer",
                   this->params()),
       STRING_MEMBER(unrecognised_char, "|",
                     "Output char for unidentified blobs", this->params()),
       INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
       INT_MEMBER(suspect_space_level, 100,
                  "Min suspect level for rejecting spaces", this->params()),
       INT_MEMBER(suspect_short_words, 2,
                  "Don't suspect dict wds longer than this", this->params()),
       BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected",
                   this->params()),
       double_MEMBER(suspect_rating_per_ch, 999.9,
                     "Don't touch bad rating limit", this->params()),
       double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit",
                     this->params()),
       BOOL_MEMBER(tessedit_minimal_rejection, false,
                   "Only reject tess failures", this->params()),
       BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING",
                   this->params()),
       BOOL_MEMBER(tessedit_word_for_word, false,
                   "Make output have exactly one word per WERD", this->params()),
       BOOL_MEMBER(tessedit_zero_kelvin_rejection, false,
                   "Don't reject ANYTHING AT ALL", this->params()),
       BOOL_MEMBER(tessedit_consistent_reps, true,
                   "Force all rep chars the same", this->params()),
       INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm",
                  this->params()),
       BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug",
                   this->params()),
       BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips",
                   this->params()),
       double_MEMBER(tessedit_lower_flip_hyphen, 1.5,
                     "Aspect ratio dot/hyphen test", this->params()),
       double_MEMBER(tessedit_upper_flip_hyphen, 1.8,
                     "Aspect ratio dot/hyphen test", this->params()),
       BOOL_MEMBER(rej_trust_doc_dawg, false,
                   "Use DOC dawg in 11l conf. detector", this->params()),
       BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test",
                   this->params()),
       BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check",
                   this->params()),
       BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control",
                   this->params()),
       BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control",
                   this->params()),
       BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control",
                   this->params()),
       BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check",
                   this->params()),
       BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check",
                   this->params()),
       double_MEMBER(rej_whole_of_mostly_reject_word_fract, 0.85,
                     "if >this fract", this->params()),
       INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit",
                  this->params()),
       STRING_MEMBER(ok_repeated_ch_non_alphanum_wds, "-?*\075",
                     "Allow NN to unrej", this->params()),
       STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set",
                     this->params()),
       INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this",
                  this->params()),
       BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes",
                   this->params()),
       INT_MEMBER(tessedit_page_number, -1,
                  "-1 -> All pages"
                  " , else specific page to process",
                  this->params()),
       BOOL_MEMBER(tessedit_write_images, false,
                   "Capture the image from the IPE", this->params()),
       BOOL_MEMBER(interactive_display_mode, false, "Run interactively?",
                   this->params()),
       STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
       BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word",
                   this->params()),
       STRING_MEMBER(tessedit_load_sublangs, "",
                     "List of languages to load with this one", this->params()),
       BOOL_MEMBER(tessedit_use_primary_params_model, false,
                   "In multilingual mode use params model of the"
                   " primary language",
                   this->params()),
       double_MEMBER(min_orientation_margin, 7.0,
                     "Min acceptable orientation margin", this->params()),
       BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding",
                   this->params()),
       BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model",
                   this->params()),
       BOOL_MEMBER(poly_allow_detailed_fx, false,
                   "Allow feature extractors to see the original outline",
                   this->params()),
       BOOL_INIT_MEMBER(tessedit_init_config_only, false,
                        "Only initialize with the config file. Useful if the "
                        "instance is not going to be used for OCR but say only "
                        "for layout analysis.",
                        this->params()),
       BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
                   this->params()),
       BOOL_MEMBER(textord_tabfind_vertical_text, true,
                   "Enable vertical detection", this->params()),
       BOOL_MEMBER(textord_tabfind_force_vertical_text, false,
                   "Force using vertical text page mode", this->params()),
       double_MEMBER(
           textord_tabfind_vertical_text_ratio, 0.5,
           "Fraction of textlines deemed vertical to use vertical page "
           "mode",
           this->params()),
       double_MEMBER(
           textord_tabfind_aligned_gap_fraction, 0.75,
           "Fraction of height used as a minimum gap for aligned blobs.",
           this->params()),
       INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
                  this->params()),
       BOOL_MEMBER(preserve_interword_spaces, false,
                   "Preserve multiple interword spaces", this->params()),
       BOOL_MEMBER(include_page_breaks, FALSE,
                   "Include page separator string in output text after each "
                   "image/page.",
                   this->params()),
       STRING_MEMBER(page_separator, "\f",
                     "Page separator (default is form feed control character)",
                     this->params()),
 
       // The following parameters were deprecated and removed from their
       // original
       // locations. The parameters are temporarily kept here to give Tesseract
       // users a chance to updated their [lang].traineddata and config files
       // without introducing failures during Tesseract initialization.
       // TODO(ocr-team): remove these parameters from the code once we are
       // reasonably sure that Tesseract users have updated their data files.
       //
       // BEGIN DEPRECATED PARAMETERS
       BOOL_MEMBER(textord_tabfind_vertical_horizontal_mix, true,
                   "find horizontal lines such as headers in vertical page mode",
                   this->params()),
       INT_MEMBER(tessedit_ok_mode, 5, "Acceptance decision algorithm",
                  this->params()),
       BOOL_INIT_MEMBER(load_fixed_length_dawgs, true,
                        "Load fixed length dawgs"
                        " (e.g. for non-space delimited languages)",
                        this->params()),
       INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process",
                  this->params()),
       BOOL_MEMBER(permute_debug, 0, "Debug char permutation process",
                   this->params()),
       double_MEMBER(bestrate_pruning_factor, 2.0,
                     "Multiplying factor of"
                     " current best rate to prune other hypotheses",
                     this->params()),
       BOOL_MEMBER(permute_script_word, 0,
                   "Turn on word script consistency permuter", this->params()),
       BOOL_MEMBER(segment_segcost_rating, 0,
                   "incorporate segmentation cost in word rating?",
                   this->params()),
       double_MEMBER(segment_reward_script, 0.95,
                     "Score multipler for script consistency within a word. "
                     "Being a 'reward' factor, it should be <= 1. "
                     "Smaller value implies bigger reward.",
                     this->params()),
       BOOL_MEMBER(permute_fixed_length_dawg, 0,
                   "Turn on fixed-length phrasebook search permuter",
                   this->params()),
       BOOL_MEMBER(permute_chartype_word, 0,
                   "Turn on character type (property) consistency permuter",
                   this->params()),
       double_MEMBER(segment_reward_chartype, 0.97,
                     "Score multipler for char type consistency within a word. ",
                     this->params()),
       double_MEMBER(segment_reward_ngram_best_choice, 0.99,
                     "Score multipler for ngram permuter's best choice"
                     " (only used in the Han script path).",
                     this->params()),
       BOOL_MEMBER(ngram_permuter_activated, false,
                   "Activate character-level n-gram-based permuter",
                   this->params()),
       BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter",
                   this->params()),
       INT_MEMBER(language_model_fixed_length_choices_depth, 3,
                  "Depth of blob choice lists to explore"
                  " when fixed length dawgs are on",
                  this->params()),
       BOOL_MEMBER(use_new_state_cost, FALSE,
                   "use new state cost heuristics for segmentation state"
                   " evaluation",
                   this->params()),
       double_MEMBER(heuristic_segcost_rating_base, 1.25,
                     "base factor for adding segmentation cost into word rating."
                     "It's a multiplying factor, the larger the value above 1, "
                     "the bigger the effect of segmentation cost.",
                     this->params()),
       double_MEMBER(heuristic_weight_rating, 1.0,
                     "weight associated with char rating in combined cost of"
                     "state",
                     this->params()),
       double_MEMBER(heuristic_weight_width, 1000.0,
                     "weight associated with width evidence in combined cost of"
                     " state",
                     this->params()),
       double_MEMBER(heuristic_weight_seamcut, 0.0,
                     "weight associated with seam cut in combined cost of state",
                     this->params()),
       double_MEMBER(heuristic_max_char_wh_ratio, 2.0,
                     "max char width-to-height ratio allowed in segmentation",
                     this->params()),
       BOOL_MEMBER(enable_new_segsearch, true,
                   "Enable new segmentation search path.", this->params()),
       double_MEMBER(segsearch_max_fixed_pitch_char_wh_ratio, 2.0,
                     "Maximum character width-to-height ratio for"
                     " fixed-pitch fonts",
                     this->params()),
       // END DEPRECATED PARAMETERS
 
       backup_config_file_(NULL),
       pix_binary_(NULL),
       pix_grey_(NULL),
       pix_original_(NULL),
       pix_thresholds_(NULL),
       source_resolution_(0),
       textord_(this),
       right_to_left_(false),
       scaled_color_(NULL),
       scaled_factor_(-1),
       deskew_(1.0f, 0.0f),
       reskew_(1.0f, 0.0f),
       most_recently_used_(this),
       font_table_size_(0),
       equ_detect_(NULL),
 #ifndef ANDROID_BUILD
       lstm_recognizer_(NULL),
 #endif
       train_line_page_num_(0) {
 }

◆ ~Tesseract()

tesseract::Tesseract::~Tesseract ( )

Definition at line 626 of file tesseractclass.cpp.

                       {
   // Destructor body: tears the instance down in the fixed order below.
   Clear();
   // The original image is destroyed explicitly here, after Clear() has run.
   pixDestroy(&pix_original_);
   end_tesseract();
   // presumably frees every sub-language instance held in sub_langs_ —
   // TODO confirm against delete_data_pointers().
   sub_langs_.delete_data_pointers();
 #ifndef ANDROID_BUILD
   // lstm_recognizer_ is only handled when ANDROID_BUILD is not defined;
   // the pointer is reset to NULL after deletion.
   delete lstm_recognizer_;
   lstm_recognizer_ = NULL;
 #endif
 }

Member Function Documentation

◆ acceptable_number_string()

BOOL8 tesseract::Tesseract::acceptable_number_string	(	const char *	s,
		const char *	lengths
	)

Definition at line 419 of file output.cpp.

                                                                {
   BOOL8 prev_digit = FALSE;
 
   if (*lengths == 1 && *s == '(')
     s++;
 
   if (*lengths == 1 &&
       ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
     s++;
 
   for (; *s != '\0'; s += *(lengths++)) {
     if (unicharset.get_isdigit(s, *lengths))
       prev_digit = TRUE;
     else if (prev_digit &&
              (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
       prev_digit = FALSE;
     else if (prev_digit && *lengths == 1 &&
              (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
       return TRUE;
     else if (prev_digit &&
              *lengths == 1 && (*s == '%') &&
              (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
              (*(s + *lengths + *(lengths + 1)) == '\0'))
       return TRUE;
     else
       return FALSE;
   }
   return TRUE;
 }

◆ acceptable_word_string()

ACCEPTABLE_WERD_TYPE tesseract::Tesseract::acceptable_word_string	(	const UNICHARSET &	char_set,
		const char *	s,
		const char *	lengths
	)

Definition at line 1690 of file control.cpp.

                                                                    {
   // Classifies the text s as one of the ACCEPTABLE_WERD_TYPE values
   // (AC_UPPER_CASE, AC_INITIAL_CAP, AC_LOWER_CASE, AC_UC_ABBREV,
   // AC_LC_ABBREV) or AC_UNACCEPTABLE.
   // lengths[i] is the number of chars of s taken by the i-th unichar; it is
   // what offset advances by, so i counts unichars and offset counts chars.
   int i = 0;
   int offset = 0;
   int leading_punct_count;
   int upper_count = 0;
   int hyphen_pos = -1;
   ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
 
   // More than 20 entries in lengths is rejected outright.
   if (strlen (lengths) > 20)
     return word_type;
 
   /* Single Leading punctuation char*/
 
   if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
     offset += lengths[i++];
   // 0 or 1, depending on whether a leading punctuation char was consumed.
   leading_punct_count = i;
 
   /* Initial cap */
   while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
     offset += lengths[i++];
     upper_count++;
   }
   if (upper_count > 1) {
     // Two or more leading upper-case unichars: treat as an upper-case word.
     word_type = AC_UPPER_CASE;
   } else {
     /* Lower case word, possibly with an initial cap */
     while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
       offset += lengths[i++];
     }
     // Too few letters after the optional leading punctuation: fall through
     // to the abbreviation test with word_type still AC_UNACCEPTABLE.
     if (i - leading_punct_count < quality_min_initial_alphas_reqd)
       goto not_a_word;
     /*
     Allow a single hyphen in a lower case word
     - don't trust upper case - I've seen several cases of "H" -> "I-I"
     */
     if (lengths[i] == 1 && s[offset] == '-') {
       hyphen_pos = i;
       offset += lengths[i++];
       if (s[offset] != '\0') {
         while ((s[offset] != '\0') &&
                char_set.get_islower(s + offset, lengths[i])) {
           offset += lengths[i++];
         }
         // Require at least two lower-case unichars after the hyphen.
         if (i < hyphen_pos + 3)
           goto not_a_word;
       }
     } else {
       /* Allow "'s" in NON hyphenated lower case words */
       if (lengths[i] == 1 && (s[offset] == '\'') &&
           lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
         offset += lengths[i++];
         offset += lengths[i++];
       }
     }
     if (upper_count > 0)
       word_type = AC_INITIAL_CAP;
     else
       word_type = AC_LOWER_CASE;
   }
 
   /* Up to two different, constrained trailing punctuation chars */
   if (lengths[i] == 1 && s[offset] != '\0' &&
       STRING(chs_trailing_punct1).contains(s[offset]))
     offset += lengths[i++];
   // The second trailing char must differ from the char just before it.
   if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
       s[offset - lengths[i - 1]] != s[offset] &&
       STRING(chs_trailing_punct2).contains (s[offset]))
     offset += lengths[i++];
 
   // Anything left unconsumed means the word pattern did not match.
   if (s[offset] != '\0')
     word_type = AC_UNACCEPTABLE;
 
   not_a_word:
 
   if (word_type == AC_UNACCEPTABLE) {
     /* Look for abbreviation string */
     // Restart from the beginning and try to match repeated
     // "<letter>." pairs, all upper case or all lower case.
     i = 0;
     offset = 0;
     if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
       word_type = AC_UC_ABBREV;
       while (s[offset] != '\0' &&
              char_set.get_isupper(s + offset, lengths[i]) &&
              lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
         offset += lengths[i++];
         offset += lengths[i++];
       }
     }
     else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
       word_type = AC_LC_ABBREV;
       while (s[offset] != '\0' &&
              char_set.get_islower(s + offset, lengths[i]) &&
              lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
         offset += lengths[i++];
         offset += lengths[i++];
       }
     }
     // The abbreviation pattern must consume the whole string as well.
     if (s[offset] != '\0')
       word_type = AC_UNACCEPTABLE;
   }
 
   return word_type;
 }

◆ alpha_count()

inT16 tesseract::Tesseract::alpha_count	(	const char *	word,
		const char *	word_lengths
	)

Definition at line 495 of file reject.cpp.

                                                        {
   inT16 i;
   inT16 offset;
   inT16 count = 0;
 
   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
     if (unicharset.get_isalpha (word + offset, word_lengths[i]))
       count++;
   }
   return count;
 }

◆ ambigs_classify_and_output()

void tesseract::Tesseract::ambigs_classify_and_output	(	const char *	label,
		PAGE_RES_IT *	pr_it,
		FILE *	output_file
	)

Definition at line 202 of file recogtraining.cpp.

                                                               {
   // Classify word.
   fflush(stdout);
   WordData word_data(*pr_it);
   SetupWordPassN(1, &word_data);
   classify_word_and_language(1, pr_it, &word_data);
   WERD_RES* werd_res = word_data.word;
   WERD_CHOICE *best_choice = werd_res->best_choice;
   ASSERT_HOST(best_choice != NULL);
 
   // Compute the number of unichars in the label.
   GenericVector<UNICHAR_ID> encoding;
   if (!unicharset.encode_string(label, true, &encoding, NULL, NULL)) {
     tprintf("Not outputting illegal unichar %s\n", label);
     return;
   }
 
   // Dump all paths through the ratings matrix (which is normally small).
   int dim = werd_res->ratings->dimension();
   const BLOB_CHOICE** blob_choices = new const BLOB_CHOICE*[dim];
   PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices,
                    unicharset, label, output_file);
   delete [] blob_choices;
 }

◆ AnyLSTMLang()

bool tesseract::Tesseract::AnyLSTMLang ( ) const

inline

Definition at line 268 of file tesseractclass.h.

                            {
     if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) return true;
     for (int i = 0; i < sub_langs_.size(); ++i) {
       if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY)
         return true;
     }
     return false;
   }

◆ AnyTessLang()

bool tesseract::Tesseract::AnyTessLang ( ) const

inline

Definition at line 260 of file tesseractclass.h.

                            {
     if (tessedit_ocr_engine_mode != OEM_LSTM_ONLY) return true;
     for (int i = 0; i < sub_langs_.size(); ++i) {
       if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) return true;
     }
     return false;
   }

◆ ApplyBoxes()

PAGE_RES * tesseract::Tesseract::ApplyBoxes	(	const STRING &	fname,
		bool	find_segmentation,
		BLOCK_LIST *	block_list
	)

Definition at line 117 of file applybox.cpp.

                                                         {
   GenericVector<TBOX> boxes;
   GenericVector<STRING> texts, full_texts;
   if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
                     NULL)) {
     return NULL;  // Can't do it.
   }
 
   int box_count = boxes.size();
   int box_failures = 0;
   // Add an empty everything to the end.
   boxes.push_back(TBOX());
   texts.push_back(STRING());
   full_texts.push_back(STRING());
 
   // In word mode, we use the boxes to make a word for each box, but
   // in blob mode we use the existing words and maximally chop them first.
   PAGE_RES* page_res = find_segmentation ?
       NULL : SetupApplyBoxes(boxes, block_list);
   clear_any_old_text(block_list);
 
   for (int i = 0; i < boxes.size() - 1; i++) {
     bool foundit = false;
     if (page_res != NULL) {
       if (i == 0) {
         foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1],
                                    full_texts[i].string());
       } else {
         foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i],
                                    boxes[i + 1], full_texts[i].string());
       }
     } else {
       foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1],
                                  texts[i].string());
     }
     if (!foundit) {
       box_failures++;
       ReportFailedBox(i, boxes[i], texts[i].string(),
                       "FAILURE! Couldn't find a matching blob");
     }
   }
 
   if (page_res == NULL) {
     // In word/line mode, we now maximally chop all the words and resegment
     // them with the classifier.
     page_res = SetupApplyBoxes(boxes, block_list);
     ReSegmentByClassification(page_res);
   }
   if (applybox_debug > 0) {
     tprintf("APPLY_BOXES:\n");
     tprintf("   Boxes read from boxfile:  %6d\n", box_count);
     if (box_failures > 0)
       tprintf("   Boxes failed resegmentation:  %6d\n", box_failures);
   }
   TidyUp(page_res);
   return page_res;
 }

◆ ApplyBoxTraining()

void tesseract::Tesseract::ApplyBoxTraining	(	const STRING &	fontname,
		PAGE_RES *	page_res
	)

Calls LearnWord to extract features for labelled blobs within each word. Features are stored in an internal buffer.

Definition at line 796 of file applybox.cpp.

                                                                            {
   PAGE_RES_IT pr_it(page_res);
   int word_count = 0;
   for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
        word_res = pr_it.forward()) {
     LearnWord(fontname.string(), word_res);
     ++word_count;
   }
   tprintf("Generated training data for %d words\n", word_count);
 }

◆ AssignDiacriticsToNewBlobs()

void tesseract::Tesseract::AssignDiacriticsToNewBlobs	(	const GenericVector< C_OUTLINE *> &	outlines,
		int	pass,
		WERD *	real_word,
		PAGE_RES_IT *	pr_it,
		GenericVector< bool > *	word_wanted,
		GenericVector< C_BLOB *> *	target_blobs
	)

Definition at line 1046 of file control.cpp.

                                          {
   // Handles runs of consecutive non-NULL entries in outlines. For each run
   // it tries, in order, to attach the selected outlines to the blob on the
   // left, to the blob on the right, or to no blob at all (target NULL).
   // Results are written per outline index into *word_wanted and
   // *target_blobs, both resized to outlines.size() first.
   GenericVector<bool> blob_wanted;
   word_wanted->init_to_size(outlines.size(), false);
   target_blobs->init_to_size(outlines.size(), NULL);
   // Check for outlines that need to be turned into stand-alone blobs.
   for (int i = 0; i < outlines.size(); ++i) {
     if (outlines[i] == NULL) continue;
     // Get a set of adjacent outlines that don't overlap any existing blob.
     blob_wanted.init_to_size(outlines.size(), false);
     int num_blob_outlines = 0;
     TBOX total_ol_box(outlines[i]->bounding_box());
     // This inner loop advances the outer index i past the whole run, so the
     // outer ++i then steps over the NULL entry (or the end) that stopped it.
     while (i < outlines.size() && outlines[i] != NULL) {
       blob_wanted[i] = true;
       total_ol_box += outlines[i]->bounding_box();
       ++i;
       ++num_blob_outlines;
     }
     // Find the insertion point.
     // Moves forward while the next blob still starts at or left of the
     // run's left edge.
     C_BLOB_IT blob_it(real_word->cblob_list());
     while (!blob_it.at_last() &&
            blob_it.data_relative(1)->bounding_box().left() <=
                total_ol_box.left()) {
       blob_it.forward();
     }
     // Choose which combination of them we actually want and where to put
     // them.
     if (debug_noise_removal)
       tprintf("Num blobless outlines = %d\n", num_blob_outlines);
     C_BLOB* left_blob = blob_it.data();
     TBOX left_box = left_blob->bounding_box();
     // right_blob is NULL when the iterator sits on the last blob.
     C_BLOB* right_blob = blob_it.at_last() ? NULL : blob_it.data_relative(1);
     // Try the left blob when the run overlaps it in x, or there is no right
     // blob, or the run does not overlap the right blob. The Select call is
     // only made if that geometric test passes (short-circuit order).
     if ((left_box.x_overlap(total_ol_box) || right_blob == NULL ||
          !right_blob->bounding_box().x_overlap(total_ol_box)) &&
         SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
                                     outlines, num_blob_outlines,
                                     &blob_wanted)) {
       if (debug_noise_removal) tprintf("Added to left blob\n");
       for (int j = 0; j < blob_wanted.size(); ++j) {
         if (blob_wanted[j]) {
           (*word_wanted)[j] = true;
           (*target_blobs)[j] = left_blob;
         }
       }
     // Otherwise try the right blob under the mirrored geometric test.
     } else if (right_blob != NULL &&
                (!left_box.x_overlap(total_ol_box) ||
                 right_blob->bounding_box().x_overlap(total_ol_box)) &&
                SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it,
                                            right_blob, outlines,
                                            num_blob_outlines, &blob_wanted)) {
       if (debug_noise_removal) tprintf("Added to right blob\n");
       for (int j = 0; j < blob_wanted.size(); ++j) {
         if (blob_wanted[j]) {
           (*word_wanted)[j] = true;
           (*target_blobs)[j] = right_blob;
         }
       }
     // Last resort: no target blob, using the noise_cert_punc threshold.
     } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, NULL,
                                            outlines, num_blob_outlines,
                                            &blob_wanted)) {
       if (debug_noise_removal) tprintf("Fitted between blobs\n");
       for (int j = 0; j < blob_wanted.size(); ++j) {
         if (blob_wanted[j]) {
           (*word_wanted)[j] = true;
           (*target_blobs)[j] = NULL;
         }
       }
     }
   }
 }

◆ AssignDiacriticsToOverlappingBlobs()

void tesseract::Tesseract::AssignDiacriticsToOverlappingBlobs	(	const GenericVector< C_OUTLINE *> &	outlines,
		int	pass,
		WERD *	real_word,
		PAGE_RES_IT *	pr_it,
		GenericVector< bool > *	word_wanted,
		GenericVector< bool > *	overlapped_any_blob,
		GenericVector< C_BLOB *> *	target_blobs
	)

Definition at line 993 of file control.cpp.

                                           {
   GenericVector<bool> blob_wanted;
   word_wanted->init_to_size(outlines.size(), false);
   overlapped_any_blob->init_to_size(outlines.size(), false);
   target_blobs->init_to_size(outlines.size(), NULL);
   // For each real blob, find the outlines that seriously overlap it.
   // A single blob could be several merged characters, so there can be quite
   // a few outlines overlapping, and the full engine needs to be used to chop
   // and join to get a sensible result.
   C_BLOB_IT blob_it(real_word->cblob_list());
   for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
     C_BLOB* blob = blob_it.data();
     TBOX blob_box = blob->bounding_box();
     blob_wanted.init_to_size(outlines.size(), false);
     int num_blob_outlines = 0;
     for (int i = 0; i < outlines.size(); ++i) {
       if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
           !(*word_wanted)[i]) {
         blob_wanted[i] = true;
         (*overlapped_any_blob)[i] = true;
         ++num_blob_outlines;
       }
     }
     if (debug_noise_removal) {
       tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
       blob_box.print();
     }
     // If any outlines overlap the blob, and not too many, classify the blob
     // (using the full engine, languages and all), and choose the maximal
     // combination of outlines that doesn't hurt the end-result classification
     // by too much. Mark them as wanted.
     if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
       if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
                                       outlines, num_blob_outlines,
                                       &blob_wanted)) {
         for (int i = 0; i < blob_wanted.size(); ++i) {
           if (blob_wanted[i]) {
             // Claim the outline and record where it is going.
             (*word_wanted)[i] = true;
             (*target_blobs)[i] = blob;
           }
         }
       }
     }
   }
 }

◆ AutoPageSeg()

int tesseract::Tesseract::AutoPageSeg	(	PageSegMode	pageseg_mode,
		BLOCK_LIST *	blocks,
		TO_BLOCK_LIST *	to_blocks,
		BLOBNBOX_LIST *	diacritic_blobs,
		Tesseract *	osd_tess,
		OSResults *	osr
	)

Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.

Resolution (in ppi) is derived from the input image.

The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.

If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.

If diacritic_blobs is non-null, then diacritics/noise blobs, that would confuse layout analysis by causing textline overlap, are placed there, with the expectation that they will be reassigned to words later and noise/diacriticness determined via classification.

If osd (orientation and script detection) is true then that is performed as well. If only_osd is true, then only orientation and script detection is performed. If osd is desired, (osd or only_osd) then osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr (orientation and script result).

Definition at line 204 of file pagesegmain.cpp.

                                            {
   Pix* photomask_pix = NULL;
   Pix* musicmask_pix = NULL;
   // The blocks made by the ColumnFinder. Moved to blocks before return.
   BLOCK_LIST found_blocks;
   TO_BLOCK_LIST temp_blocks;
 
   ColumnFinder* finder = SetupPageSegAndDetectOrientation(
       pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix,
       &musicmask_pix);
   int result = 0;
   if (finder != NULL) {
     TO_BLOCK_IT to_block_it(&temp_blocks);
     TO_BLOCK* to_block = to_block_it.data();
     if (musicmask_pix != NULL) {
       // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
       // blocks separately. For now combine with photomask_pix.
       pixOr(photomask_pix, photomask_pix, musicmask_pix);
     }
     if (equ_detect_) {
       finder->SetEquationDetect(equ_detect_);
     }
     result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_,
                                 to_block, photomask_pix, pix_thresholds_,
                                 pix_grey_, &pixa_debug_, &found_blocks,
                                 diacritic_blobs, to_blocks);
     if (result >= 0)
       finder->GetDeskewVectors(&deskew_, &reskew_);
     delete finder;
   }
   pixDestroy(&photomask_pix);
   pixDestroy(&musicmask_pix);
   if (result < 0) return result;
 
   blocks->clear();
   BLOCK_IT block_it(blocks);
   // Move the found blocks to the input/output blocks.
   block_it.add_list_after(&found_blocks);
   return result;
 }

◆ BelievableSuperscript()

bool tesseract::Tesseract::BelievableSuperscript	(	bool	debug,
		const WERD_RES &	word,
		float	certainty_threshold,
		int *	left_ok,
		int *	right_ok
	)		const

Return whether this is believable superscript or subscript text.

We insist that:

- there are no punctuation marks.
- there are no italics.
- no normal-sized character is smaller than superscript_scaledown_ratio of what it ought to be, and
- each character is at least as certain as certainty_threshold.

Parameters

[in]	debug	If true, spew debug output
[in]	word	The word whose best_choice we're evaluating
[in]	certainty_threshold	If any of the characters have less certainty than this, reject.
[out]	left_ok	How many left-side characters were ok?
[out]	right_ok	How many right-side characters were ok?

Returns: Whether the complete best choice is believable as a superscript.

Definition at line 520 of file superscript.cpp.

                                                            {
   int initial_ok_run_count = 0;
   int ok_run_count = 0;
   float worst_certainty = 0.0f;
   const WERD_CHOICE &wc = *word.best_choice;
 
   const UnicityTable<FontInfo>& fontinfo_table = get_fontinfo_table();
   for (int i = 0; i < wc.length(); i++) {
     TBLOB *blob = word.rebuild_word->blobs[i];
     UNICHAR_ID unichar_id = wc.unichar_id(i);
     float char_certainty = wc.certainty(i);
     bool bad_certainty = char_certainty < certainty_threshold;
     bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
     bool is_italic = word.fontinfo && word.fontinfo->is_italic();
     BLOB_CHOICE *choice = word.GetBlobChoice(i);
     if (choice && fontinfo_table.size() > 0) {
       // Get better information from the specific choice, if available.
       int font_id1 = choice->fontinfo_id();
       bool font1_is_italic = font_id1 >= 0
           ? fontinfo_table.get(font_id1).is_italic() : false;
       int font_id2 = choice->fontinfo_id2();
       is_italic = font1_is_italic &&
           (font_id2 < 0 || fontinfo_table.get(font_id2).is_italic());
     }
 
     float height_fraction = 1.0f;
     float char_height = blob->bounding_box().height();
     float normal_height = char_height;
     if (wc.unicharset()->top_bottom_useful()) {
       int min_bot, max_bot, min_top, max_top;
       wc.unicharset()->get_top_bottom(unichar_id,
                                       &min_bot, &max_bot,
                                       &min_top, &max_top);
       float hi_height = max_top - max_bot;
       float lo_height = min_top - min_bot;
       normal_height = (hi_height + lo_height) / 2;
       if (normal_height >= kBlnXHeight) {
         // Only ding characters that we have decent information for because
         // they're supposed to be normal sized, not tiny specks or dashes.
         height_fraction = char_height / normal_height;
       }
     }
     bool bad_height = height_fraction < superscript_scaledown_ratio;
 
     if (debug) {
       if (is_italic) {
         tprintf(" Rejecting: superscript is italic.\n");
       }
       if (is_punc) {
         tprintf(" Rejecting: punctuation present.\n");
       }
       const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
       if (bad_certainty) {
         tprintf(" Rejecting: don't believe character %s with certainty %.2f "
                 "which is less than threshold %.2f\n", char_str,
                 char_certainty, certainty_threshold);
       }
       if (bad_height) {
         tprintf(" Rejecting: character %s seems too small @ %.2f versus "
                 "expected %.2f\n", char_str, char_height, normal_height);
       }
     }
     if (bad_certainty || bad_height || is_punc || is_italic) {
       if (ok_run_count == i) {
         initial_ok_run_count = ok_run_count;
       }
       ok_run_count = 0;
     } else {
       ok_run_count++;
     }
     if (char_certainty < worst_certainty) {
       worst_certainty = char_certainty;
     }
   }
   bool all_ok = ok_run_count == wc.length();
   if (all_ok && debug) {
     tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
   }
   if (!all_ok) {
     if (left_ok) *left_ok = initial_ok_run_count;
     if (right_ok) *right_ok = ok_run_count;
   }
   return all_ok;
 }

◆ BestPix()

Pix* tesseract::Tesseract::BestPix ( ) const

inline

Definition at line 216 of file tesseractclass.h.

216 { return pix_original_; }  // Hands back the stored original-image pointer as is; no other image is considered here.

◆ bigram_correction_pass()

void tesseract::Tesseract::bigram_correction_pass ( PAGE_RES * page_res )

Definition at line 450 of file control.cpp.

                                                          {
   PAGE_RES_IT word_it(page_res);
 
   WERD_RES *w_prev = NULL;
   WERD_RES *w = word_it.word();
   while (1) {
     w_prev = w;
     while (word_it.forward() != NULL &&
            (!word_it.word() || word_it.word()->part_of_combo)) {
       // advance word_it, skipping over parts of combos
     }
     if (!word_it.word()) break;
     w = word_it.word();
     if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
       continue;
     }
     if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
       if (tessedit_bigram_debug) {
         tprintf("Skipping because one of the words is W_REP_CHAR\n");
       }
       continue;
     }
     // Two words sharing the same language model, excellent!
     GenericVector<WERD_CHOICE *> overrides_word1;
     GenericVector<WERD_CHOICE *> overrides_word2;
 
     STRING orig_w1_str = w_prev->best_choice->unichar_string();
     STRING orig_w2_str = w->best_choice->unichar_string();
     WERD_CHOICE prev_best(w->uch_set);
     {
       int w1start, w1end;
       w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
       prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
     }
     WERD_CHOICE this_best(w->uch_set);
     {
       int w2start, w2end;
       w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
       this_best = w->best_choice->shallow_copy(w2start, w2end);
     }
 
     if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
       if (tessedit_bigram_debug) {
         tprintf("Top choice \"%s %s\" verified by bigram model.\n",
                 orig_w1_str.string(), orig_w2_str.string());
       }
       continue;
     }
     if (tessedit_bigram_debug > 2) {
       tprintf("Examining alt choices for \"%s %s\".\n",
               orig_w1_str.string(), orig_w2_str.string());
     }
     if (tessedit_bigram_debug > 1) {
       if (!w_prev->best_choices.singleton()) {
         w_prev->PrintBestChoices();
       }
       if (!w->best_choices.singleton()) {
         w->PrintBestChoices();
       }
     }
     float best_rating = 0.0;
     int best_idx = 0;
     WERD_CHOICE_IT prev_it(&w_prev->best_choices);
     for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
       WERD_CHOICE *p1 = prev_it.data();
       WERD_CHOICE strip1(w->uch_set);
       {
         int p1start, p1end;
         p1->GetNonSuperscriptSpan(&p1start, &p1end);
         strip1 = p1->shallow_copy(p1start, p1end);
       }
       WERD_CHOICE_IT w_it(&w->best_choices);
       for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
         WERD_CHOICE *p2 = w_it.data();
         WERD_CHOICE strip2(w->uch_set);
         {
           int p2start, p2end;
           p2->GetNonSuperscriptSpan(&p2start, &p2end);
           strip2 = p2->shallow_copy(p2start, p2end);
         }
         if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
           overrides_word1.push_back(p1);
           overrides_word2.push_back(p2);
           if (overrides_word1.size() == 1 ||
               p1->rating() + p2->rating() < best_rating) {
             best_rating = p1->rating() + p2->rating();
             best_idx = overrides_word1.size() - 1;
           }
         }
       }
     }
     if (!overrides_word1.empty()) {
       // Excellent, we have some bigram matches.
       if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice,
                                             *overrides_word1[best_idx]) &&
           EqualIgnoringCaseAndTerminalPunct(*w->best_choice,
                                             *overrides_word2[best_idx])) {
         if (tessedit_bigram_debug > 1) {
           tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
                   "model.\n", orig_w1_str.string(), orig_w2_str.string());
         }
         continue;
       }
       STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
       STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
       if (new_w1_str != orig_w1_str) {
         w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
       }
       if (new_w2_str != orig_w2_str) {
         w->ReplaceBestChoice(overrides_word2[best_idx]);
       }
       if (tessedit_bigram_debug > 0) {
         STRING choices_description;
         int num_bigram_choices
             = overrides_word1.size() * overrides_word2.size();
         if (num_bigram_choices == 1) {
           choices_description = "This was the unique bigram choice.";
         } else {
           if (tessedit_bigram_debug > 1) {
             STRING bigrams_list;
             const int kMaxChoicesToPrint = 20;
             for (int i = 0; i < overrides_word1.size() &&
                  i < kMaxChoicesToPrint; i++) {
               if (i > 0) { bigrams_list += ", "; }
               WERD_CHOICE *p1 = overrides_word1[i];
               WERD_CHOICE *p2 = overrides_word2[i];
               bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
               if (i == kMaxChoicesToPrint) {
                 bigrams_list += " ...";
               }
             }
             choices_description = "There were many choices: {";
             choices_description += bigrams_list;
             choices_description += "}";
           } else {
             choices_description.add_str_int("There were ", num_bigram_choices);
             choices_description += " compatible bigrams.";
           }
         }
         tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
                 orig_w1_str.string(), orig_w2_str.string(),
                 new_w1_str.string(), new_w2_str.string(),
                 choices_description.string());
       }
     }
   }
 }

◆ blamer_pass()

void tesseract::Tesseract::blamer_pass ( PAGE_RES * page_res )

Definition at line 694 of file control.cpp.

                                               {
   if (!wordrec_run_blamer) return;
   PAGE_RES_IT page_res_it(page_res);
   for (page_res_it.restart_page(); page_res_it.word() != NULL;
       page_res_it.forward()) {
     WERD_RES *word = page_res_it.word();
     BlamerBundle::LastChanceBlame(wordrec_debug_blamer, word);
     page_res->blame_reasons[word->blamer_bundle->incorrect_result_reason()]++;
   }
   tprintf("Blame reasons:\n");
   for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
     tprintf("%s %d\n", BlamerBundle::IncorrectReasonName(
         static_cast<IncorrectResultReason>(bl)),
         page_res->blame_reasons[bl]);
   }
   if (page_res->misadaption_log.length() > 0) {
     tprintf("Misadaption log:\n");
     for (int i = 0; i < page_res->misadaption_log.length(); ++i) {
       tprintf("%s\n", page_res->misadaption_log[i].string());
     }
   }
 }

◆ blob_feature_display()

void tesseract::Tesseract::blob_feature_display	(	PAGE_RES *	page_res,
		const TBOX &	selection_box
	)

Definition at line 959 of file pgedit.cpp.

                                                                 {
   PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
   if (it != NULL) {
     WERD_RES* word_res = it->word();
     word_res->x_height = it->row()->row->x_height();
     word_res->SetupForRecognition(unicharset, this, BestPix(),
                                   tessedit_ocr_engine_mode, NULL,
                                   classify_bln_numeric_mode,
                                   textord_use_cjk_fp_model,
                                   poly_allow_detailed_fx,
                                   it->row()->row, it->block()->block);
     TWERD* bln_word = word_res->chopped_word;
     TBLOB* bln_blob = bln_word->blobs[0];
     INT_FX_RESULT_STRUCT fx_info;
     GenericVector<INT_FEATURE_STRUCT> bl_features;
     GenericVector<INT_FEATURE_STRUCT> cn_features;
     Classify::ExtractFeatures(*bln_blob, classify_nonlinear_norm, &bl_features,
                               &cn_features, &fx_info, NULL);
     // Display baseline features.
     ScrollView* bl_win = CreateFeatureSpaceWindow("BL Features", 512, 0);
     ClearFeatureSpaceWindow(baseline, bl_win);
     for (int f = 0; f < bl_features.size(); ++f)
       RenderIntFeature(bl_win, &bl_features[f], ScrollView::GREEN);
     bl_win->Update();
     // Display cn features.
     ScrollView* cn_win = CreateFeatureSpaceWindow("CN Features", 512, 0);
     ClearFeatureSpaceWindow(character, cn_win);
     for (int f = 0; f < cn_features.size(); ++f)
       RenderIntFeature(cn_win, &cn_features[f], ScrollView::GREEN);
     cn_win->Update();
 
     it->DeleteCurrentWord();
     delete it;
   }
 }

◆ blob_noise_score()

float tesseract::Tesseract::blob_noise_score ( TBLOB * blob )

Definition at line 760 of file fixspace.cpp.

                                              {
   TBOX box;                       // BB of outline
   inT16 outline_count = 0;
   inT16 max_dimension;
   inT16 largest_outline_dimension = 0;
 
   for (TESSLINE* ol = blob->outlines; ol != NULL; ol= ol->next) {
     outline_count++;
     box = ol->bounding_box();
     if (box.height() > box.width()) {
       max_dimension = box.height();
     } else {
       max_dimension = box.width();
     }
 
     if (largest_outline_dimension < max_dimension)
       largest_outline_dimension = max_dimension;
   }
 
   if (outline_count > 5) {
     // penalise LOTS of blobs
     largest_outline_dimension *= 2;
   }
 
   box = blob->bounding_box();
   if (box.bottom() > kBlnBaselineOffset * 4 ||
       box.top() < kBlnBaselineOffset / 2) {
     // Lax blob is if high or low
     largest_outline_dimension /= 2;
   }
 
   return largest_outline_dimension;
 }

◆ break_noisiest_blob_word()

void tesseract::Tesseract::break_noisiest_blob_word ( WERD_RES_LIST & words )

break_noisiest_blob_word() Find the word with the blob which looks like the worst noise. Break the word into two, deleting the noise blob.

Definition at line 615 of file fixspace.cpp.

                                                              {
   // Picks, over all words in the list, the blob reported by
   // worst_noise_blob() with the lowest noise score, deletes that blob and
   // splits its word in two at that point: the blobs before it go to a new
   // word inserted in front, the blobs after it stay in the original word.
   // If no word reports a noise blob, the list is cleared instead.
   WERD_RES_IT word_it(&words);
   WERD_RES_IT worst_word_it;
   float worst_noise_score = 9999;
   int worst_blob_index = -1;     // Noisiest blob of noisiest wd
   int blob_index;                // of wds noisiest blob
   float noise_score;             // of wds noisiest blob
   WERD_RES *word_res;
   C_BLOB_IT blob_it;
   C_BLOB_IT rej_cblob_it;
   C_BLOB_LIST new_blob_list;
   C_BLOB_IT new_blob_it;
   C_BLOB_IT new_rej_cblob_it;
   WERD *new_word;
   inT16 start_of_noise_blob;
   inT16 i;
 
   // A smaller score counts as "worse" here: the candidate is replaced only
   // when its score is below the best seen so far.
   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
     blob_index = worst_noise_blob(word_it.data(), &noise_score);
     if (blob_index > -1 && worst_noise_score > noise_score) {
       worst_noise_score = noise_score;
       worst_blob_index = blob_index;
       worst_word_it = word_it;
     }
   }
   if (worst_blob_index < 0) {
     words.clear();          // signal termination
     return;
   }
 
   /* Now split the worst_word_it */
 
   word_res = worst_word_it.data();
 
   /* Move blobs before noise blob to a new bloblist */
 
   new_blob_it.set_to_list(&new_blob_list);
   blob_it.set_to_list(word_res->word->cblob_list());
   for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
     new_blob_it.add_after_then_move(blob_it.extract());
   }
   // Remember where the noise blob started; used below to split the
   // rejected blobs between the two words.
   start_of_noise_blob = blob_it.data()->bounding_box().left();
   delete blob_it.extract();     // throw out noise blob
 
   // The new (left-hand) word is no longer at end of line and the original
   // (right-hand) word is no longer at beginning of line.
   new_word = new WERD(&new_blob_list, word_res->word);
   new_word->set_flag(W_EOL, FALSE);
   word_res->word->set_flag(W_BOL, FALSE);
   word_res->word->set_blanks(1);  // After break
 
   // Move rejected blobs lying left of the noise blob over to the new word.
   new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
   rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
   for (;
        (!rej_cblob_it.empty() &&
         (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
        rej_cblob_it.forward()) {
     new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
   }
 
   // Insert the new word's result in front of the original word.
   WERD_RES* new_word_res = new WERD_RES(new_word);
   new_word_res->combination = TRUE;
   worst_word_it.add_before_then_move(new_word_res);
 
   // The original word has lost blobs, so its results are cleared.
   word_res->ClearResults();
 }

◆ build_menu_new()

SVMenuNode * tesseract::Tesseract::build_menu_new ( )

Definition at line 257 of file pgedit.cpp.

                                       {
   SVMenuNode* parent_menu;
   SVMenuNode* root_menu_item = new SVMenuNode();
 
   SVMenuNode* modes_menu_item = root_menu_item->AddChild("MODES");
 
   modes_menu_item->AddChild("Change Display", CHANGE_DISP_CMD_EVENT);
   modes_menu_item->AddChild("Dump Word", DUMP_WERD_CMD_EVENT);
   modes_menu_item->AddChild("Show Point", SHOW_POINT_CMD_EVENT);
   modes_menu_item->AddChild("Show BL Norm Word", SHOW_BLN_WERD_CMD_EVENT);
   modes_menu_item->AddChild("Config Words", DEBUG_WERD_CMD_EVENT);
   modes_menu_item->AddChild("Recog Words", RECOG_WERDS);
   modes_menu_item->AddChild("Recog Blobs", RECOG_PSEUDO);
   modes_menu_item->AddChild("Show Blob Features", SHOW_BLOB_FEATURES);
 
   parent_menu = root_menu_item->AddChild("DISPLAY");
 
   parent_menu->AddChild("Blamer", BLAMER_CMD_EVENT, FALSE);
   parent_menu->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, FALSE);
   parent_menu->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, FALSE);
   parent_menu->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, FALSE);
   parent_menu->AddChild("Baseline Normalized", BL_NORM_CMD_EVENT, FALSE);
   parent_menu->AddChild("Edge Steps", BITMAP_CMD_EVENT, TRUE);
   parent_menu->AddChild("Subscripts", SHOW_SUBSCRIPT_CMD_EVENT);
   parent_menu->AddChild("Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT);
   parent_menu->AddChild("Italics", SHOW_ITALIC_CMD_EVENT);
   parent_menu->AddChild("Bold", SHOW_BOLD_CMD_EVENT);
   parent_menu->AddChild("Underline", SHOW_UNDERLINE_CMD_EVENT);
   parent_menu->AddChild("FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT);
   parent_menu->AddChild("Serifs", SHOW_SERIF_CMD_EVENT);
   parent_menu->AddChild("SmallCaps", SHOW_SMALLCAPS_CMD_EVENT);
   parent_menu->AddChild("DropCaps", SHOW_DROPCAPS_CMD_EVENT);
 
 
   parent_menu = root_menu_item->AddChild("OTHER");
 
   parent_menu->AddChild("Quit", QUIT_CMD_EVENT);
   parent_menu->AddChild("Show Image", IMAGE_CMD_EVENT, FALSE);
   parent_menu->AddChild("ShowBlock Outlines", BLOCKS_CMD_EVENT, FALSE);
   parent_menu->AddChild("Show Baselines", BASELINES_CMD_EVENT, FALSE);
   parent_menu->AddChild("Uniform Display", UNIFORM_DISP_CMD_EVENT);
   parent_menu->AddChild("Refresh Display", REFRESH_CMD_EVENT);
 
   return root_menu_item;
 }

◆ check_debug_pt()

BOOL8 tesseract::Tesseract::check_debug_pt	(	WERD_RES *	word,
		int	location
	)

Definition at line 1794 of file control.cpp.

                                                             {
   BOOL8 show_map_detail = FALSE;
   inT16 i;
 
   if (!test_pt)
     return FALSE;
 
   tessedit_rejection_debug.set_value (FALSE);
   debug_x_ht_level.set_value(0);
 
   if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
     if (location < 0)
       return TRUE;               // For breakpoint use
     tessedit_rejection_debug.set_value (TRUE);
     debug_x_ht_level.set_value(2);
     tprintf ("\n\nTESTWD::");
     switch (location) {
       case 0:
         tprintf ("classify_word_pass1 start\n");
         word->word->print();
         break;
       case 10:
         tprintf ("make_reject_map: initial map");
         break;
       case 20:
         tprintf ("make_reject_map: after NN");
         break;
       case 30:
         tprintf ("classify_word_pass2 - START");
         break;
       case 40:
         tprintf ("classify_word_pass2 - Pre Xht");
         break;
       case 50:
         tprintf ("classify_word_pass2 - END");
         show_map_detail = TRUE;
         break;
       case 60:
         tprintf ("fixspace");
         break;
       case 70:
         tprintf ("MM pass START");
         break;
       case 80:
         tprintf ("MM pass END");
         break;
       case 90:
         tprintf ("After Poor quality rejection");
         break;
       case 100:
         tprintf ("unrej_good_quality_words - START");
         break;
       case 110:
         tprintf ("unrej_good_quality_words - END");
         break;
       case 120:
         tprintf ("Write results pass");
         show_map_detail = TRUE;
         break;
     }
     if (word->best_choice != NULL) {
       tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
       word->reject_map.print(debug_fp);
       tprintf("\n");
       if (show_map_detail) {
         tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
         for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
           tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
           word->reject_map[i].full_print(debug_fp);
         }
       }
     } else {
       tprintf("null best choice\n");
     }
     tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
     tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
     return TRUE;
   } else {
     return FALSE;
   }
 }

◆ classify_word_and_language()

void tesseract::Tesseract::classify_word_and_language	(	int	pass_n,
		PAGE_RES_IT *	pr_it,
		WordData *	word_data
	)

Definition at line 1285 of file control.cpp.

                                                                 {
   WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
                                           : &Tesseract::classify_word_pass2;
   // Best result so far.
   PointerVector<WERD_RES> best_words;
   // Points to the best result. May be word or in lang_words.
   WERD_RES* word = word_data->word;
   clock_t start_t = clock();
   bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
   if (debug) {
     tprintf("%s word with lang %s at:",
             word->done ? "Already done" : "Processing",
             most_recently_used_->lang.string());
     word->word->bounding_box().print();
   }
   if (word->done) {
     // If done on pass1, leave it as-is.
     if (!word->tess_failed)
       most_recently_used_ = word->tesseract;
     return;
   }
   int sub = sub_langs_.size();
   if (most_recently_used_ != this) {
     // Get the index of the most_recently_used_.
     for (sub = 0; sub < sub_langs_.size() &&
          most_recently_used_ != sub_langs_[sub]; ++sub) {}
   }
   most_recently_used_->RetryWithLanguage(
       *word_data, recognizer, debug, &word_data->lang_words[sub], &best_words);
   Tesseract* best_lang_tess = most_recently_used_;
   if (!WordsAcceptable(best_words)) {
     // Try all the other languages to see if they are any better.
     if (most_recently_used_ != this &&
         this->RetryWithLanguage(*word_data, recognizer, debug,
                                 &word_data->lang_words[sub_langs_.size()],
                                 &best_words) > 0) {
       best_lang_tess = this;
     }
     for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
          ++i) {
       if (most_recently_used_ != sub_langs_[i] &&
           sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug,
                                            &word_data->lang_words[i],
                                            &best_words) > 0) {
         best_lang_tess = sub_langs_[i];
       }
     }
   }
   most_recently_used_ = best_lang_tess;
   if (!best_words.empty()) {
     if (best_words.size() == 1 && !best_words[0]->combination) {
       // Move the best single result to the main word.
       word_data->word->ConsumeWordResults(best_words[0]);
     } else {
       // Words came from LSTM, and must be moved to the PAGE_RES properly.
       word_data->word = best_words.back();
       pr_it->ReplaceCurrentWord(&best_words);
     }
     ASSERT_HOST(word_data->word->box_word != NULL);
   } else {
     tprintf("no best words!!\n");
   }
   clock_t ocr_t = clock();
   if (tessedit_timing_debug) {
     tprintf("%s (ocr took %.2f sec)\n",
             word->best_choice->unichar_string().string(),
             static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
   }
 }

◆ classify_word_pass1()

void tesseract::Tesseract::classify_word_pass1	(	const WordData &	word_data,
		WERD_RES **	in_word,
		PointerVector< WERD_RES > *	out_words
	)

classify_word_pass1

Baseline normalize the word and pass it to Tess.

Definition at line 1362 of file control.cpp.

                                                                         {
   ROW* row = word_data.row;
   BLOCK* block = word_data.block;
   prev_word_best_choice_ = word_data.prev_word != NULL
       ? word_data.prev_word->word->best_choice : NULL;
 #ifndef ANDROID_BUILD
   if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
       tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
     if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
       LSTMRecognizeWord(*block, row, *in_word, out_words);
       if (!out_words->empty())
         return;  // Successful lstm recognition.
     }
     if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
       // No fallback allowed, so use a fake.
       (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
       return;
     }
     // Fall back to tesseract for failed words or odd words.
     (*in_word)->SetupForRecognition(unicharset, this, BestPix(),
                                     OEM_TESSERACT_ONLY, NULL,
                                     classify_bln_numeric_mode,
                                     textord_use_cjk_fp_model,
                                     poly_allow_detailed_fx, row, block);
   }
 #endif
   WERD_RES* word = *in_word;
   match_word_pass_n(1, word, row, block);
   if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
     word->tess_would_adapt = AdaptableWord(word);
     bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
 
     if (adapt_ok) {
       // Send word to adaptive classifier for training.
       word->BestChoiceToCorrectText();
       LearnWord(NULL, word);
       // Mark misadaptions if running blamer.
       if (word->blamer_bundle != NULL) {
         word->blamer_bundle->SetMisAdaptionDebug(word->best_choice,
                                                  wordrec_debug_blamer);
       }
     }
 
     if (tessedit_enable_doc_dict && !word->IsAmbiguous())
       tess_add_doc_word(word->best_choice);
   }
 }

◆ classify_word_pass2()

void tesseract::Tesseract::classify_word_pass2	(	const WordData &	word_data,
		WERD_RES **	in_word,
		PointerVector< WERD_RES > *	out_words
	)

classify_word_pass2

Control what to do with the word in pass 2

Definition at line 1519 of file control.cpp.

                                                                         {
   // Return if we do not want to run Tesseract.
   if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
     return;
   }
   ROW* row = word_data.row;
   BLOCK* block = word_data.block;
   WERD_RES* word = *in_word;
   prev_word_best_choice_ = word_data.prev_word != NULL
       ? word_data.prev_word->word->best_choice : NULL;
 
   set_global_subloc_code(SUBLOC_NORM);
   check_debug_pt(word, 30);
   if (!word->done) {
     word->caps_height = 0.0;
     if (word->x_height == 0.0f)
       word->x_height = row->x_height();
     match_word_pass_n(2, word, row, block);
     check_debug_pt(word, 40);
   }
 
   SubAndSuperscriptFix(word);
 
   if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
     if (unicharset.top_bottom_useful() && unicharset.script_has_xheight() &&
         block->classify_rotation().y() == 0.0f) {
       // Use the tops and bottoms since they are available.
       TrainedXheightFix(word, block, row);
     }
 
     set_global_subloc_code(SUBLOC_NORM);
   }
 #ifndef GRAPHICS_DISABLED
   if (tessedit_display_outwords) {
     if (fx_win == NULL)
       create_fx_win();
     clear_fx_win();
     word->rebuild_word->plot(fx_win);
     TBOX wbox = word->rebuild_word->bounding_box();
     fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
                             wbox.right(), wbox.bottom());
     ScrollView::Update();
   }
 #endif
   set_global_subloc_code(SUBLOC_NORM);
   check_debug_pt(word, 50);
 }

◆ ClassifyBlobAsWord()

float tesseract::Tesseract::ClassifyBlobAsWord	(	int	pass_n,
		PAGE_RES_IT *	pr_it,
		C_BLOB *	blob,
		STRING *	best_str,
		float *	c2
	)

Definition at line 1249 of file control.cpp.

                                                                                {
   WERD* real_word = pr_it->word()->word;
   WERD* word = real_word->ConstructFromSingleBlob(
       real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
   WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
   // Get a new iterator that points to the new word.
   PAGE_RES_IT it(pr_it->page_res);
   while (it.word() != word_res && it.word() != NULL) it.forward();
   ASSERT_HOST(it.word() == word_res);
   WordData wd(it);
   // Force full initialization.
   SetupWordPassN(1, &wd);
   classify_word_and_language(pass_n, &it, &wd);
   if (debug_noise_removal) {
     tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
             wd.row->x_height(), wd.word->raw_choice->min_x_height(),
             wd.word->raw_choice->max_x_height());
   }
   float cert = wd.word->raw_choice->certainty();
   float rat = wd.word->raw_choice->rating();
   *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
   *best_str = wd.word->raw_choice->unichar_string();
   it.DeleteCurrentWord();
   pr_it->ResetWordIterator();
   return cert;
 }

◆ ClassifyBlobPlusOutlines()

float tesseract::Tesseract::ClassifyBlobPlusOutlines	(	const GenericVector< bool > &	ok_outlines,
		const GenericVector< C_OUTLINE *> &	outlines,
		int	pass_n,
		PAGE_RES_IT *	pr_it,
		C_BLOB *	blob,
		STRING *	best_str
	)

Definition at line 1207 of file control.cpp.

                                     {
   // Classifies `blob` together with the outlines flagged in ok_outlines,
   // then undoes the temporary modification. If blob is NULL, a temporary
   // blob is built from the flagged outlines and destroyed afterwards.
   // Returns the certainty from ClassifyBlobAsWord when an existing blob was
   // passed in, and -c2 (the negated second output of ClassifyBlobAsWord)
   // when the blob was created here. *best_str is filled by
   // ClassifyBlobAsWord.
   // NOTE(review): with blob == NULL and no ok_outlines entry set, a NULL
   // blob reaches ClassifyBlobAsWord — confirm callers never do this.
   C_OUTLINE_IT ol_it;
   // First outline originally in blob; NULL means the blob is created here.
   C_OUTLINE* first_to_keep = NULL;
   if (blob != NULL) {
     // Add the required outlines to the blob.
     ol_it.set_to_list(blob->out_list());
     first_to_keep = ol_it.data();
   }
   for (int i = 0; i < ok_outlines.size(); ++i) {
     if (ok_outlines[i]) {
       // This outline is to be added.
       if (blob == NULL) {
         blob = new C_BLOB(outlines[i]);
         ol_it.set_to_list(blob->out_list());
       } else {
         ol_it.add_before_stay_put(outlines[i]);
       }
     }
   }
   float c2;
   float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
   ol_it.move_to_first();
   if (first_to_keep == NULL) {
     // We created blob. Empty its outlines and delete it.
     // The outlines are extracted (not deleted) before the blob goes away.
     for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
     delete blob;
     cert = -c2;
   } else {
     // Remove the outlines that we put in.
     // Everything ahead of first_to_keep was added above.
     for (; ol_it.data() != first_to_keep; ol_it.forward()) {
       ol_it.extract();
     }
   }
   return cert;
 }

◆ Clear()

void tesseract::Tesseract::Clear ( )

Definition at line 637 of file tesseractclass.cpp.

                       {
   STRING debug_name = imagebasename + "_debug.pdf";
   pixa_debug_.WritePDF(debug_name.string());
   pixDestroy(&pix_binary_);
   pixDestroy(&pix_grey_);
   pixDestroy(&pix_thresholds_);
   pixDestroy(&scaled_color_);
   deskew_ = FCOORD(1.0f, 0.0f);
   reskew_ = FCOORD(1.0f, 0.0f);
   splitter_.Clear();
   scaled_factor_ = -1;
   for (int i = 0; i < sub_langs_.size(); ++i)
     sub_langs_[i]->Clear();
 }

◆ ComputeCompatibleXheight()

float tesseract::Tesseract::ComputeCompatibleXheight	(	WERD_RES *	word_res,
		float *	baseline_shift
	)

Definition at line 101 of file fixxht.cpp.

                                                                  {
   // Estimates an x-height (and baseline shift) that would make the tops and
   // bottoms of the word's alphanumeric blobs agree with the ranges recorded
   // in the unicharset for their classes.
   // Outputs: *baseline_shift receives -bottom_shift / denorm.y_scale().
   // Returns the median voted x-height divided by denorm.y_scale() when
   // there are votes and the median differs from kBlnXHeight by at least
   // x_ht_min_change. Otherwise returns word_res->x_height if a bottom shift
   // was applied, or 0.0f if not.
   // top_stats collects x-height votes, shift_stats collects bottom-shift
   // votes.
   STATS top_stats(0, MAX_UINT8);
   STATS shift_stats(-MAX_UINT8, MAX_UINT8);
   int bottom_shift = 0;
   int num_blobs = word_res->rebuild_word->NumBlobs();
   // First iteration runs with bottom_shift == 0. If the shift votes
   // outweigh the x-height votes, a shift is chosen and the loop repeats
   // with that shift applied to every blob's top and bottom.
   do {
     top_stats.clear();
     shift_stats.clear();
     for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
       TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
       UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
       // Only letters and digits vote.
       if (unicharset.get_isalpha(class_id) ||
           unicharset.get_isdigit(class_id)) {
         int top = blob->bounding_box().top() + bottom_shift;
         // Clip the top to the limit of normalized feature space.
         if (top >= INT_FEAT_RANGE)
           top = INT_FEAT_RANGE - 1;
         int bottom = blob->bounding_box().bottom() + bottom_shift;
         int min_bottom, max_bottom, min_top, max_top;
         unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
                                   &min_top, &max_top);
         // Chars with a wild top range would mess up the result so ignore them.
         if (max_top - min_top > kMaxCharTopRange)
           continue;
         // Positive only when top lies outside the tolerated
         // [min_top, max_top] range; it is then the distance outside.
         int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
                             top - (max_top + x_ht_acceptance_tolerance));
         int height = top - kBlnBaselineOffset;
         if (debug_x_ht_level >= 2) {
           tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
                   unicharset.id_to_unichar(class_id),
                   height, min_bottom, max_bottom, min_top, max_top,
                   bottom, top);
         }
         // Use only chars that fit in the expected bottom range, and where
         // the range of tops is sensibly near the xheight.
         if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
             bottom - x_ht_acceptance_tolerance <= max_bottom &&
             min_top > kBlnBaselineOffset &&
             max_top - kBlnBaselineOffset >= kBlnXHeight &&
             misfit_dist > 0) {
           // Compute the x-height position using proportionality between the
           // actual height and expected height.
           int min_xht = DivRounded(height * kBlnXHeight,
                                    max_top - kBlnBaselineOffset);
           int max_xht = DivRounded(height * kBlnXHeight,
                                    min_top - kBlnBaselineOffset);
           if (debug_x_ht_level >= 2) {
             tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
           }
           // The range of expected heights gets a vote equal to the distance
           // of the actual top from the expected top.
           for (int y = min_xht; y <= max_xht; ++y)
             top_stats.add(y, misfit_dist);
         } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
                     bottom - x_ht_acceptance_tolerance > max_bottom) &&
                    bottom_shift == 0) {
           // Bottom is out of range and no shift has been applied yet.
           // Get the range of required bottom shift.
           int min_shift = min_bottom - bottom;
           int max_shift = max_bottom - bottom;
           if (debug_x_ht_level >= 2) {
             tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
           }
           // The range of expected shifts gets a vote equal to the min distance
           // of the actual bottom from the expected bottom, spread over the
           // range of its acceptance.
           int misfit_weight = abs(min_shift);
           if (max_shift > min_shift)
             misfit_weight /= max_shift - min_shift;
           for (int y = min_shift; y <= max_shift; ++y)
             shift_stats.add(y, misfit_weight);
         } else {
           if (bottom_shift == 0) {
             // Things with bottoms that are already ok need to say so, on the
             // 1st iteration only.
             shift_stats.add(0, kBlnBaselineOffset);
           }
           if (debug_x_ht_level >= 2) {
             tprintf(" already OK\n");
           }
         }
       }
     }
     if (shift_stats.get_total() > top_stats.get_total()) {
       bottom_shift = IntCastRounded(shift_stats.median());
       if (debug_x_ht_level >= 2) {
         tprintf("Applying bottom shift=%d\n", bottom_shift);
       }
     }
   // Shift votes are only cast while bottom_shift == 0, so once a non-zero
   // shift is in force the next pass collects none and the loop ends.
   } while (bottom_shift != 0 &&
            top_stats.get_total() < shift_stats.get_total());
   // Baseline shift is opposite sign to the bottom shift.
   *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
   if (debug_x_ht_level >= 2) {
     tprintf("baseline shift=%g\n", *baseline_shift);
   }
   // No x-height votes: keep the current x-height if a shift was found,
   // else return 0.
   if (top_stats.get_total() == 0)
     return bottom_shift != 0 ? word_res->x_height : 0.0f;
   // The new xheight is just the median vote, which is then scaled out
   // of BLN space back to pixel space to get the x-height in pixel space.
   float new_xht = top_stats.median();
   if (debug_x_ht_level >= 2) {
     tprintf("Median xht=%f\n", new_xht);
     tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
             new_xht, new_xht / word_res->denorm.y_scale());
   }
   // The xheight must change by at least x_ht_min_change to be used.
   if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
     return new_xht / word_res->denorm.y_scale();
   else
     return bottom_shift != 0 ? word_res->x_height : 0.0f;
 }

◆ convert_bad_unlv_chs()

void tesseract::Tesseract::convert_bad_unlv_chs ( WERD_RES * word_res )

Definition at line 664 of file docqual.cpp.

                                                        {
   int i;
   UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
   UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
   UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
   UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
   for (i = 0; i < word_res->reject_map.length(); ++i) {
     if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
       word_res->best_choice->set_unichar_id(unichar_dash, i);
       if (word_res->reject_map[i].accepted ())
         word_res->reject_map[i].setrej_unlv_rej ();
     }
     if (word_res->best_choice->unichar_id(i) == unichar_pow) {
       word_res->best_choice->set_unichar_id(unichar_space, i);
       if (word_res->reject_map[i].accepted ())
         word_res->reject_map[i].setrej_unlv_rej ();
     }
   }
 }

◆ ConvertStringToUnichars()

bool tesseract::Tesseract::ConvertStringToUnichars	(	const char *	utf8,
		GenericVector< UNICHAR_ID > *	class_ids
	)

Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.

Returns: false if an invalid UNICHAR_ID is encountered.

Definition at line 535 of file applybox.cpp.

                                                                               {
   for (int step = 0; *utf8 != '\0'; utf8 += step) {
     const char* next_space = strchr(utf8, ' ');
     if (next_space == NULL)
       next_space = utf8 + strlen(utf8);
     step = next_space - utf8;
     UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
     if (class_id == INVALID_UNICHAR_ID) {
       return false;
     }
     while (utf8[step] == ' ')
       ++step;
     class_ids->push_back(class_id);
   }
   return true;
 }

◆ CorrectClassifyWords()

void tesseract::Tesseract::CorrectClassifyWords ( PAGE_RES * page_res )

Creates a fake best_choice entry in each WERD_RES with the correct text.

Definition at line 772 of file applybox.cpp.

                                                        {
   PAGE_RES_IT pr_it(page_res);
   for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
        word_res = pr_it.forward()) {
     WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set,
                                           word_res->correct_text.size());
     for (int i = 0; i < word_res->correct_text.size(); ++i) {
       // The part before the first space is the real ground truth, and the
       // rest is the bounding box location and page number.
       GenericVector<STRING> tokens;
       word_res->correct_text[i].split(' ', &tokens);
       UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
       choice->append_unichar_id_space_allocated(char_id,
                                                 word_res->best_state[i],
                                                 0.0f, 0.0f);
     }
     word_res->ClearWordChoices();
     word_res->LogNewRawChoice(choice);
     word_res->LogNewCookedChoice(1, false, choice);
   }
 }

◆ count_alphanums() [1/2]

inT16 tesseract::Tesseract::count_alphanums ( const WERD_CHOICE & word )

Definition at line 408 of file output.cpp.

                                                         {
   int count = 0;
   for (int i = 0; i < word.length(); ++i) {
     if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
         word.unicharset()->get_isdigit(word.unichar_id(i)))
       count++;
   }
   return count;
 }

◆ count_alphanums() [2/2]

inT16 tesseract::Tesseract::count_alphanums ( WERD_RES * word_res )

Definition at line 558 of file reject.cpp.

                                                    {
   int count = 0;
   const WERD_CHOICE *best_choice = word_res->best_choice;
   for (int i = 0; i < word_res->reject_map.length(); ++i) {
     if ((word_res->reject_map[i].accepted()) &&
         (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
             word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
       count++;
     }
   }
   return count;
 }

◆ count_alphas()

inT16 tesseract::Tesseract::count_alphas ( const WERD_CHOICE & word )

Definition at line 398 of file output.cpp.

                                                      {
   int count = 0;
   for (int i = 0; i < word.length(); ++i) {
     if (word.unicharset()->get_isalpha(word.unichar_id(i)))
       count++;
   }
   return count;
 }

◆ count_outline_errs()

inT16 tesseract::Tesseract::count_outline_errs	(	char	c,
		inT16	outline_count
	)

Definition at line 131 of file docqual.cpp.

                                                                {
   int expected_outline_count;
 
   if (STRING (outlines_odd).contains (c))
     return 0;  // Don't use this char
   else if (STRING (outlines_2).contains (c))
     expected_outline_count = 2;
   else
     expected_outline_count = 1;
   return abs (outline_count - expected_outline_count);
 }

◆ CountMisfitTops()

int tesseract::Tesseract::CountMisfitTops ( WERD_RES * word_res )

Definition at line 69 of file fixxht.cpp.

                                                  {
   int bad_blobs = 0;
   int num_blobs = word_res->rebuild_word->NumBlobs();
   for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
     TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
     UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
     if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
       int top = blob->bounding_box().top();
       if (top >= INT_FEAT_RANGE)
         top = INT_FEAT_RANGE - 1;
       int min_bottom, max_bottom, min_top, max_top;
       unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
                                 &min_top, &max_top);
       if (max_top - min_top > kMaxCharTopRange)
         continue;
       bool bad =  top < min_top - x_ht_acceptance_tolerance ||
                   top > max_top + x_ht_acceptance_tolerance;
       if (bad)
         ++bad_blobs;
       if (debug_x_ht_level >= 1) {
         tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
                 unicharset.id_to_unichar(class_id),
                 bad ? "Misfit" : "OK", top, min_top, max_top,
                 static_cast<int>(x_ht_acceptance_tolerance));
       }
     }
   }
   return bad_blobs;
 }

◆ debug_word()

void tesseract::Tesseract::debug_word	(	PAGE_RES *	page_res,
		const TBOX &	selection_box
	)

debug_word

Process the whole image, but load word_config_ for the selected word(s).

Definition at line 640 of file pgedit.cpp.

                                                                         {
   ResetAdaptiveClassifier();
   recog_all_words(page_res, NULL, &selection_box, word_config_.string(), 0);
 }

◆ dictionary_correction_pass()

void tesseract::Tesseract::dictionary_correction_pass ( PAGE_RES * page_res )

Definition at line 2042 of file control.cpp.

                                                              {
   PAGE_RES_IT word_it(page_res);
   for (WERD_RES* word = word_it.word(); word != NULL;
        word = word_it.forward()) {
     if (word->best_choices.singleton())
       continue;  // There are no alternates.
 
     WERD_CHOICE* best = word->best_choice;
     if (word->tesseract->getDict().valid_word(*best) != 0)
       continue;  // The best choice is in the dictionary.
 
     WERD_CHOICE_IT choice_it(&word->best_choices);
     for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
          choice_it.forward()) {
       WERD_CHOICE* alternate = choice_it.data();
       if (word->tesseract->getDict().valid_word(*alternate)) {
         // The alternate choice is in the dictionary.
         if (tessedit_bigram_debug) {
           tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
                   best->unichar_string().string(),
                   alternate->unichar_string().string());
         }
         // Replace the 'best' choice with a better choice.
         word->ReplaceBestChoice(alternate);
         break;
       }
     }
   }
 }

◆ digit_or_numeric_punct()

BOOL8 tesseract::Tesseract::digit_or_numeric_punct	(	WERD_RES *	word,
		int	char_position
	)

Definition at line 343 of file fixspace.cpp.

                                                                          {
   int i;
   int offset;
 
   for (i = 0, offset = 0; i < char_position;
        offset += word->best_choice->unichar_lengths()[i++]);
   return (
       word->uch_set->get_isdigit(
           word->best_choice->unichar_string().string() + offset,
           word->best_choice->unichar_lengths()[i]) ||
       (word->best_choice->permuter() == NUMBER_PERM &&
        STRING(numeric_punctuation).contains(
            word->best_choice->unichar_string().string()[offset])));
 }

◆ do_re_display()

void tesseract::Tesseract::do_re_display ( BOOL8(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it) word_painter )

do_re_display()

Redisplay page

Definition at line 308 of file pgedit.cpp.

                                                                  {
   int block_count = 1;
 
   image_win->Clear();
   if (display_image != 0) {
     image_win->Image(pix_binary_, 0, 0);
   }
 
   image_win->Brush(ScrollView::NONE);
   PAGE_RES_IT pr_it(current_page_res);
   for (WERD_RES* word = pr_it.word(); word != NULL; word = pr_it.forward()) {
     (this->*word_painter)(&pr_it);
     if (display_baselines && pr_it.row() != pr_it.prev_row())
       pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
     if (display_blocks && pr_it.block() != pr_it.prev_block())
       pr_it.block()->block->plot(image_win, block_count++, ScrollView::RED);
   }
   image_win->Update();
 }

◆ doc_and_block_rejection()

void tesseract::Tesseract::doc_and_block_rejection	(	PAGE_RES_IT &	page_res_it,
		BOOL8	good_quality_doc
	)

Definition at line 237 of file docqual.cpp.

                                                                 {
   // Applies rejection at three nested levels, using the counts stored in
   // the page, block and row results:
   //   1. If the page reject percentage exceeds tessedit_reject_doc_percent,
   //      the whole page is rejected and nothing else is done.
   //   2. Otherwise each block whose reject percentage exceeds
   //      tessedit_reject_block_percent has its words block-rejected.
   //   3. In the remaining blocks, each row that passes the row test below
   //      has its words row-rejected.
   // In steps 2 and 3 the tessedit_preserve_* / tessedit_dont_* settings can
   // spare individual words. good_quality_doc is only consulted for row
   // rejection.
   inT16 block_no = 0;
   inT16 row_no = 0;
   BLOCK_RES *current_block;
   ROW_RES *current_row;
 
   BOOL8 rej_word;
   BOOL8 prev_word_rejected;
   inT16 char_quality = 0;
   inT16 accepted_char_quality;
 
   // Page-level test. The division is done in floating point (100.0).
   if (page_res_it.page_res->rej_count * 100.0 /
       page_res_it.page_res->char_count > tessedit_reject_doc_percent) {
     reject_whole_page(page_res_it);
     if (tessedit_debug_doc_rejection) {
       tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
               page_res_it.page_res->char_count,
               page_res_it.page_res->rej_count);
     }
   } else {
     if (tessedit_debug_doc_rejection) {
       tprintf("NO PAGE REJECTION #chars: %d  # Rejects: %d; \n",
               page_res_it.page_res->char_count,
               page_res_it.page_res->rej_count);
     }
 
     /* Walk blocks testing for block rejection */
 
     page_res_it.restart_page();
     WERD_RES* word;
     // Each pass of this loop consumes one whole block: the inner loops
     // advance page_res_it until the block (or the page) is exhausted.
     while ((word = page_res_it.word()) != NULL) {
       current_block = page_res_it.block();
       block_no = current_block->block->index();
       if (current_block->char_count > 0 &&
           (current_block->rej_count * 100.0 / current_block->char_count) >
            tessedit_reject_block_percent) {
         if (tessedit_debug_block_rejection) {
           tprintf("REJECTING BLOCK %d  #chars: %d;  #Rejects: %d\n",
                   block_no, current_block->char_count,
                   current_block->rej_count);
         }
         prev_word_rejected = FALSE;
         while ((word = page_res_it.word()) != NULL &&
                (page_res_it.block() == current_block)) {
           if (tessedit_preserve_blk_rej_perfect_wds) {
             // A word is a rejection candidate if it already has rejects or
             // is shorter than tessedit_preserve_min_wd_len.
             rej_word = word->reject_map.reject_count() > 0 ||
                 word->reject_map.length () < tessedit_preserve_min_wd_len;
             // Second chance: a long-enough word with an acceptable string
             // is kept if word_char_quality reports every character as good.
             if (rej_word && tessedit_dont_blkrej_good_wds &&
                 word->reject_map.length() >= tessedit_preserve_min_wd_len &&
                 acceptable_word_string(
                     *word->uch_set,
                     word->best_choice->unichar_string().string(),
                     word->best_choice->unichar_lengths().string()) !=
                 AC_UNACCEPTABLE) {
               word_char_quality(word, page_res_it.row()->row,
                                 &char_quality,
                                 &accepted_char_quality);
               rej_word = char_quality !=  word->reject_map.length();
             }
           } else {
             rej_word = TRUE;
           }
           if (rej_word) {
             /*
               Reject spacing if both current and prev words are rejected.
               NOTE - this is NOT restricted to FUZZY spaces. - When tried this
               generated more space errors.
             */
             if (tessedit_use_reject_spaces &&
                 prev_word_rejected &&
                 page_res_it.prev_row() == page_res_it.row() &&
                 word->word->space() == 1)
               word->reject_spaces = TRUE;
             word->reject_map.rej_word_block_rej();
           }
           prev_word_rejected = rej_word;
           page_res_it.forward();
         }
       } else {
         if (tessedit_debug_block_rejection) {
           tprintf("NOT REJECTING BLOCK %d #chars: %d  # Rejects: %d; \n",
                   block_no, page_res_it.block()->char_count,
                   page_res_it.block()->rej_count);
         }
 
         /* Walk rows in block testing for row rejection */
         row_no = 0;
         while (page_res_it.word() != NULL &&
                page_res_it.block() == current_block) {
           current_row = page_res_it.row();
           row_no++;
           /* Reject whole row if:
             fraction of chars on row which are rejected exceed a limit AND
             fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
             limit
           */
           if (current_row->char_count > 0 &&
               (current_row->rej_count * 100.0 / current_row->char_count) >
               tessedit_reject_row_percent &&
               (current_row->whole_word_rej_count * 100.0 /
                   current_row->rej_count) <
               tessedit_whole_wd_rej_row_percent) {
             if (tessedit_debug_block_rejection) {
               tprintf("REJECTING ROW %d  #chars: %d;  #Rejects: %d\n",
                       row_no, current_row->char_count,
                       current_row->rej_count);
             }
             prev_word_rejected = FALSE;
             while ((word = page_res_it.word()) != NULL &&
                    page_res_it.row () == current_row) {
               /* Preserve words on good docs unless they are mostly rejected*/
               if (!tessedit_row_rej_good_docs && good_quality_doc) {
                 rej_word = word->reject_map.reject_count() /
                     static_cast<float>(word->reject_map.length()) >
                     tessedit_good_doc_still_rowrej_wd;
               } else if (tessedit_preserve_row_rej_perfect_wds) {
                 /* Preserve perfect words anyway */
                 rej_word = word->reject_map.reject_count() > 0 ||
                     word->reject_map.length () < tessedit_preserve_min_wd_len;
                 // Same second-chance test as for block rejection, controlled
                 // by tessedit_dont_rowrej_good_wds.
                 if (rej_word && tessedit_dont_rowrej_good_wds &&
                     word->reject_map.length() >= tessedit_preserve_min_wd_len &&
                     acceptable_word_string(*word->uch_set,
                         word->best_choice->unichar_string().string(),
                         word->best_choice->unichar_lengths().string()) !=
                             AC_UNACCEPTABLE) {
                   word_char_quality(word, page_res_it.row()->row,
                                     &char_quality,
                                     &accepted_char_quality);
                   rej_word = char_quality != word->reject_map.length();
                 }
               } else {
                 rej_word = TRUE;
               }
               if (rej_word) {
                 /*
                   Reject spacing if both current and prev words are rejected.
                   NOTE - this is NOT restricted to FUZZY spaces. - When tried
                   this generated more space errors.
                 */
                 if (tessedit_use_reject_spaces &&
                     prev_word_rejected &&
                     page_res_it.prev_row() == page_res_it.row() &&
                     word->word->space () == 1)
                   word->reject_spaces = TRUE;
                 word->reject_map.rej_word_row_rej();
               }
               prev_word_rejected = rej_word;
               page_res_it.forward();
             }
           } else {
             if (tessedit_debug_block_rejection) {
               tprintf("NOT REJECTING ROW %d #chars: %d  # Rejects: %d; \n",
                       row_no, current_row->char_count, current_row->rej_count);
             }
             // Row kept: just step over its words.
             while (page_res_it.word() != NULL &&
                    page_res_it.row() == current_row)
               page_res_it.forward();
           }
         }
       }
     }
   }
 }

◆ dont_allow_1Il()

void tesseract::Tesseract::dont_allow_1Il ( WERD_RES * word )

Definition at line 526 of file reject.cpp.

                                              {
   int i = 0;
   int offset;
   int word_len = word->reject_map.length();
   const char *s = word->best_choice->unichar_string().string();
   const char *lengths = word->best_choice->unichar_lengths().string();
   BOOL8 accepted_1Il = FALSE;
 
   for (i = 0, offset = 0; i < word_len;
        offset += word->best_choice->unichar_lengths()[i++]) {
     if (word->reject_map[i].accepted()) {
       if (STRING(conflict_set_I_l_1).contains(s[offset])) {
         accepted_1Il = TRUE;
       } else {
         if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
             word->uch_set->get_isdigit(s + offset, lengths[i]))
           return;                // >=1 non 1Il ch accepted
       }
     }
   }
   if (!accepted_1Il)
     return;                      //Nothing to worry about
 
   for (i = 0, offset = 0; i < word_len;
        offset += word->best_choice->unichar_lengths()[i++]) {
     if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
       word->reject_map[i].accepted())
       word->reject_map[i].setrej_postNN_1Il();
   }
 }

◆ dump_words()

void tesseract::Tesseract::dump_words	(	WERD_RES_LIST &	perm,
		inT16	score,
		inT16	mode,
		BOOL8	improved
	)

Definition at line 449 of file fixspace.cpp.

                                                        {
   WERD_RES_IT word_res_it(&perm);
 
   if (debug_fix_space_level > 0) {
     if (mode == 1) {
       stats_.dump_words_str = "";
       for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
            word_res_it.forward()) {
         if (!word_res_it.data()->part_of_combo) {
           stats_.dump_words_str +=
               word_res_it.data()->best_choice->unichar_string();
           stats_.dump_words_str += ' ';
         }
       }
     }
 
     if (debug_fix_space_level > 1) {
       switch (mode) {
         case 1:
           tprintf("EXTRACTED (%d): \"", score);
           break;
         case 2:
           tprintf("TESTED (%d): \"", score);
           break;
         case 3:
           tprintf("RETURNED (%d): \"", score);
           break;
       }
 
       for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
            word_res_it.forward()) {
         if (!word_res_it.data()->part_of_combo) {
           tprintf("%s/%1d ",
                   word_res_it.data()->best_choice->unichar_string().string(),
                   (int)word_res_it.data()->best_choice->permuter());
         }
       }
       tprintf("\"\n");
     } else if (improved) {
       tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
       for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
            word_res_it.forward()) {
         if (!word_res_it.data()->part_of_combo) {
           tprintf("%s/%1d ",
                   word_res_it.data()->best_choice->unichar_string().string(),
                   (int)word_res_it.data()->best_choice->permuter());
         }
       }
       tprintf("\"\n");
     }
   }
 }

◆ end_tesseract()

void tesseract::Tesseract::end_tesseract ( )

Definition at line 468 of file tessedit.cpp.

                               {
   // Forwards directly to end_recog(); no other work is done here.
   end_recog();
 }

◆ eval_word_spacing()

inT16 tesseract::Tesseract::eval_word_spacing ( WERD_RES_LIST & word_res_list )

Definition at line 239 of file fixspace.cpp.

                                                                {
   WERD_RES_IT word_res_it(&word_res_list);
   inT16 total_score = 0;
   inT16 word_count = 0;
   inT16 done_word_count = 0;
   inT16 word_len;
   inT16 i;
   inT16 offset;
   WERD_RES *word;                 // current word
   inT16 prev_word_score = 0;
   BOOL8 prev_word_done = FALSE;
   BOOL8 prev_char_1 = FALSE;      // prev ch a "1/I/l"?
   BOOL8 prev_char_digit = FALSE;  // prev ch 2..9 or 0
   BOOL8 current_char_1 = FALSE;
   BOOL8 current_word_ok_so_far;
   STRING punct_chars = "!\"`',.:;";
   BOOL8 prev_char_punct = FALSE;
   BOOL8 current_char_punct = FALSE;
   BOOL8 word_done = FALSE;
 
   do {
     word = word_res_it.data();
     word_done = fixspace_thinks_word_done(word);
     word_count++;
     if (word->tess_failed) {
       total_score += prev_word_score;
       if (prev_word_done)
         done_word_count++;
       prev_word_score = 0;
       prev_char_1 = FALSE;
       prev_char_digit = FALSE;
       prev_word_done = FALSE;
     } else {
       /*
         Can we add the prev word score and potentially count this word?
         Yes IF it didn't end in a 1 when the first char of this word is a digit
           AND it didn't end in a digit when the first char of this word is a 1
       */
       word_len = word->reject_map.length();
       current_word_ok_so_far = FALSE;
       if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
             (prev_char_digit && (
                 (word_done &&
                  word->best_choice->unichar_lengths().string()[0] == 1 &&
                  word->best_choice->unichar_string()[0] == '1') ||
                 (!word_done && STRING(conflict_set_I_l_1).contains(
                       word->best_choice->unichar_string()[0])))))) {
         total_score += prev_word_score;
         if (prev_word_done)
           done_word_count++;
         current_word_ok_so_far = word_done;
       }
 
       if (current_word_ok_so_far) {
         prev_word_done = TRUE;
         prev_word_score = word_len;
       } else {
         prev_word_done = FALSE;
         prev_word_score = 0;
       }
 
       /* Add 1 to total score for every joined 1 regardless of context and
          rejtn */
       for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
         current_char_1 = word->best_choice->unichar_string()[i] == '1';
         if (prev_char_1 || (current_char_1 && (i > 0)))
           total_score++;
         prev_char_1 = current_char_1;
       }
 
       /* Add 1 to total score for every joined punctuation regardless of context
         and rejtn */
       if (tessedit_prefer_joined_punct) {
         for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len;
              offset += word->best_choice->unichar_lengths()[i++]) {
           current_char_punct =
             punct_chars.contains(word->best_choice->unichar_string()[offset]);
           if (prev_char_punct || (current_char_punct && i > 0))
             total_score++;
           prev_char_punct = current_char_punct;
         }
       }
       prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
       for (i = 0, offset = 0; i < word_len - 1;
            offset += word->best_choice->unichar_lengths()[i++]);
       prev_char_1 =
           ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
            || (!word_done && STRING(conflict_set_I_l_1).contains(
                    word->best_choice->unichar_string()[offset])));
     }
     /* Find next word */
     do {
       word_res_it.forward();
     } while (word_res_it.data()->part_of_combo);
   } while (!word_res_it.at_first());
   total_score += prev_word_score;
   if (prev_word_done)
     done_word_count++;
   if (done_word_count == word_count)
     return PERFECT_WERDS;
   else
     return total_score;
 }

◆ failure_count()

inT16 tesseract::Tesseract::failure_count ( WERD_RES * word )

Definition at line 970 of file docqual.cpp.

                                              {
   const char *str = word->best_choice->unichar_string().string();
   int tess_rejs = 0;
 
   for (; *str != '\0'; str++) {
     if (*str == ' ')
       tess_rejs++;
   }
   return tess_rejs;
 }

◆ FindSegmentation()

bool tesseract::Tesseract::FindSegmentation	(	const GenericVector< UNICHAR_ID > &	target_text,
		WERD_RES *	word_res
	)

Resegments the word to achieve the target_text from the classifier. Returns false if the re-segmentation fails. Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and applies a full search on the classifier results to find the best classified segmentation. As a compromise to obtain better recall, 1-1 ambiguity substitutions ARE used.

Definition at line 559 of file applybox.cpp.

                                                      {
   // Classify all required combinations of blobs and save results in choices.
   int word_length = word_res->box_word->length();
   GenericVector<BLOB_CHOICE_LIST*>* choices =
       new GenericVector<BLOB_CHOICE_LIST*>[word_length];
   for (int i = 0; i < word_length; ++i) {
     for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
       BLOB_CHOICE_LIST* match_result = classify_piece(
           word_res->seam_array, i, i + j - 1, "Applybox",
           word_res->chopped_word, word_res->blamer_bundle);
       if (applybox_debug > 2) {
         tprintf("%d+%d:", i, j);
         print_ratings_list("Segment:", match_result, unicharset);
       }
       choices[i].push_back(match_result);
     }
   }
   // Search the segmentation graph for the target text. Must be an exact
   // match. Using wildcards makes it difficult to find the correct
   // segmentation even when it is there.
   word_res->best_state.clear();
   GenericVector<int> search_segmentation;
   float best_rating = 0.0f;
   SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
                 &search_segmentation, &best_rating, &word_res->best_state);
   for (int i = 0; i < word_length; ++i)
     choices[i].delete_data_pointers();
   delete [] choices;
   if (word_res->best_state.empty()) {
     // Build the original segmentation and if it is the same length as the
     // truth, assume it will do.
     int blob_count = 1;
     for (int s = 0; s < word_res->seam_array.size(); ++s) {
       SEAM* seam = word_res->seam_array[s];
       if (!seam->HasAnySplits()) {
         word_res->best_state.push_back(blob_count);
         blob_count = 1;
       } else {
         ++blob_count;
       }
     }
     word_res->best_state.push_back(blob_count);
     if (word_res->best_state.size() != target_text.size()) {
       word_res->best_state.clear();  // No good. Original segmentation bad size.
       return false;
     }
   }
   word_res->correct_text.clear();
   for (int i = 0; i < target_text.size(); ++i) {
     word_res->correct_text.push_back(
         STRING(unicharset.id_to_unichar(target_text[i])));
   }
   return true;
 }

◆ first_alphanum_index()

inT16 tesseract::Tesseract::first_alphanum_index	(	const char *	word,
		const char *	word_lengths
	)

Definition at line 469 of file reject.cpp.

                                                                 {
   inT16 i;
   inT16 offset;
 
   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
     if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
         unicharset.get_isdigit(word + offset, word_lengths[i]))
       return i;
   }
   return -1;
 }

◆ first_alphanum_offset()

inT16 tesseract::Tesseract::first_alphanum_offset	(	const char *	word,
		const char *	word_lengths
	)

Definition at line 482 of file reject.cpp.

                                                                  {
   inT16 i;
   inT16 offset;
 
   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
     if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
         unicharset.get_isdigit(word + offset, word_lengths[i]))
       return offset;
   }
   return -1;
 }

◆ fix_fuzzy_space_list()

void tesseract::Tesseract::fix_fuzzy_space_list	(	WERD_RES_LIST &	best_perm,
		ROW *	row,
		BLOCK *	block
	)

Definition at line 145 of file fixspace.cpp.

                                                    {
   inT16 best_score;
   WERD_RES_LIST current_perm;
   inT16 current_score;
   BOOL8 improved = FALSE;
 
   best_score = eval_word_spacing(best_perm);  // default score
   dump_words(best_perm, best_score, 1, improved);
 
   if (best_score != PERFECT_WERDS)
     initialise_search(best_perm, current_perm);
 
   while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
     match_current_words(current_perm, row, block);
     current_score = eval_word_spacing(current_perm);
     dump_words(current_perm, current_score, 2, improved);
     if (current_score > best_score) {
       best_perm.clear();
       best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
       best_score = current_score;
       improved = TRUE;
     }
     if (current_score < PERFECT_WERDS)
       transform_to_next_perm(current_perm);
   }
   dump_words(best_perm, best_score, 3, improved);
 }

◆ fix_fuzzy_spaces()

void tesseract::Tesseract::fix_fuzzy_spaces	(	ETEXT_DESC *	monitor,
		inT32	word_count,
		PAGE_RES *	page_res
	)

Definition at line 48 of file fixspace.cpp.

                                                      {
   // Walks every row of every block on the page and repairs word spacing.
   //   - A word that is not followed by a fuzzy space (the next word has
   //     neither W_FUZZY_NON nor W_FUZZY_SP set) is handled on its own by
   //     fix_sp_fp_word.
   //   - A run of words joined by fuzzy spaces is cut out into
   //     fuzzy_space_words, passed to fix_fuzzy_space_list, and the result
   //     is inserted back into the row. Runs that include a word with no
   //     blobs are skipped.
   // When monitor is non-NULL, progress is reported in the range 90..95 and
   // the function returns early if the deadline is exceeded or the cancel
   // callback returns true.
   BLOCK_RES_IT block_res_it;
   ROW_RES_IT row_res_it;
   WERD_RES_IT word_res_it_from;
   WERD_RES_IT word_res_it_to;
   WERD_RES *word_res;
   WERD_RES_LIST fuzzy_space_words;
   inT16 new_length;
   BOOL8 prevent_null_wd_fixsp;   // DON'T process blobless wds
   inT32 word_index;              // current word
 
   block_res_it.set_to_list(&page_res->block_res_list);
   word_index = 0;
   for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
        block_res_it.forward()) {
     row_res_it.set_to_list(&block_res_it.data()->row_res_list);
     for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
          row_res_it.forward()) {
       word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
       while (!word_res_it_from.at_last()) {
         word_res = word_res_it_from.data();
         // Process single words until one is a combination or is followed
         // by a fuzzy space.
         while (!word_res_it_from.at_last() &&
                !(word_res->combination ||
                  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
                  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
           fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
                          block_res_it.data()->block);
           word_res = word_res_it_from.forward();
           word_index++;
           if (monitor != NULL) {
             monitor->ocr_alive = TRUE;
             monitor->progress = 90 + 5 * word_index / word_count;
             if (monitor->deadline_exceeded() ||
                 (monitor->cancel != NULL &&
                  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
             return;
           }
         }
 
         if (!word_res_it_from.at_last()) {
           // word_res_it_from marks the start of the run; word_res_it_to is
           // advanced to its last word.
           word_res_it_to = word_res_it_from;
           prevent_null_wd_fixsp =
             word_res->word->cblob_list()->empty();
           if (check_debug_pt(word_res, 60))
             debug_fix_space_level.set_value(10);
           word_res_it_to.forward();
           word_index++;
           if (monitor != NULL) {
             monitor->ocr_alive = TRUE;
             monitor->progress = 90 + 5 * word_index / word_count;
             if (monitor->deadline_exceeded() ||
                 (monitor->cancel != NULL &&
                  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
             return;
           }
           while (!word_res_it_to.at_last () &&
                  (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
                   word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
             if (check_debug_pt(word_res, 60))
               debug_fix_space_level.set_value(10);
             if (word_res->word->cblob_list()->empty())
               prevent_null_wd_fixsp = TRUE;
             word_res = word_res_it_to.forward();
           }
           if (check_debug_pt(word_res, 60))
             debug_fix_space_level.set_value(10);
           if (word_res->word->cblob_list()->empty())
             prevent_null_wd_fixsp = TRUE;
           if (prevent_null_wd_fixsp) {
             // Skip the whole run without touching it.
             word_res_it_from = word_res_it_to;
           } else {
             fuzzy_space_words.assign_to_sublist(&word_res_it_from,
                                                 &word_res_it_to);
             fix_fuzzy_space_list(fuzzy_space_words,
                                  row_res_it.data()->row,
                                  block_res_it.data()->block);
             new_length = fuzzy_space_words.length();
             word_res_it_from.add_list_before(&fuzzy_space_words);
             // Step forward once per re-inserted word, stopping early at the
             // last word of the row.
             for (;
                  !word_res_it_from.at_last() && new_length > 0;
                  new_length--) {
               word_res_it_from.forward();
             }
           }
           if (test_pt)
             debug_fix_space_level.set_value(0);
         }
         fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
                        block_res_it.data()->block);
         // Last word in row
       }
     }
   }
 }

◆ fix_noisy_space_list()

void tesseract::Tesseract::fix_noisy_space_list	(	WERD_RES_LIST &	best_perm,
		ROW *	row,
		BLOCK *	block
	)

Definition at line 569 of file fixspace.cpp.

                                                    {
   inT16 best_score;
   WERD_RES_IT best_perm_it(&best_perm);
   WERD_RES_LIST current_perm;
   WERD_RES_IT current_perm_it(&current_perm);
   WERD_RES *old_word_res;
   inT16 current_score;
   BOOL8 improved = FALSE;
 
   best_score = fp_eval_word_spacing(best_perm);  // default score
 
   dump_words(best_perm, best_score, 1, improved);
 
   old_word_res = best_perm_it.data();
   // Even deep_copy doesn't copy the underlying WERD unless its combination
   // flag is true!.
   old_word_res->combination = TRUE;   // Kludge to force deep copy
   current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
   old_word_res->combination = FALSE;  // Undo kludge
 
   break_noisiest_blob_word(current_perm);
 
   while (best_score != PERFECT_WERDS && !current_perm.empty()) {
     match_current_words(current_perm, row, block);
     current_score = fp_eval_word_spacing(current_perm);
     dump_words(current_perm, current_score, 2, improved);
     if (current_score > best_score) {
       best_perm.clear();
       best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
       best_score = current_score;
       improved = TRUE;
     }
     if (current_score < PERFECT_WERDS) {
       break_noisiest_blob_word(current_perm);
     }
   }
   dump_words(best_perm, best_score, 3, improved);
 }

◆ fix_rep_char()

void tesseract::Tesseract::fix_rep_char ( PAGE_RES_IT * page_res_it )

fix_rep_char(): The word is a repeated character (a leader). Find the repeated character, create the appropriate single-word or multi-word sequence according to the size of the spaces between blobs, and correct the classifications where some of the characters disagree with the majority.

Definition at line 1651 of file control.cpp.

                                                      {
   WERD_RES *word_res = page_res_it->word();
   const WERD_CHOICE &word = *(word_res->best_choice);
 
   // Find the frequency of each unique character in the word.
   SortHelper<UNICHAR_ID> rep_ch(word.length());
   for (int i = 0; i < word.length(); ++i) {
     rep_ch.Add(word.unichar_id(i), 1);
   }
 
   // Find the most frequent result.
   UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
   int max_count = rep_ch.MaxCount(&maxch_id);
   // Find the best exemplar of a classifier result for maxch_id.
   BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
   if (best_choice == NULL) {
     tprintf("Failed to find a choice for %s, occurring %d times\n",
             word_res->uch_set->debug_str(maxch_id).string(), max_count);
     return;
   }
   word_res->done = TRUE;
 
   // Measure the mean space.
   int gap_count = 0;
   WERD* werd = word_res->word;
   C_BLOB_IT blob_it(werd->cblob_list());
   C_BLOB* prev_blob = blob_it.data();
   for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
     C_BLOB* blob = blob_it.data();
     int gap = blob->bounding_box().left();
     gap -= prev_blob->bounding_box().right();
     ++gap_count;
     prev_blob = blob;
   }
   // Just correct existing classification.
   CorrectRepcharChoices(best_choice, word_res);
   word_res->reject_map.initialise(word.length());
 }

◆ fix_sp_fp_word()

void tesseract::Tesseract::fix_sp_fp_word	(	WERD_RES_IT &	word_res_it,
		ROW *	row,
		BLOCK *	block
	)

Definition at line 535 of file fixspace.cpp.

                                              {
   WERD_RES *word_res;
   WERD_RES_LIST sub_word_list;
   WERD_RES_IT sub_word_list_it(&sub_word_list);
   inT16 blob_index;
   inT16 new_length;
   float junk;
 
   word_res = word_res_it.data();
   if (word_res->word->flag(W_REP_CHAR) ||
       word_res->combination ||
       word_res->part_of_combo ||
       !word_res->word->flag(W_DONT_CHOP))
     return;
 
   blob_index = worst_noise_blob(word_res, &junk);
   if (blob_index < 0)
     return;
 
   if (debug_fix_space_level > 1) {
     tprintf("FP fixspace working on \"%s\"\n",
             word_res->best_choice->unichar_string().string());
   }
   word_res->word->rej_cblob_list()->sort(c_blob_comparator);
   sub_word_list_it.add_after_stay_put(word_res_it.extract());
   fix_noisy_space_list(sub_word_list, row, block);
   new_length = sub_word_list.length();
   word_res_it.add_list_before(&sub_word_list);
   for (; !word_res_it.at_last() && new_length > 1; new_length--) {
     word_res_it.forward();
   }
 }

◆ fixspace_thinks_word_done()

BOOL8 tesseract::Tesseract::fixspace_thinks_word_done ( WERD_RES * word )

Definition at line 503 of file fixspace.cpp.

                                                          {
   if (word->done)
     return TRUE;
 
   /*
     Use all the standard pass 2 conditions for mode 5 in set_done() in
     reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
     CARE WHETHER WE HAVE of/at on/an etc.
   */
   if (fixsp_done_mode > 0 &&
       (word->tess_accepted ||
        (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
        fixsp_done_mode == 3) &&
       (strchr(word->best_choice->unichar_string().string(), ' ') == NULL) &&
       ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
        (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
        (word->best_choice->permuter() == USER_DAWG_PERM) ||
        (word->best_choice->permuter() == NUMBER_PERM))) {
     return TRUE;
   } else {
     return FALSE;
   }
 }

◆ flip_0O()

void tesseract::Tesseract::flip_0O ( WERD_RES * word )

Definition at line 673 of file reject.cpp.

                                             {
   // Resolves confusions between the digit "0" and the capital letter "O" in
   // word_res->best_choice by looking at the neighbouring characters.
   // Does nothing unless tessedit_flip_0O is set. The rules below are applied
   // in order and may advance i themselves, so their sequence matters.
   WERD_CHOICE *best_choice = word_res->best_choice;
   int i;
   TBOX out_box;
 
   if (!tessedit_flip_0O)
     return;
 
   // First pass: give up entirely if any upper-case or digit blob has a box
   // whose top is below baseline + x-height or whose bottom is above
   // baseline + x-height / 4.
   int num_blobs = word_res->rebuild_word->NumBlobs();
   for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
     TBLOB* blob = word_res->rebuild_word->blobs[i];
     if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
         word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
       out_box = blob->bounding_box();
       if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
         (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
         return;                  //Beware words with sub/superscripts
     }
   }
   UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
   UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
   if (unichar_0 == INVALID_UNICHAR_ID ||
       !word_res->uch_set->get_enabled(unichar_0) ||
       unichar_O == INVALID_UNICHAR_ID ||
       !word_res->uch_set->get_enabled(unichar_O)) {
     return;  // 0 or O are not present/enabled in unicharset
   }
   // Second pass starts at index 1 so that position i-1 always exists; every
   // look-ahead (i+1, i+2) and the i-2 look-behind is range-checked first.
   for (i = 1; i < best_choice->length(); ++i) {
     if (best_choice->unichar_id(i) == unichar_0 ||
         best_choice->unichar_id(i) == unichar_O) {
       /* A0A */
       if ((i+1) < best_choice->length() &&
           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
         best_choice->set_unichar_id(unichar_O, i);
       }
       /* A00A */
       if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
           (i+1) < best_choice->length() &&
           (best_choice->unichar_id(i+1) == unichar_0 ||
            best_choice->unichar_id(i+1) == unichar_O) &&
           (i+2) < best_choice->length() &&
           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
         best_choice->set_unichar_id(unichar_O, i);
         i++;
       }
       /* AA0<non digit or end of word> */
       if ((i > 1) &&
           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
           non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
           (((i+1) < best_choice->length() &&
             !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
             !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
             !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
            (i == best_choice->length() - 1))) {
         best_choice->set_unichar_id(unichar_O, i);
       }
       /* 9O9 */
       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
           (i+1) < best_choice->length() &&
           non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
         best_choice->set_unichar_id(unichar_0, i);
       }
       /* 9OOO */
       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
           (i+2) < best_choice->length() &&
           (best_choice->unichar_id(i+1) == unichar_0 ||
            best_choice->unichar_id(i+1) == unichar_O) &&
           (best_choice->unichar_id(i+2) == unichar_0 ||
            best_choice->unichar_id(i+2) == unichar_O)) {
         best_choice->set_unichar_id(unichar_0, i);
         best_choice->set_unichar_id(unichar_0, i+1);
         best_choice->set_unichar_id(unichar_0, i+2);
         i += 2;
       }
       /* 9OO<non upper> */
       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
           (i+2) < best_choice->length() &&
           (best_choice->unichar_id(i+1) == unichar_0 ||
           best_choice->unichar_id(i+1) == unichar_O) &&
           !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
         best_choice->set_unichar_id(unichar_0, i);
         best_choice->set_unichar_id(unichar_0, i+1);
         i++;
       }
       /* 9O<non upper> */
       if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
           (i+1) < best_choice->length() &&
           !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
         best_choice->set_unichar_id(unichar_0, i);
       }
       /* 9[.,]OOO.. */
       if ((i > 1) &&
           (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
               word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
           (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
            best_choice->unichar_id(i-2) == unichar_O)) {
         if (best_choice->unichar_id(i-2) == unichar_O) {
           best_choice->set_unichar_id(unichar_0, i-2);
         }
         // Turn the whole following run of 0/O characters into zeros.
         while (i < best_choice->length() &&
                (best_choice->unichar_id(i) == unichar_O ||
                 best_choice->unichar_id(i) == unichar_0)) {
           best_choice->set_unichar_id(unichar_0, i);
           i++;
         }
         // Step back one so the for loop's ++i resumes at the first
         // character after the run.
         i--;
       }
     }
   }
 }

◆ flip_hyphens()

void tesseract::Tesseract::flip_hyphens ( WERD_RES * word )

Definition at line 616 of file reject.cpp.

                                                  {
   // Uses the width/height ratio of each blob to accept, flip or flag
   // characters classified as "." or "-" in word_res->best_choice.
   // Returns immediately when tessedit_lower_flip_hyphen is <= 1.
   WERD_CHOICE *best_choice = word_res->best_choice;
   int i;
   int prev_right = -9999;  // sentinel: nothing lies to the left of blob 0
   int next_left;
   TBOX out_box;
   float aspect_ratio;
 
   if (tessedit_lower_flip_hyphen <= 1)
     return;
 
   int num_blobs = word_res->rebuild_word->NumBlobs();
   UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
   for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
     TBLOB* blob = word_res->rebuild_word->blobs[i];
     out_box = blob->bounding_box();
     // For the last blob use a sentinel so nothing lies to its right.
     if (i + 1 == num_blobs)
       next_left = 9999;
     else
       next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
     // Don't touch small or touching blobs - it is too dangerous.
     if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
         (out_box.left() > prev_right) && (out_box.right() < next_left)) {
       aspect_ratio = out_box.width() / (float) out_box.height();
       if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
         // A "." at or above the upper ratio is replaced by "-", provided the
         // unicharset contains "-" and has it enabled.
         if (aspect_ratio >= tessedit_upper_flip_hyphen &&
             word_res->uch_set->contains_unichar_id(unichar_dash) &&
             word_res->uch_set->get_enabled(unichar_dash)) {
           /* Certain HYPHEN */
           best_choice->set_unichar_id(unichar_dash, i);
           if (word_res->reject_map[i].rejected())
             word_res->reject_map[i].setrej_hyphen_accept();
         }
         // This check also runs after a flip above; it only fires if the
         // character is accepted in the reject map.
         if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
           word_res->reject_map[i].accepted())
                                  //Suspected HYPHEN
           word_res->reject_map[i].setrej_hyphen ();
       }
       else if (best_choice->unichar_id(i) == unichar_dash) {
         // Already a "-": a ratio at or above the upper limit un-rejects it.
         if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
           (word_res->reject_map[i].rejected()))
           word_res->reject_map[i].setrej_hyphen_accept();
         //Certain HYPHEN
 
         // A ratio at or below the lower limit flags an accepted "-".
         if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
           (word_res->reject_map[i].accepted()))
                                  //Suspected HYPHEN
           word_res->reject_map[i].setrej_hyphen();
       }
     }
     prev_right = out_box.right();
   }
 }

◆ font_recognition_pass()

void tesseract::Tesseract::font_recognition_pass ( PAGE_RES * page_res )

font_recognition_pass

Smooth the fonts for the document.

Definition at line 1985 of file control.cpp.

                                                         {
   PAGE_RES_IT page_res_it(page_res);
   WERD_RES *word;                // current word
   STATS doc_fonts(0, font_table_size_);           // font counters
 
   // Gather font id statistics.
   for (page_res_it.restart_page(); page_res_it.word() != NULL;
        page_res_it.forward()) {
     word = page_res_it.word();
     if (word->fontinfo != NULL) {
       doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
     }
     if (word->fontinfo2 != NULL) {
       doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
     }
   }
   inT16 doc_font;                 // modal font
   int8_t doc_font_count;           // modal font
   find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
   if (doc_font_count == 0)
     return;
   // Get the modal font pointer.
   const FontInfo* modal_font = NULL;
   for (page_res_it.restart_page(); page_res_it.word() != NULL;
        page_res_it.forward()) {
     word = page_res_it.word();
     if (word->fontinfo != NULL && word->fontinfo->universal_id == doc_font) {
       modal_font = word->fontinfo;
       break;
     }
     if (word->fontinfo2 != NULL && word->fontinfo2->universal_id == doc_font) {
       modal_font = word->fontinfo2;
       break;
     }
   }
   ASSERT_HOST(modal_font != NULL);
 
   // Assign modal font to weak words.
   for (page_res_it.restart_page(); page_res_it.word() != NULL;
        page_res_it.forward()) {
     word = page_res_it.word();
     int length = word->best_choice->length();
 
     int count = word->fontinfo_id_count;
     if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
       word->fontinfo = modal_font;
       // Counts only get 1 as it came from the doc.
       word->fontinfo_id_count = 1;
       word->italic = modal_font->is_italic() ? 1 : -1;
       word->bold = modal_font->is_bold() ? 1 : -1;
     }
   }
 }

◆ fp_eval_word_spacing()

inT16 tesseract::Tesseract::fp_eval_word_spacing ( WERD_RES_LIST & word_res_list )

Definition at line 830 of file fixspace.cpp.

                                                                   {
   WERD_RES_IT word_it(&word_res_list);
   WERD_RES *word;
   inT16 score = 0;
   inT16 i;
   float small_limit = kBlnXHeight * fixsp_small_outlines_size;
 
   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
     word = word_it.data();
     if (word->rebuild_word == NULL)
       continue;  // Can't handle cube words.
     if (word->done ||
         word->tess_accepted ||
         word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
         word->best_choice->permuter() == FREQ_DAWG_PERM ||
         word->best_choice->permuter() == USER_DAWG_PERM ||
         safe_dict_word(word) > 0) {
       int num_blobs = word->rebuild_word->NumBlobs();
       UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
       for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
         TBLOB* blob = word->rebuild_word->blobs[i];
         if (word->best_choice->unichar_id(i) == space ||
             blob_noise_score(blob) < small_limit) {
           score -= 1;  // penalise possibly erroneous non-space
         } else if (word->reject_map[i].accepted()) {
           score++;
         }
       }
     }
   }
   if (score < 0)
     score = 0;
   return score;
 }

◆ garbage_word()

GARBAGE_LEVEL tesseract::Tesseract::garbage_word	(	WERD_RES *	word,
		BOOL8	ok_dict_word
	)

Definition at line 684 of file docqual.cpp.

                                                                         {
   enum STATES
   {
     JUNK,
     FIRST_UPPER,
     FIRST_LOWER,
     FIRST_NUM,
     SUBSEQUENT_UPPER,
     SUBSEQUENT_LOWER,
     SUBSEQUENT_NUM
   };
   const char *str = word->best_choice->unichar_string().string();
   const char *lengths = word->best_choice->unichar_lengths().string();
   STATES state = JUNK;
   int len = 0;
   int isolated_digits = 0;
   int isolated_alphas = 0;
   int bad_char_count = 0;
   int tess_rejs = 0;
   int dodgy_chars = 0;
   int ok_chars;
   UNICHAR_ID last_char = -1;
   int alpha_repetition_count = 0;
   int longest_alpha_repetition_count = 0;
   int longest_lower_run_len = 0;
   int lower_string_count = 0;
   int longest_upper_run_len = 0;
   int upper_string_count = 0;
   int total_alpha_count = 0;
   int total_digit_count = 0;
 
   for (; *str != '\0'; str += *(lengths++)) {
     len++;
     if (word->uch_set->get_isupper (str, *lengths)) {
       total_alpha_count++;
       switch (state) {
         case SUBSEQUENT_UPPER:
         case FIRST_UPPER:
           state = SUBSEQUENT_UPPER;
           upper_string_count++;
           if (longest_upper_run_len < upper_string_count)
             longest_upper_run_len = upper_string_count;
           if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
             alpha_repetition_count++;
             if (longest_alpha_repetition_count < alpha_repetition_count) {
               longest_alpha_repetition_count = alpha_repetition_count;
             }
           }
           else {
             last_char = word->uch_set->unichar_to_id(str, *lengths);
             alpha_repetition_count = 1;
           }
           break;
         case FIRST_NUM:
           isolated_digits++;
         default:
           state = FIRST_UPPER;
           last_char = word->uch_set->unichar_to_id(str, *lengths);
           alpha_repetition_count = 1;
           upper_string_count = 1;
           break;
       }
     }
     else if (word->uch_set->get_islower (str, *lengths)) {
       total_alpha_count++;
       switch (state) {
         case SUBSEQUENT_LOWER:
         case FIRST_LOWER:
           state = SUBSEQUENT_LOWER;
           lower_string_count++;
           if (longest_lower_run_len < lower_string_count)
             longest_lower_run_len = lower_string_count;
           if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
             alpha_repetition_count++;
             if (longest_alpha_repetition_count < alpha_repetition_count) {
               longest_alpha_repetition_count = alpha_repetition_count;
             }
           }
           else {
             last_char = word->uch_set->unichar_to_id(str, *lengths);
             alpha_repetition_count = 1;
           }
           break;
         case FIRST_NUM:
           isolated_digits++;
         default:
           state = FIRST_LOWER;
           last_char = word->uch_set->unichar_to_id(str, *lengths);
           alpha_repetition_count = 1;
           lower_string_count = 1;
           break;
       }
     }
     else if (word->uch_set->get_isdigit (str, *lengths)) {
       total_digit_count++;
       switch (state) {
         case FIRST_NUM:
           state = SUBSEQUENT_NUM;
         case SUBSEQUENT_NUM:
           break;
         case FIRST_UPPER:
         case FIRST_LOWER:
           isolated_alphas++;
         default:
           state = FIRST_NUM;
           break;
       }
     }
     else {
       if (*lengths == 1 && *str == ' ')
         tess_rejs++;
       else
         bad_char_count++;
       switch (state) {
         case FIRST_NUM:
           isolated_digits++;
           break;
         case FIRST_UPPER:
         case FIRST_LOWER:
           isolated_alphas++;
         default:
           break;
       }
       state = JUNK;
     }
   }
 
   switch (state) {
     case FIRST_NUM:
       isolated_digits++;
       break;
     case FIRST_UPPER:
     case FIRST_LOWER:
       isolated_alphas++;
     default:
       break;
   }
 
   if (crunch_include_numerals) {
     total_alpha_count += total_digit_count - isolated_digits;
   }
 
   if (crunch_leave_ok_strings && len >= 4 &&
       2 * (total_alpha_count - isolated_alphas) > len &&
       longest_alpha_repetition_count < crunch_long_repetitions) {
     if ((crunch_accept_ok &&
          acceptable_word_string(*word->uch_set, str, lengths) !=
              AC_UNACCEPTABLE) ||
         longest_lower_run_len > crunch_leave_lc_strings ||
         longest_upper_run_len > crunch_leave_uc_strings)
       return G_NEVER_CRUNCH;
   }
   if (word->reject_map.length() > 1 &&
       strpbrk(str, " ") == NULL &&
       (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
        word->best_choice->permuter() == FREQ_DAWG_PERM ||
        word->best_choice->permuter() == USER_DAWG_PERM ||
        word->best_choice->permuter() == NUMBER_PERM ||
        acceptable_word_string(*word->uch_set, str, lengths) !=
            AC_UNACCEPTABLE || ok_dict_word))
     return G_OK;
 
   ok_chars = len - bad_char_count - isolated_digits -
     isolated_alphas - tess_rejs;
 
   if (crunch_debug > 3) {
     tprintf("garbage_word: \"%s\"\n",
             word->best_choice->unichar_string().string());
     tprintf("LEN: %d  bad: %d  iso_N: %d  iso_A: %d  rej: %d\n",
             len,
             bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
   }
   if (bad_char_count == 0 &&
       tess_rejs == 0 &&
       (len > isolated_digits + isolated_alphas || len <= 2))
     return G_OK;
 
   if (tess_rejs > ok_chars ||
       (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
     return G_TERRIBLE;
 
   if (len > 4) {
     dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
         isolated_alphas;
     if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5)
       return G_DODGY;
     else
       return G_OK;
   } else {
     dodgy_chars = 2 * tess_rejs + bad_char_count;
     if ((len == 4 && dodgy_chars > 2) ||
         (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
       return G_DODGY;
     else
       return G_OK;
   }
 }

◆ get_rep_char()

UNICHAR_ID tesseract::Tesseract::get_rep_char ( WERD_RES * word )

Definition at line 283 of file output.cpp.

                                                  {  // what char is repeated?
   int i;
   for (i = 0; ((i < word->reject_map.length()) &&
                (word->reject_map[i].rejected())); ++i);
 
   if (i < word->reject_map.length()) {
     return word->best_choice->unichar_id(i);
   } else {
     return word->uch_set->unichar_to_id(unrecognised_char.string());
   }
 }

◆ get_sub_lang()

Tesseract* tesseract::Tesseract::get_sub_lang ( int index ) const

inline

Definition at line 256 of file tesseractclass.h.

                                            {
     return sub_langs_[index];
   }

◆ GetLineData()

ImageData * tesseract::Tesseract::GetLineData	(	const TBOX &	line_box,
		const GenericVector< TBOX > &	boxes,
		const GenericVector< STRING > &	texts,
		int	start_box,
		int	end_box,
		const BLOCK &	block
	)

Definition at line 131 of file linerec.cpp.

                                                       {
   TBOX revised_box;
   ImageData* image_data = GetRectImage(line_box, block, kImagePadding,
                                        &revised_box);
   if (image_data == NULL) return NULL;
   image_data->set_page_number(applybox_page);
   // Copy the boxes and shift them so they are relative to the image.
   FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
   ICOORD shift = -revised_box.botleft();
   GenericVector<TBOX> line_boxes;
   GenericVector<STRING> line_texts;
   for (int b = start_box; b < end_box; ++b) {
     TBOX box = boxes[b];
     box.rotate(block_rotation);
     box.move(shift);
     line_boxes.push_back(box);
     line_texts.push_back(texts[b]);
   }
   GenericVector<int> page_numbers;
   page_numbers.init_to_size(line_boxes.size(), applybox_page);
   image_data->AddBoxes(line_boxes, line_texts, page_numbers);
   return image_data;
 }

◆ GetRectImage()

ImageData * tesseract::Tesseract::GetRectImage	(	const TBOX &	box,
		const BLOCK &	block,
		int	padding,
		TBOX *	revised_box
	)		const

Definition at line 165 of file linerec.cpp.

                                                                          {
   TBOX wbox = box;
   wbox.pad(padding, padding);
   *revised_box = wbox;
   // Number of clockwise 90 degree rotations needed to get back to tesseract
   // coords from the clipped image.
   int num_rotations = 0;
   if (block.re_rotation().y() > 0.0f)
     num_rotations = 1;
   else if (block.re_rotation().x() < 0.0f)
     num_rotations = 2;
   else if (block.re_rotation().y() < 0.0f)
     num_rotations = 3;
   // Handle two cases automatically: 1 the box came from the block, 2 the box
   // came from a box file, and refers to the image, which the block may not.
   if (block.bounding_box().major_overlap(*revised_box))
     revised_box->rotate(block.re_rotation());
   // Now revised_box always refers to the image.
   // BestPix is never colormapped, but may be of any depth.
   Pix* pix = BestPix();
   int width = pixGetWidth(pix);
   int height = pixGetHeight(pix);
   TBOX image_box(0, 0, width, height);
   // Clip to image bounds;
   *revised_box &= image_box;
   if (revised_box->null_box()) return NULL;
   Box* clip_box = boxCreate(revised_box->left(), height - revised_box->top(),
                             revised_box->width(), revised_box->height());
   Pix* box_pix = pixClipRectangle(pix, clip_box, NULL);
   if (box_pix == NULL) return NULL;
   boxDestroy(&clip_box);
   if (num_rotations > 0) {
     Pix* rot_pix = pixRotateOrth(box_pix, num_rotations);
     pixDestroy(&box_pix);
     box_pix = rot_pix;
   }
   // Convert sub-8-bit images to 8 bit.
   int depth = pixGetDepth(box_pix);
   if (depth < 8) {
     Pix* grey;
     grey = pixConvertTo8(box_pix, false);
     pixDestroy(&box_pix);
     box_pix = grey;
   }
   bool vertical_text = false;
   if (num_rotations > 0) {
     // Rotated the clipped revised box back to internal coordinates.
     FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
     revised_box->rotate(rotation);
     if (num_rotations != 2)
       vertical_text = true;
   }
   return new ImageData(vertical_text, box_pix);
 }

◆ GetSubAndSuperscriptCandidates()

void tesseract::Tesseract::GetSubAndSuperscriptCandidates	(	const WERD_RES *	word,
		int *	num_rebuilt_leading,
		ScriptPos *	leading_pos,
		float *	leading_certainty,
		int *	num_rebuilt_trailing,
		ScriptPos *	trailing_pos,
		float *	trailing_certainty,
		float *	avg_certainty,
		float *	unlikely_threshold
	)

Determine how many characters (rebuilt blobs) on each end of a given word might plausibly be superscripts so SubAndSuperscriptFix can try to re-recognize them. Even if we find no whole blobs at either end, we will set *unlikely_threshold to a certainty that might be used to select "bad enough" outlier characters. If *unlikely_threshold is set to 0, though, there's really no hope.

Parameters

[in]	word	The word to examine.
[out]	num_rebuilt_leading	the number of rebuilt blobs at the start of the word which are all up or down and seem badly classified.
[out]	leading_pos	"super" or "sub" (for debugging)
[out]	leading_certainty	the worst certainty in the leading blobs.
[out]	num_rebuilt_trailing	the number of rebuilt blobs at the end of the word which are all up or down and seem badly classified.
[out]	trailing_pos	"super" or "sub" (for debugging)
[out]	trailing_certainty	the worst certainty in the trailing blobs.
[out]	avg_certainty	the average certainty of "normal" blobs in the word.
[out]	unlikely_threshold	the threshold (on certainty) we used to select "bad enough" outlier characters.

Definition at line 253 of file superscript.cpp.

                                                                           {
   // Two steps: first scan the rebuilt blobs to average the certainty of the
   // normally placed characters and to measure the runs of raised/lowered
   // blobs at each end of the word; then count how many blobs of each run
   // have a certainty at or below *unlikely_threshold.
   // All outputs are zeroed first so that every early return leaves them in a
   // defined state.
   *avg_certainty = *unlikely_threshold = 0.0f;
   *num_rebuilt_leading = *num_rebuilt_trailing = 0;
   *leading_certainty = *trailing_certainty = 0.0f;
 
   // A blob whose bottom is at or above super_y_bottom is a superscript; one
   // whose top is at or below sub_y_top is a subscript.
   int super_y_bottom =
       kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
   int sub_y_top =
       kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
 
   // Step one: Get an average certainty for "normally placed" characters.
 
   // Counts here are of blobs in the rebuild_word / unichars in best_choice.
   *leading_pos = *trailing_pos = SP_NORMAL;
   int leading_outliers = 0;
   int trailing_outliers = 0;
   int num_normal = 0;
   float normal_certainty_total = 0.0f;
   float worst_normal_certainty = 0.0f;
   ScriptPos last_pos = SP_NORMAL;
   int num_blobs = word->rebuild_word->NumBlobs();
   for (int b = 0; b < num_blobs; ++b) {
     TBOX box = word->rebuild_word->blobs[b]->bounding_box();
     ScriptPos pos = SP_NORMAL;
     if (box.bottom() >= super_y_bottom) {
       pos = SP_SUPERSCRIPT;
     } else if (box.top() <= sub_y_top) {
       pos = SP_SUBSCRIPT;
     }
     if (pos == SP_NORMAL) {
       // NOTE(review): unichar id 0 is skipped here - presumably the reserved
       // space/reject id; confirm against the unicharset.
       if (word->best_choice->unichar_id(b) != 0) {
         float char_certainty = word->best_choice->certainty(b);
         // Track the lowest certainty seen among the normal characters.
         if (char_certainty < worst_normal_certainty) {
           worst_normal_certainty = char_certainty;
         }
         num_normal++;
         normal_certainty_total += char_certainty;
       }
       // If the current run of outliers covers every blob before b, it is the
       // leading run of the word.
       if (trailing_outliers == b) {
         leading_outliers = trailing_outliers;
         *leading_pos = last_pos;
       }
       trailing_outliers = 0;
     } else {
       // Extend the current run, or start a new one if the position type
       // changed (super <-> sub, or normal -> outlier).
       if (last_pos == pos) {
         trailing_outliers++;
       } else {
         trailing_outliers = 1;
       }
     }
     last_pos = pos;
   }
   *trailing_pos = last_pos;
   if (num_normal >= 3) {  // throw out the worst as an outlier.
     num_normal--;
     normal_certainty_total -= worst_normal_certainty;
   }
   if (num_normal > 0) {
     *avg_certainty = normal_certainty_total / num_normal;
     *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
   }
   // Nothing to split off without a normal character to compare against, or
   // without an outlier run at either end.
   if (num_normal == 0 ||
       (leading_outliers == 0 && trailing_outliers == 0)) {
     return;
   }
 
   // Step two: Try to split off bits of the word that are both outliers
   //           and have much lower certainty than average
   // Calculate num_leading and leading_certainty.
   for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0;
        *num_rebuilt_leading < leading_outliers;
        (*num_rebuilt_leading)++) {
     float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
     // Stop at the first leading blob that is too certain to be suspicious.
     if (char_certainty > *unlikely_threshold) {
       break;
     }
     if (char_certainty < *leading_certainty) {
       *leading_certainty = char_certainty;
     }
   }
 
   // Calculate num_trailing and trailing_certainty.
   for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
        *num_rebuilt_trailing < trailing_outliers;
        (*num_rebuilt_trailing)++) {
     // Walk backwards from the last blob of the word.
     int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
     float char_certainty = word->best_choice->certainty(blob_idx);
     if (char_certainty > *unlikely_threshold) {
       break;
     }
     if (char_certainty < *trailing_certainty) {
       *trailing_certainty = char_certainty;
     }
   }
 }

◆ ImageHeight()

int tesseract::Tesseract::ImageHeight ( ) const

inline

Definition at line 230 of file tesseractclass.h.

                           {
     return pixGetHeight(pix_binary_);
   }

◆ ImageWidth()

int tesseract::Tesseract::ImageWidth ( ) const

inline

Definition at line 227 of file tesseractclass.h.

                          {
     return pixGetWidth(pix_binary_);
   }

◆ init_recog_training()

FILE * tesseract::Tesseract::init_recog_training ( const STRING & fname )

Definition at line 36 of file recogtraining.cpp.

                                                         {
   if (tessedit_ambigs_training) {
     tessedit_tess_adaption_mode.set_value(0);    // turn off adaption
     tessedit_enable_doc_dict.set_value(0);       // turn off document dictionary
     // Explore all segmentations.
     getDict().stopper_no_acceptable_choices.set_value(1);
   }
 
   STRING output_fname = fname;
   const char *lastdot = strrchr(output_fname.string(), '.');
   if (lastdot != NULL) output_fname[lastdot - output_fname.string()] = '\0';
   output_fname += ".txt";
   FILE *output_file = open_file(output_fname.string(), "a+");
   return output_file;
 }

◆ init_tesseract() [1/2]

int tesseract::Tesseract::init_tesseract	(	const char *	arg0,
		const char *	textbase,
		const char *	language,
		OcrEngineMode	oem,
		char **	configs,
		int	configs_size,
		const GenericVector< STRING > *	vars_vec,
		const GenericVector< STRING > *	vars_values,
		bool	set_only_init_params,
		TessdataManager *	mgr
	)

Definition at line 295 of file tessedit.cpp.

                                                     {
   GenericVector<STRING> langs_to_load;
   GenericVector<STRING> langs_not_to_load;
   ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
 
   sub_langs_.delete_data_pointers();
   sub_langs_.clear();
   // Find the first loadable lang and load into this.
   // Add any languages that this language requires
   bool loaded_primary = false;
   // Load the rest into sub_langs_.
   for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
     if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
       const char *lang_str = langs_to_load[lang_index].string();
       Tesseract *tess_to_init;
       if (!loaded_primary) {
         tess_to_init = this;
       } else {
         tess_to_init = new Tesseract;
       }
 
       int result = tess_to_init->init_tesseract_internal(
           arg0, textbase, lang_str, oem, configs, configs_size, vars_vec,
           vars_values, set_only_non_debug_params, mgr);
       // Forget that language, but keep any reader we were given.
       mgr->Clear();
 
       if (!loaded_primary) {
         if (result < 0) {
           tprintf("Failed loading language '%s'\n", lang_str);
         } else {
           ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
                               &langs_to_load, &langs_not_to_load);
           loaded_primary = true;
         }
       } else {
         if (result < 0) {
           tprintf("Failed loading language '%s'\n", lang_str);
           delete tess_to_init;
         } else {
           sub_langs_.push_back(tess_to_init);
           // Add any languages that this language requires
           ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
                               &langs_to_load, &langs_not_to_load);
         }
       }
     }
   }
   if (!loaded_primary) {
     tprintf("Tesseract couldn't load any languages!\n");
     return -1;  // Couldn't load any language!
   }
   if (!sub_langs_.empty()) {
     // In multilingual mode word ratings have to be directly comparable,
     // so use the same language model weights for all languages:
     // use the primary language's params model if
     // tessedit_use_primary_params_model is set,
     // otherwise use default language model weights.
     if (tessedit_use_primary_params_model) {
       for (int s = 0; s < sub_langs_.size(); ++s) {
         sub_langs_[s]->language_model_->getParamsModel().Copy(
             this->language_model_->getParamsModel());
       }
       tprintf("Using params model of the primary language\n");
     } else {
       this->language_model_->getParamsModel().Clear();
       for (int s = 0; s < sub_langs_.size(); ++s) {
         sub_langs_[s]->language_model_->getParamsModel().Clear();
       }
     }
   }
 
   SetupUniversalFontIds();
   return 0;
 }

◆ init_tesseract() [2/2]

int tesseract::Tesseract::init_tesseract	(	const char *	datapath,
		const char *	language,
		OcrEngineMode	oem
	)

inline

Definition at line 504 of file tesseractclass.h.

                                         {
     TessdataManager mgr;
     return init_tesseract(datapath, NULL, language, oem, NULL, 0, NULL, NULL,
                           false, &mgr);
   }

◆ init_tesseract_internal()

int tesseract::Tesseract::init_tesseract_internal	(	const char *	arg0,
		const char *	textbase,
		const char *	language,
		OcrEngineMode	oem,
		char **	configs,
		int	configs_size,
		const GenericVector< STRING > *	vars_vec,
		const GenericVector< STRING > *	vars_values,
		bool	set_only_init_params,
		TessdataManager *	mgr
	)

Definition at line 393 of file tessedit.cpp.

                                                              {
   if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
                                 configs_size, vars_vec, vars_values,
                                 set_only_non_debug_params, mgr)) {
     return -1;
   }
   if (tessedit_init_config_only) {
     return 0;
   }
   // If only LSTM will be used, skip loading Tesseract classifier's
   // pre-trained templates and dictionary.
   bool init_tesseract = tessedit_ocr_engine_mode != OEM_LSTM_ONLY;
   program_editup(textbase, init_tesseract ? mgr : nullptr,
                  init_tesseract ? mgr : nullptr);
   return 0;                      //Normal exit
 }

◆ init_tesseract_lang_data()

bool tesseract::Tesseract::init_tesseract_lang_data	(	const char *	arg0,
		const char *	textbase,
		const char *	language,
		OcrEngineMode	oem,
		char **	configs,
		int	configs_size,
		const GenericVector< STRING > *	vars_vec,
		const GenericVector< STRING > *	vars_values,
		bool	set_only_init_params,
		TessdataManager *	mgr
	)

Definition at line 91 of file tessedit.cpp.

                           {
   // Loads the per-language data reachable through mgr into this instance:
   // parameters (embedded lang config, then the given config files, then
   // vars_vec/vars_values), and, unless tessedit_init_config_only is set, the
   // LSTM recognizer, the unicharset, the ambiguity tables and the params
   // model.
   // Returns false when the traineddata file cannot be opened, the unicharset
   // cannot be loaded or exceeds MAX_NUM_CLASSES, or the params model fails to
   // load; returns true otherwise. Calls exit(1) if a parameter named in
   // vars_vec cannot be set.
   // Set the basename, compute the data directory.
   main_setup(arg0, textbase);
 
   // Set the language data path prefix
   lang = language != NULL ? language : "eng";
   language_data_path_prefix = datadir;
   language_data_path_prefix += lang;
   language_data_path_prefix += ".";
 
   // Initialize TessdataManager.
   // A manager that the caller has already loaded is used as-is.
   STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
   if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string())) {
     // Try without tessdata.
     // Clearing m_data_sub_dir and re-running main_setup() recomputes datadir,
     // from which the path prefix is rebuilt before the second attempt.
     m_data_sub_dir.set_value("");
     main_setup(arg0, textbase);
     language_data_path_prefix = datadir;
     language_data_path_prefix += lang;
     language_data_path_prefix += ".";
     tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
     if (!mgr->Init(tessdata_path.string())) {
       tprintf("Error opening data file %s\n", tessdata_path.string());
       tprintf(
           "Please make sure the TESSDATA_PREFIX environment variable is set"
           " to your \"tessdata\" directory.\n");
       return false;
     }
   }
   if (oem == OEM_DEFAULT) {
     // Set the engine mode from availability, which can then be overridden by
     // the config file when we read it below.
     if (!mgr->IsLSTMAvailable()) {
       tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
     } else if (!mgr->IsBaseAvailable()) {
       tessedit_ocr_engine_mode.set_value(OEM_LSTM_ONLY);
     } else {
       tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_LSTM_COMBINED);
     }
   }
 
   // If a language specific config file (lang.config) exists, load it in.
   // fp is reused below for every other component read from mgr.
   TFile fp;
   if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
     ParamUtils::ReadParamsFromFp(SET_PARAM_CONSTRAINT_NONE, &fp,
                                  this->params());
   }
 
   SetParamConstraint set_params_constraint = set_only_non_debug_params ?
       SET_PARAM_CONSTRAINT_NON_DEBUG_ONLY : SET_PARAM_CONSTRAINT_NONE;
   // Load tesseract variables from config files. This is done after loading
   // language-specific variables from [lang].traineddata file, so that custom
   // config files can override values in [lang].traineddata file.
   for (int i = 0; i < configs_size; ++i) {
     read_config_file(configs[i], set_params_constraint);
   }
 
   // Set params specified in vars_vec (done after setting params from config
   // files, so that params in vars_vec can override those from files).
   // vars_values is indexed in step with vars_vec.
   // NOTE(review): nothing here checks that both vectors have the same size.
   if (vars_vec != NULL && vars_values != NULL) {
     for (int i = 0; i < vars_vec->size(); ++i) {
       if (!ParamUtils::SetParam((*vars_vec)[i].string(),
                                 (*vars_values)[i].string(),
                                 set_params_constraint, this->params())) {
         tprintf("Error setting param %s\n", (*vars_vec)[i].string());
         exit(1);
       }
     }
   }
 
   // Optionally dump all current parameter values to the file named by
   // tessedit_write_params_to_file.
   if (((STRING &)tessedit_write_params_to_file).length() > 0) {
     FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
     if (params_file != NULL) {
       ParamUtils::PrintParams(params_file, this->params());
       fclose(params_file);
     } else {
       tprintf("Failed to open %s for writing params.\n",
               tessedit_write_params_to_file.string());
     }
   }
 
   // Determine which ocr engine(s) should be loaded and used for recognition.
   // An explicit oem argument takes precedence over anything set above.
   if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
 
   // If we are only loading the config file (and so not planning on doing any
   // recognition) then there's nothing else to do here.
   if (tessedit_init_config_only) {
     return true;
   }
 
 // The various OcrEngineMode settings (see publictypes.h) determine which
 // engine-specific data files need to be loaded.
 // If LSTM_ONLY is requested, the base Tesseract files are *Not* required.
 #ifndef ANDROID_BUILD
   if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY ||
       tessedit_ocr_engine_mode == OEM_TESSERACT_LSTM_COMBINED) {
     if (mgr->GetComponent(TESSDATA_LSTM, &fp)) {
       lstm_recognizer_ = new LSTMRecognizer;
       ASSERT_HOST(lstm_recognizer_->DeSerialize(&fp));
       if (lstm_use_matrix) lstm_recognizer_->LoadDictionary(language, mgr);
     } else {
       // Fall back to the base engine when the LSTM component is missing.
       tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
       tessedit_ocr_engine_mode.set_value(OEM_TESSERACT_ONLY);
     }
   }
 #endif
 
   // Load the unicharset
   // NOTE(review): with ANDROID_BUILD defined and OEM_LSTM_ONLY selected,
   // neither branch below loads a unicharset - confirm this is intended.
   if (tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
     // Avoid requiring a unicharset when we aren't running base tesseract.
 #ifndef ANDROID_BUILD
     unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
 #endif
   } else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) ||
              !unicharset.load_from_file(&fp, false)) {
     return false;
   }
   if (unicharset.size() > MAX_NUM_CLASSES) {
     tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
     return false;
   }
   right_to_left_ = unicharset.major_right_to_left();
 
   // Setup initial unichar ambigs table and read universal ambigs.
   // encoder_unicharset is a copy taken before the ambigs loaders are given
   // the chance to modify unicharset through the pointer they receive.
   UNICHARSET encoder_unicharset;
   encoder_unicharset.CopyFrom(unicharset);
   unichar_ambigs.InitUnicharAmbigs(unicharset, use_ambigs_for_adaption);
   unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
 
   // Language-specific ambigs are skipped while training ambigs.
   if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
     unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp,
                                      ambigs_debug_level,
                                      use_ambigs_for_adaption, &unicharset);
   }
   // Init ParamsModel.
   // Load pass1 and pass2 weights (for now these two sets are the same, but in
   // the future separate sets of weights can be generated).
   for (int p = ParamsModel::PTRAIN_PASS1;
       p < ParamsModel::PTRAIN_NUM_PASSES; ++p) {
     language_model_->getParamsModel().SetPass(
         static_cast<ParamsModel::PassEnum>(p));
     // A missing params model component is not an error; a present but
     // unreadable one is.
     if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
       if (!language_model_->getParamsModel().LoadFromFp(lang.string(), &fp)) {
         return false;
       }
     }
   }
 
   return true;
 }

◆ init_tesseract_lm()

int tesseract::Tesseract::init_tesseract_lm	(	const char *	arg0,
		const char *	textbase,
		const char *	language,
		TessdataManager *	mgr
	)

Definition at line 457 of file tessedit.cpp.

                                                                              {
   if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
                                 NULL, 0, NULL, NULL, false, mgr))
     return -1;
   getDict().SetupForLoad(Dict::GlobalDawgCache());
   getDict().Load(lang, mgr);
   getDict().FinishLoad();
   return 0;
 }

◆ join_words()

void tesseract::Tesseract::join_words	(	WERD_RES *	word,
		WERD_RES *	word2,
		BlamerBundle *	orig_bb
	)		const

Definition at line 240 of file tfacepp.cpp.

                                                         {
   // Appends word2 onto the end of word: blobs, seams, blob widths/gaps, the
   // ratings matrix, best_state, raw_choice and the best_choices list are all
   // combined into word, and word2 is deleted before returning.
   // If orig_bb is non-NULL, the blame information of both words is joined
   // into it and it replaces word->blamer_bundle (the old bundle is deleted).
   // The boxes are captured first, before the blob lists are merged below.
   TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
   TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
   // Tack the word2 outputs onto the end of the word outputs.
   word->chopped_word->blobs += word2->chopped_word->blobs;
   word->rebuild_word->blobs += word2->rebuild_word->blobs;
   word2->chopped_word->blobs.clear();
   word2->rebuild_word->blobs.clear();
   // The joining seam is placed midway between the last blob of word and the
   // first blob of word2 in x, at the mean of the four box edges in y.
   TPOINT split_pt;
   split_pt.x = (prev_box.right() + blob_box.left()) / 2;
   split_pt.y = (prev_box.top() + prev_box.bottom() +
                 blob_box.top() + blob_box.bottom()) / 4;
   // Move the word2 seams onto the end of the word1 seam_array.
   // Since the seam list is one element short, an empty seam marking the
   // end of the last blob in the first word is needed first.
   word->seam_array.push_back(new SEAM(0.0f, split_pt));
   word->seam_array += word2->seam_array;
   word2->seam_array.truncate(0);
   // Fix widths and gaps.
   word->blob_widths += word2->blob_widths;
   word->blob_gaps += word2->blob_gaps;
   // Fix the ratings matrix.
   // The assertion checks that the joined matrix dimension is the sum of the
   // two original dimensions.
   int rat1 = word->ratings->dimension();
   int rat2 = word2->ratings->dimension();
   word->ratings->AttachOnCorner(word2->ratings);
   ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
   word->best_state += word2->best_state;
   // Append the word choices.
   *word->raw_choice += *word2->raw_choice;
 
   // How many alt choices from each should we try to get?
   const int kAltsPerPiece = 2;
   // When do we start throwing away extra alt choices?
   const int kTooManyAltChoices = 100;
 
   // Construct the cartesian product of the best_choices of word(1) and word2.
   WERD_CHOICE_LIST joined_choices;
   WERD_CHOICE_IT jc_it(&joined_choices);
   WERD_CHOICE_IT bc1_it(&word->best_choices);
   WERD_CHOICE_IT bc2_it(&word2->best_choices);
   int num_word1_choices = word->best_choices.length();
   int total_joined_choices = num_word1_choices;
   // Nota Bene: For the main loop here, we operate only on the 2nd and greater
   // word2 choices, and put them in the joined_choices list. The 1st word2
   // choice gets added to the original word1 choices in-place after we have
   // finished with them.
   // The outer loop stops once bc2_it is back at the first word2 choice, or
   // early once the total has reached kTooManyAltChoices and more than
   // kAltsPerPiece word2 alternates have been used.
   int bc2_index = 1;
   for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
     if (total_joined_choices >= kTooManyAltChoices &&
         bc2_index > kAltsPerPiece)
       break;
     int bc1_index = 0;
     for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
         ++bc1_index, bc1_it.forward()) {
       if (total_joined_choices >= kTooManyAltChoices &&
           bc1_index > kAltsPerPiece)
         break;
       // Each joined choice is a fresh copy of a word1 choice with the current
       // word2 choice appended.
       WERD_CHOICE *wc = new WERD_CHOICE(*bc1_it.data());
       *wc += *bc2_it.data();
       jc_it.add_after_then_move(wc);
       ++total_joined_choices;
     }
   }
   // Now that we've filled in as many alternates as we want, paste the best
   // choice for word2 onto the original word alt_choices.
   bc1_it.move_to_first();
   bc2_it.move_to_first();
   for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
     *bc1_it.data() += *bc2_it.data();
   }
   // The newly built alternates go after the original (now extended) choices.
   bc1_it.move_to_last();
   bc1_it.add_list_after(&joined_choices);
 
   // Restore the pointer to original blamer bundle and combine blamer
   // information recorded in the splits.
   if (orig_bb != NULL) {
     orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle,
                         wordrec_debug_blamer);
     delete word->blamer_bundle;
     word->blamer_bundle = orig_bb;
   }
   // Rebuild the box word for the joined result and size the reject map to it.
   word->SetupBoxWord();
   word->reject_map.initialise(word->box_word->length());
   delete word2;
 }

◆ LSTMRecognizeWord()

void tesseract::Tesseract::LSTMRecognizeWord	(	const BLOCK &	block,
		ROW *	row,
		WERD_RES *	word,
		PointerVector< WERD_RES > *	words
	)

Definition at line 224 of file linerec.cpp.

                                                                   {
   TBOX word_box = word->word->bounding_box();
   // Get the word image - no frills.
   if (tessedit_pageseg_mode == PSM_SINGLE_WORD ||
       tessedit_pageseg_mode == PSM_RAW_LINE) {
     // In single word mode, use the whole image without any other row/word
     // interpretation.
     word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
   } else {
     float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
     if (baseline + row->descenders() < word_box.bottom())
       word_box.set_bottom(baseline + row->descenders());
     if (baseline + row->x_height() + row->ascenders() > word_box.top())
       word_box.set_top(baseline + row->x_height() + row->ascenders());
   }
   ImageData* im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
   if (im_data == NULL) return;
   lstm_recognizer_->RecognizeLine(*im_data, true, classify_debug_level > 0,
                                   kWorstDictCertainty / kCertaintyScale,
                                   lstm_use_matrix, &unicharset, word_box, 2.0,
                                   false, words);
   delete im_data;
   SearchWords(words);
 }

◆ make_reject_map()

void tesseract::Tesseract::make_reject_map	(	WERD_RES *	word,
		ROW *	row,
		inT16	pass
	)

◆ match_current_words()

void tesseract::Tesseract::match_current_words	(	WERD_RES_LIST &	words,
		ROW *	row,
		BLOCK *	block
	)

Definition at line 196 of file fixspace.cpp.

                                                   {
   WERD_RES_IT word_it(&words);
   WERD_RES *word;
   // Since we are not using PAGE_RES to iterate over words, we need to update
   // prev_word_best_choice_ before calling classify_word_pass2().
   prev_word_best_choice_ = NULL;
   for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
     word = word_it.data();
     if ((!word->part_of_combo) && (word->box_word == NULL)) {
       WordData word_data(block, row, word);
       SetupWordPassN(2, &word_data);
       classify_word_and_language(2, NULL, &word_data);
     }
     prev_word_best_choice_ = word->best_choice;
   }
 }

◆ match_word_pass_n()

void tesseract::Tesseract::match_word_pass_n	(	int	pass_n,
		WERD_RES *	word,
		ROW *	row,
		BLOCK *	block
	)

match_word_pass_n

Baseline normalize the word and pass it to Tess.

Definition at line 1576 of file control.cpp.

                                                           {
   if (word->tess_failed) return;
   tess_segment_pass_n(pass_n, word);
 
   if (!word->tess_failed) {
     if (!word->word->flag (W_REP_CHAR)) {
        word->fix_quotes();
       if (tessedit_fix_hyphens)
         word->fix_hyphens();
       /* Don't trust fix_quotes! - though I think I've fixed the bug */
       if (word->best_choice->length() != word->box_word->length()) {
         tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
                 " #Blobs=%d\n",
                 word->best_choice->debug_string().string(),
                 word->best_choice->length(),
                 word->box_word->length());
 
       }
       word->tess_accepted = tess_acceptable_word(word);
 
       // Also sets word->done flag
       make_reject_map(word, row, pass_n);
     }
   }
   set_word_fonts(word);
 
   ASSERT_HOST(word->raw_choice != NULL);
 }

◆ MaximallyChopWord()

void tesseract::Tesseract::MaximallyChopWord	(	const GenericVector< TBOX > &	boxes,
		BLOCK *	block,
		ROW *	row,
		WERD_RES *	word_res
	)

Tests the chopper by exhaustively running chop_one_blob. The word_res will contain filled chopped_word, seam_array, denorm, box_word and best_state for the maximally chopped word.

Definition at line 253 of file applybox.cpp.

                                                       {
   if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
                                      tessedit_ocr_engine_mode, NULL,
                                      classify_bln_numeric_mode,
                                      textord_use_cjk_fp_model,
                                      poly_allow_detailed_fx,
                                      row, block)) {
     word_res->CloneChoppedToRebuild();
     return;
   }
   if (chop_debug) {
     tprintf("Maximally chopping word at:");
     word_res->word->bounding_box().print();
   }
   GenericVector<BLOB_CHOICE*> blob_choices;
   ASSERT_HOST(!word_res->chopped_word->blobs.empty());
   float rating = static_cast<float>(MAX_INT8);
   for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
     // The rating and certainty are not quite arbitrary. Since
     // select_blob_to_chop uses the worst certainty to choose, they all have
     // to be different, so starting with MAX_INT8, subtract 1/8 for each blob
     // in here, and then divide by e each time they are chopped, which
     // should guarantee a set of unequal values for the whole tree of blobs
     // produced, however much chopping is required. The chops are thus only
     // limited by the ability of the chopper to find suitable chop points,
     // and not by the value of the certainties.
     BLOB_CHOICE* choice =
         new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
     blob_choices.push_back(choice);
     rating -= 0.125f;
   }
   const double e = exp(1.0);  // The base of natural logs.
   int blob_number;
   int right_chop_index = 0;
   if (!assume_fixed_pitch_char_segment) {
     // We only chop if the language is not fixed pitch like CJK.
     SEAM* seam = NULL;
     while ((seam = chop_one_blob(boxes, blob_choices, word_res,
                                  &blob_number)) != NULL) {
       word_res->InsertSeam(blob_number, seam);
       BLOB_CHOICE* left_choice = blob_choices[blob_number];
       rating = left_choice->rating() / e;
       left_choice->set_rating(rating);
       left_choice->set_certainty(-rating);
       // combine confidence w/ serial #
       BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index,
                                                   rating - 0.125f, -rating, -1,
                                                   0.0f, 0.0f, 0.0f, BCC_FAKE);
       blob_choices.insert(right_choice, blob_number + 1);
     }
   }
   word_res->CloneChoppedToRebuild();
   word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
 }

◆ mutable_pix_binary()

Pix** tesseract::Tesseract::mutable_pix_binary ( )

inline

Definition at line 185 of file tesseractclass.h.

                              {
     pixDestroy(&pix_binary_);
     return &pix_binary_;
   }

◆ mutable_textord()

Textord* tesseract::Tesseract::mutable_textord ( )

inline

Definition at line 246 of file tesseractclass.h.

                              {
     return &textord_;
   }

◆ nn_match_word()

void tesseract::Tesseract::nn_match_word	(	WERD_RES *	word,
		ROW *	row
	)

◆ nn_recover_rejects()

void tesseract::Tesseract::nn_recover_rejects	(	WERD_RES *	word,
		ROW *	row
	)

◆ noise_outlines()

BOOL8 tesseract::Tesseract::noise_outlines ( TWERD * word )

Definition at line 982 of file docqual.cpp.

                                            {
   TBOX box;                       // BB of outline
   inT16 outline_count = 0;
   inT16 small_outline_count = 0;
   inT16 max_dimension;
   float small_limit = kBlnXHeight * crunch_small_outlines_size;
 
   for (int b = 0; b < word->NumBlobs(); ++b) {
     TBLOB* blob = word->blobs[b];
     for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) {
       outline_count++;
       box = ol->bounding_box();
       if (box.height() > box.width())
         max_dimension = box.height();
       else
         max_dimension = box.width();
       if (max_dimension < small_limit)
         small_outline_count++;
     }
   }
   return small_outline_count >= outline_count;
 }

◆ non_0_digit()

BOOL8 tesseract::Tesseract::non_0_digit	(	const UNICHARSET &	ch_set,
		UNICHAR_ID	unichar_id
	)

Definition at line 789 of file reject.cpp.

                                                                             {
   return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
 }

◆ non_O_upper()

BOOL8 tesseract::Tesseract::non_O_upper	(	const UNICHARSET &	ch_set,
		UNICHAR_ID	unichar_id
	)

Definition at line 785 of file reject.cpp.

                                                                             {
   return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
 }

◆ num_sub_langs()

int tesseract::Tesseract::num_sub_langs ( ) const

inline

Definition at line 253 of file tesseractclass.h.

                             {
     return sub_langs_.size();
   }

◆ one_ell_conflict()

BOOL8 tesseract::Tesseract::one_ell_conflict	(	WERD_RES *	word_res,
		BOOL8	update_map
	)

Definition at line 292 of file reject.cpp.

                                                                       {
   // Decides whether the best choice of word_res has an unresolved conflict
   // among the characters listed in conflict_set_I_l_1.
   // Returns TRUE when a conflict is found and FALSE otherwise. When
   // update_map is set, the conflicting positions are also flagged in
   // word_res->reject_map, either directly or through reject_I_1_L().
   // Several checks below temporarily overwrite one character of the best
   // choice's unichar string to test the alternative spelling.
   const char *word;
   const char *lengths;
   inT16 word_len;                //its length
   inT16 first_alphanum_index_;
   inT16 first_alphanum_offset_;
   inT16 i;
   inT16 offset;
   BOOL8 non_conflict_set_char;   //non conf set a/n?
   BOOL8 conflict = FALSE;
   BOOL8 allow_1s;
   ACCEPTABLE_WERD_TYPE word_type;
   BOOL8 dict_perm_type;
   BOOL8 dict_word_ok;
   int dict_word_type;
 
   // word is the byte string of the best choice; lengths holds one entry per
   // unichar, used below as the step from one unichar to the next in word.
   word = word_res->best_choice->unichar_string().string ();
   lengths = word_res->best_choice->unichar_lengths().string();
   word_len = strlen (lengths);
   /*
     If there are no occurrences of the conflict set characters then the word
     is OK.
   */
   if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
     return FALSE;
 
   /*
     There is a conflict if there are NO other (confirmed) alphanumerics apart
     from those in the conflict set.
   */
 
   // The loop stops at the first alphanumeric that is outside the conflict
   // set.
   for (i = 0, offset = 0, non_conflict_set_char = FALSE;
        (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
     non_conflict_set_char =
         (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
             word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
         !STRING (conflict_set_I_l_1).contains (word[offset]);
   if (!non_conflict_set_char) {
     if (update_map)
       reject_I_1_L(word_res);
     return TRUE;
   }
 
   /*
     If the word is accepted by a dawg permuter, and the first alpha character
     is "I" or "l", check to see if the alternative is also a dawg word. If it
     is, then there is a potential error otherwise the word is ok.
   */
 
   dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
     (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
     (rej_trust_doc_dawg &&
     (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
     (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
   dict_word_type = dict_word(*(word_res->best_choice));
   dict_word_ok = (dict_word_type > 0) &&
     (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
 
   if ((rej_1Il_use_dict_word && dict_word_ok) ||
     (rej_1Il_trust_permuter_type && dict_perm_type) ||
   (dict_perm_type && dict_word_ok)) {
     first_alphanum_index_ = first_alphanum_index (word, lengths);
     first_alphanum_offset_ = first_alphanum_offset (word, lengths);
     // Leading 'I': try 'l' instead. The original character is put back on
     // both outcomes.
     if (lengths[first_alphanum_index_] == 1 &&
         word[first_alphanum_offset_] == 'I') {
       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
       if (safe_dict_word(word_res) > 0) {
         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
         if (update_map)
           word_res->reject_map[first_alphanum_index_].
             setrej_1Il_conflict();
         return TRUE;
       }
       else {
         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
         return FALSE;
       }
     }
 
     // Leading 'l': try 'I' instead. The original character is put back on
     // both outcomes.
     if (lengths[first_alphanum_index_] == 1 &&
         word[first_alphanum_offset_] == 'l') {
       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
       if (safe_dict_word(word_res) > 0) {
         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
         if (update_map)
           word_res->reject_map[first_alphanum_index_].
             setrej_1Il_conflict();
         return TRUE;
       }
       else {
         word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
         return FALSE;
       }
     }
     return FALSE;
   }
 
   /*
     NEW 1Il code. The old code relied on permuter types too much. In fact,
     tess will use TOP_CHOICE permute for good things like "palette".
     In this code the string is examined independently to see if it looks like
     a well formed word.
   */
 
   /*
     REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
     dictionary word.
   */
   // NOTE(review): unlike the branch above, the two "return FALSE" paths below
   // leave the flipped character in place; only the failing case restores it.
   // Confirm whether that is intentional.
   first_alphanum_index_ = first_alphanum_index (word, lengths);
   first_alphanum_offset_ = first_alphanum_offset (word, lengths);
   if (lengths[first_alphanum_index_] == 1 &&
       word[first_alphanum_offset_] == 'l') {
     word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
     if (safe_dict_word(word_res) > 0)
       return FALSE;
     else
       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
   }
   else if (lengths[first_alphanum_index_] == 1 &&
            word[first_alphanum_offset_] == 'I') {
     word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
     if (safe_dict_word(word_res) > 0)
       return FALSE;
     else
       word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
   }
   /*
     For strings containing digits:
       If there are no alphas OR the numeric permuter liked the word,
         reject any non 1 conflict chs
       Else reject all conflict chs
   */
   if (word_contains_non_1_digit (word, lengths)) {
     allow_1s = (alpha_count (word, lengths) == 0) ||
       (word_res->best_choice->permuter () == NUMBER_PERM);
 
     // NOTE(review): this declaration shadows the function-level offset.
     inT16 offset;
     conflict = FALSE;
     for (i = 0, offset = 0; word[offset] != '\0';
          offset += word_res->best_choice->unichar_lengths()[i++]) {
       if ((!allow_1s || (word[offset] != '1')) &&
       STRING (conflict_set_I_l_1).contains (word[offset])) {
         if (update_map)
           word_res->reject_map[i].setrej_1Il_conflict ();
         conflict = TRUE;
       }
     }
     return conflict;
   }
   /*
     For anything else. See if it conforms to an acceptable word type. If so,
     treat accordingly.
   */
   word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
   if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
     // Only the first alphanumeric is checked for these two word types.
     first_alphanum_index_ = first_alphanum_index (word, lengths);
     first_alphanum_offset_ = first_alphanum_offset (word, lengths);
     if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
       if (update_map)
         word_res->reject_map[first_alphanum_index_].
             setrej_1Il_conflict ();
       return TRUE;
     }
     else
       return FALSE;
   }
   else if (word_type == AC_UPPER_CASE) {
     return FALSE;
   }
   else {
     // Any other word type: every conflict character is rejected.
     if (update_map)
       reject_I_1_L(word_res);
     return TRUE;
   }
 }

◆ output_pass()

void tesseract::Tesseract::output_pass	(	PAGE_RES_IT &	page_res_it,
		const TBOX *	target_word_box
	)

Definition at line 68 of file output.cpp.

                                                          {
   BLOCK_RES *block_of_last_word;
   BOOL8 force_eol;               //During output
   BLOCK *nextblock;              //block of next word
   WERD *nextword;                //next word
 
   page_res_it.restart_page ();
   block_of_last_word = NULL;
   while (page_res_it.word () != NULL) {
     check_debug_pt (page_res_it.word (), 120);
 
     if (target_word_box) {
       TBOX current_word_box = page_res_it.word()->word->bounding_box();
       FCOORD center_pt(
           (current_word_box.right() + current_word_box.left()) / 2,
           (current_word_box.bottom() + current_word_box.top()) / 2);
       if (!target_word_box->contains(center_pt)) {
         page_res_it.forward();
         continue;
       }
     }
     if (tessedit_write_block_separators &&
     block_of_last_word != page_res_it.block ()) {
       block_of_last_word = page_res_it.block ();
     }
 
     force_eol = (tessedit_write_block_separators &&
       (page_res_it.block () != page_res_it.next_block ())) ||
       (page_res_it.next_word () == NULL);
 
     if (page_res_it.next_word () != NULL)
       nextword = page_res_it.next_word ()->word;
     else
       nextword = NULL;
     if (page_res_it.next_block () != NULL)
       nextblock = page_res_it.next_block ()->block;
     else
       nextblock = NULL;
                                  //regardless of tilde crunching
     write_results(page_res_it,
                   determine_newline_type(page_res_it.word()->word,
                                          page_res_it.block()->block,
                                          nextword, nextblock), force_eol);
     page_res_it.forward();
   }
 }

◆ ParseLanguageString()

void tesseract::Tesseract::ParseLanguageString	(	const char *	lang_str,
		GenericVector< STRING > *	to_load,
		GenericVector< STRING > *	not_to_load
	)

Definition at line 261 of file tessedit.cpp.

                                                                         {
   STRING remains(lang_str);
   while (remains.length() > 0) {
     // Find the start of the lang code and which vector to add to.
     const char* start = remains.string();
     while (*start == '+')
       ++start;
     GenericVector<STRING>* target = to_load;
     if (*start == '~') {
       target = not_to_load;
       ++start;
     }
     // Find the index of the end of the lang code in string start.
     int end = strlen(start);
     const char* plus = strchr(start, '+');
     if (plus != NULL && plus - start < end)
       end = plus - start;
     STRING lang_code(start);
     lang_code.truncate_at(end);
     STRING next(start + end);
     remains = next;
     // Check whether lang_code is already in the target vector and add.
     if (!IsStrInList(lang_code, *target)) {
       target->push_back(lang_code);
     }
   }
 }

◆ pgeditor_main()

void tesseract::Tesseract::pgeditor_main	(	int	width,
		int	height,
		PAGE_RES *	page_res
	)

pgeditor_main()

Top-level editor operation: set up a new window and a corresponding event handler.

Definition at line 337 of file pgedit.cpp.

                                                                        {
   // Entry point of the interactive page editor: remembers the page results,
   // builds the image window and its menu, then waits on the window until an
   // SVET_DESTROY event is received.
   //   width, height - passed straight to build_image_window().
   //   page_res      - page results to display; stored in current_page_res.
   current_page_res = page_res;
   // Nothing to show for a page without blocks.
   if (current_page_res->block_res_list.empty())
     return;
 
   // Recognition has not yet been run for this page (see
   // process_cmd_win_event(), which runs it on demand).
   recog_done = false;
   stillRunning = true;
 
   build_image_window(width, height);
   // Start with the edge-step display flag enabled.
   word_display_mode.turn_on_bit(DF_EDGE_STEP);
   do_re_display(&tesseract::Tesseract::word_set_display);
 #ifndef GRAPHICS_DISABLED
   pe = new ParamsEditor(this, image_win);
 #endif
   // The handler lives on the stack, so it is detached from the window
   // (AddEventHandler(NULL) below) before this function returns.
   PGEventHandler pgEventHandler(this);
 
   image_win->AddEventHandler(&pgEventHandler);
   image_win->AddMessageBox();
 
   SVMenuNode* svMenuRoot = build_menu_new();
 
   svMenuRoot->BuildMenu(image_win);
   image_win->SetVisible(true);
 
   image_win->AwaitEvent(SVET_DESTROY);
   image_win->AddEventHandler(NULL);
 }

◆ pix_binary()

Pix* tesseract::Tesseract::pix_binary ( ) const

inline

Definition at line 189 of file tesseractclass.h.

                           {
     // Returns the stored pix_binary_ pointer unchanged (no copy is made here).
     return pix_binary_;
   }

◆ pix_grey()

Pix* tesseract::Tesseract::pix_grey ( ) const

inline

Definition at line 192 of file tesseractclass.h.

                         {
     // Returns the stored pix_grey_ pointer unchanged (no copy is made here).
     return pix_grey_;
   }

◆ pix_original()

Pix* tesseract::Tesseract::pix_original ( ) const

inline

Definition at line 199 of file tesseractclass.h.

199 { return pix_original_; }

◆ potential_word_crunch()

BOOL8 tesseract::Tesseract::potential_word_crunch	(	WERD_RES *	word,
		GARBAGE_LEVEL	garbage_level,
		BOOL8	ok_dict_word
	)

Definition at line 546 of file docqual.cpp.

                                                            {
   float rating_per_ch;
   int adjusted_len;
   const char *str = word->best_choice->unichar_string().string();
   const char *lengths = word->best_choice->unichar_lengths().string();
   BOOL8 word_crunchable;
   int poor_indicator_count = 0;
 
   word_crunchable = !crunch_leave_accept_strings ||
                     word->reject_map.length() < 3 ||
                     (acceptable_word_string(*word->uch_set,
                                             str, lengths) == AC_UNACCEPTABLE &&
                      !ok_dict_word);
 
   adjusted_len = word->reject_map.length();
   if (adjusted_len > 10)
     adjusted_len = 10;
   rating_per_ch = word->best_choice->rating() / adjusted_len;
 
   if (rating_per_ch > crunch_pot_poor_rate) {
     if (crunch_debug > 2) {
       tprintf("Potential poor rating on \"%s\"\n",
               word->best_choice->unichar_string().string());
     }
     poor_indicator_count++;
   }
 
   if (word_crunchable &&
       word->best_choice->certainty() < crunch_pot_poor_cert) {
     if (crunch_debug > 2) {
       tprintf("Potential poor cert on \"%s\"\n",
               word->best_choice->unichar_string().string());
     }
     poor_indicator_count++;
   }
 
   if (garbage_level != G_OK) {
     if (crunch_debug > 2) {
       tprintf("Potential garbage on \"%s\"\n",
               word->best_choice->unichar_string().string());
     }
     poor_indicator_count++;
   }
   return poor_indicator_count >= crunch_pot_indicators;
 }

◆ PreenXHeights()

void tesseract::Tesseract::PreenXHeights ( BLOCK_LIST * block_list )

Any row xheight that is significantly different from the median is set to the median.

Definition at line 193 of file applybox.cpp.

                                                     {
   // Replaces the x-height of every row that deviates from the median
   // x-height of block_list by more than
   // kMaxXHeightDeviationFraction * median with the median itself.
   double median_xheight = MedianXHeight(block_list);
   double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
   // Visit every row of every block and clamp outlying x-heights.
   BLOCK_IT b_it(block_list);
   for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
     BLOCK* block = b_it.data();
     ROW_IT r_it(block->row_list());
     for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
       ROW* row = r_it.data();
       // Absolute distance of this row's x-height from the median.
       float diff = fabs(row->x_height() - median_xheight);
       if (diff > max_deviation) {
         if (applybox_debug) {
           tprintf("row xheight=%g, but median xheight = %g\n",
                   row->x_height(), median_xheight);
         }
         row->set_x_height(static_cast<float>(median_xheight));
       }
     }
   }
 }

◆ PrepareForPageseg()

void tesseract::Tesseract::PrepareForPageseg ( )

Definition at line 688 of file tesseractclass.cpp.

                                   {
   textord_.set_use_cjk_fp_model(textord_use_cjk_fp_model);
   // Find the max splitter strategy over all langs.
   ShiroRekhaSplitter::SplitStrategy max_pageseg_strategy =
       static_cast<ShiroRekhaSplitter::SplitStrategy>(
       static_cast<inT32>(pageseg_devanagari_split_strategy));
   for (int i = 0; i < sub_langs_.size(); ++i) {
     ShiroRekhaSplitter::SplitStrategy pageseg_strategy =
         static_cast<ShiroRekhaSplitter::SplitStrategy>(
         static_cast<inT32>(sub_langs_[i]->pageseg_devanagari_split_strategy));
     if (pageseg_strategy > max_pageseg_strategy)
       max_pageseg_strategy = pageseg_strategy;
     pixDestroy(&sub_langs_[i]->pix_binary_);
     sub_langs_[i]->pix_binary_ = pixClone(pix_binary());
   }
   // Perform shiro-rekha (top-line) splitting and replace the current image by
   // the newly splitted image.
   splitter_.set_orig_pix(pix_binary());
   splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
   if (splitter_.Split(true, &pixa_debug_)) {
     ASSERT_HOST(splitter_.splitted_image());
     pixDestroy(&pix_binary_);
     pix_binary_ = pixClone(splitter_.splitted_image());
   }
 }

◆ PrepareForTessOCR()

void tesseract::Tesseract::PrepareForTessOCR	(	BLOCK_LIST *	block_list,
		Tesseract *	osd_tess,
		OSResults *	osr
	)

Definition at line 719 of file tesseractclass.cpp.

                                                                        {
   // Find the max splitter strategy over all langs.
   ShiroRekhaSplitter::SplitStrategy max_ocr_strategy =
       static_cast<ShiroRekhaSplitter::SplitStrategy>(
       static_cast<inT32>(ocr_devanagari_split_strategy));
   for (int i = 0; i < sub_langs_.size(); ++i) {
     ShiroRekhaSplitter::SplitStrategy ocr_strategy =
         static_cast<ShiroRekhaSplitter::SplitStrategy>(
         static_cast<inT32>(sub_langs_[i]->ocr_devanagari_split_strategy));
     if (ocr_strategy > max_ocr_strategy)
       max_ocr_strategy = ocr_strategy;
   }
   // Utilize the segmentation information available.
   splitter_.set_segmentation_block_list(block_list);
   splitter_.set_ocr_split_strategy(max_ocr_strategy);
   // Run the splitter for OCR
   bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
   // Restore pix_binary to the binarized original pix for future reference.
   ASSERT_HOST(splitter_.orig_pix());
   pixDestroy(&pix_binary_);
   pix_binary_ = pixClone(splitter_.orig_pix());
   // If the pageseg and ocr strategies are different, refresh the block list
   // (from the last SegmentImage call) with blobs from the real image to be used
   // for OCR.
   if (splitter_.HasDifferentSplitStrategies()) {
     BLOCK block("", TRUE, 0, 0, 0, 0, pixGetWidth(pix_binary_),
                 pixGetHeight(pix_binary_));
     Pix* pix_for_ocr = split_for_ocr ? splitter_.splitted_image() :
         splitter_.orig_pix();
     extract_edges(pix_for_ocr, &block);
     splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());
   }
   // The splitter isn't needed any more after this, so save memory by clearing.
   splitter_.Clear();
 }

◆ PrerecAllWordsPar()

void tesseract::Tesseract::PrerecAllWordsPar ( const GenericVector< WordData > & words )

Definition at line 39 of file par_control.cpp.

                                                                       {
   // Pre-classifies every blob of every word in `words` so that the results
   // are already stored when the words are recognized later.
   // Prepare all the blobs.
   GenericVector<BlobData> blobs;
   for (int w = 0; w < words.size(); ++w) {
     // Only words that have a ratings matrix whose (0, 0) entry is still
     // empty are queued.
     if (words[w].word->ratings != NULL &&
         words[w].word->ratings->get(0, 0) == NULL) {
       for (int s = 0; s < words[w].lang_words.size(); ++s) {
         // Entries beyond the sub-language list are classified by this
         // instance itself.
         Tesseract* sub = s < sub_langs_.size() ? sub_langs_[s] : this;
         const WERD_RES& word = *words[w].lang_words[s];
         for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) {
           blobs.push_back(BlobData(b, sub, word));
         }
       }
     }
   }
   // Pre-classify all the blobs.
   // Both branches run the same loop; the first one is additionally marked
   // for OpenMP parallel execution when the build defines _OPENMP.
   if (tessedit_parallelize > 1) {
 #ifdef _OPENMP
 #pragma omp parallel for num_threads(10)
 #endif  // _OPENMP
     for (int b = 0; b < blobs.size(); ++b) {
       *blobs[b].choices =
           blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, NULL);
     }
   } else {
     // TODO(AMD) parallelize this.
     for (int b = 0; b < blobs.size(); ++b) {
       *blobs[b].choices =
           blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, NULL);
     }
   }
 }

◆ process_cmd_win_event()

BOOL8 tesseract::Tesseract::process_cmd_win_event	(	inT32	cmd_event,
		char *	new_value
	)

Definition at line 397 of file pgedit.cpp.

                                         {
   char msg[160];
   BOOL8 exit = FALSE;
 
   color_mode = CM_RAINBOW;
 
   // Run recognition on the full page if needed.
   switch (cmd_event) {
     case BLAMER_CMD_EVENT:
     case SHOW_SUBSCRIPT_CMD_EVENT:
     case SHOW_SUPERSCRIPT_CMD_EVENT:
     case SHOW_ITALIC_CMD_EVENT:
     case SHOW_BOLD_CMD_EVENT:
     case SHOW_UNDERLINE_CMD_EVENT:
     case SHOW_FIXEDPITCH_CMD_EVENT:
     case SHOW_SERIF_CMD_EVENT:
     case SHOW_SMALLCAPS_CMD_EVENT:
     case SHOW_DROPCAPS_CMD_EVENT:
       if (!recog_done) {
         recog_all_words(current_page_res, NULL, NULL, NULL, 0);
         recog_done = true;
       }
       break;
     default:
       break;
   }
 
   switch (cmd_event) {
     case NULL_CMD_EVENT:
       break;
 
     case CHANGE_DISP_CMD_EVENT:
     case DUMP_WERD_CMD_EVENT:
     case SHOW_POINT_CMD_EVENT:
     case SHOW_BLN_WERD_CMD_EVENT:
     case RECOG_WERDS:
     case RECOG_PSEUDO:
     case SHOW_BLOB_FEATURES:
       mode =(CMD_EVENTS) cmd_event;
       break;
     case DEBUG_WERD_CMD_EVENT:
       mode = DEBUG_WERD_CMD_EVENT;
       word_config_ = image_win->ShowInputDialog("Config File Name");
       break;
     case BOUNDING_BOX_CMD_EVENT:
       if (new_value[0] == 'T')
         word_display_mode.turn_on_bit(DF_BOX);
       else
         word_display_mode.turn_off_bit(DF_BOX);
       mode = CHANGE_DISP_CMD_EVENT;
       break;
     case BLAMER_CMD_EVENT:
       if (new_value[0] == 'T')
         word_display_mode.turn_on_bit(DF_BLAMER);
       else
         word_display_mode.turn_off_bit(DF_BLAMER);
       do_re_display(&tesseract::Tesseract::word_display);
       mode = CHANGE_DISP_CMD_EVENT;
       break;
     case CORRECT_TEXT_CMD_EVENT:
       if (new_value[0] == 'T')
         word_display_mode.turn_on_bit(DF_TEXT);
       else
         word_display_mode.turn_off_bit(DF_TEXT);
       mode = CHANGE_DISP_CMD_EVENT;
       break;
     case POLYGONAL_CMD_EVENT:
       if (new_value[0] == 'T')
         word_display_mode.turn_on_bit(DF_POLYGONAL);
       else
         word_display_mode.turn_off_bit(DF_POLYGONAL);
       mode = CHANGE_DISP_CMD_EVENT;
       break;
     case BL_NORM_CMD_EVENT:
       if (new_value[0] == 'T')
         word_display_mode.turn_on_bit(DF_BN_POLYGONAL);
       else
         word_display_mode.turn_off_bit(DF_BN_POLYGONAL);
       mode = CHANGE_DISP_CMD_EVENT;
       break;
     case BITMAP_CMD_EVENT:
       if (new_value[0] == 'T')
         word_display_mode.turn_on_bit(DF_EDGE_STEP);
       else
         word_display_mode.turn_off_bit(DF_EDGE_STEP);
       mode = CHANGE_DISP_CMD_EVENT;
       break;
     case UNIFORM_DISP_CMD_EVENT:
       do_re_display(&tesseract::Tesseract::word_set_display);
       break;
     case IMAGE_CMD_EVENT:
       display_image =(new_value[0] == 'T');
       do_re_display(&tesseract::Tesseract::word_display);
       break;
     case BLOCKS_CMD_EVENT:
       display_blocks =(new_value[0] == 'T');
       do_re_display(&tesseract::Tesseract::word_display);
       break;
     case BASELINES_CMD_EVENT:
       display_baselines =(new_value[0] == 'T');
       do_re_display(&tesseract::Tesseract::word_display);
       break;
     case SHOW_SUBSCRIPT_CMD_EVENT:
       color_mode = CM_SUBSCRIPT;
       do_re_display(&tesseract::Tesseract::word_display);
       break;
     case SHOW_SUPERSCRIPT_CMD_EVENT:
       color_mode = CM_SUPERSCRIPT;
       do_re_display(&tesseract::Tesseract::word_display);
       break;
     case SHOW_ITALIC_CMD_EVENT:
       color_mode = CM_ITALIC;
       do_re_display(&tesseract::Tesseract::word_display);
       break;
     case SHOW_BOLD_CMD_EVENT:
       color_mode = CM_BOLD;
       do_re_display(&tesseract::Tesseract::word_display);
       break;
     case SHOW_UNDERLINE_CMD_EVENT:
       color_mode = CM_UNDERLINE;
       do_re_display(&tesseract::Tesseract::word_display);
       break;
     case SHOW_FIXEDPITCH_CMD_EVENT:
       color_mode = CM_FIXEDPITCH;
       do_re_display(&tesseract::Tesseract::word_display);
       break;
     case SHOW_SERIF_CMD_EVENT:
       color_mode = CM_SERIF;
       do_re_display(&tesseract::Tesseract::word_display);
       break;
     case SHOW_SMALLCAPS_CMD_EVENT:
       color_mode = CM_SMALLCAPS;
       do_re_display(&tesseract::Tesseract::word_display);
       break;
     case SHOW_DROPCAPS_CMD_EVENT:
       color_mode = CM_DROPCAPS;
       do_re_display(&tesseract::Tesseract::word_display);
       break;
     case REFRESH_CMD_EVENT:
       do_re_display(&tesseract::Tesseract::word_display);
       break;
     case QUIT_CMD_EVENT:
       exit = TRUE;
       ScrollView::Exit();
       break;
 
     default:
       sprintf(msg, "Unrecognised event %" PRId32 "(%s)", cmd_event, new_value);
       image_win->AddMessage(msg);
     break;
   }
   return exit;
 }

◆ process_image_event()

void tesseract::Tesseract::process_image_event ( const SVEvent & event )

process_image_event()

User has done something in the image window - mouse down or up. Work out what it is and do something with it. If DOWN - just remember where it was. If UP - for each word in the selected area do the operation defined by the current mode.

Definition at line 564 of file pgedit.cpp.

                                                           {
    // The following variable should remain static, since it is used by
    // debug editor, which uses a single Tesseract instance.
   static ICOORD down;
   ICOORD up;
   TBOX selection_box;
   char msg[80];
 
   switch(event.type) {
 
     case SVET_SELECTION:
       if (event.type == SVET_SELECTION) {
         down.set_x(event.x + event.x_size);
         down.set_y(event.y + event.y_size);
         if (mode == SHOW_POINT_CMD_EVENT)
           show_point(current_page_res, event.x, event.y);
       }
 
       up.set_x(event.x);
       up.set_y(event.y);
 
       selection_box = TBOX(down, up);
 
       switch(mode) {
         case CHANGE_DISP_CMD_EVENT:
           process_selected_words(
               current_page_res,
               selection_box,
               &tesseract::Tesseract::word_blank_and_set_display);
           break;
        case DUMP_WERD_CMD_EVENT:
           process_selected_words(current_page_res,
                                  selection_box,
                                  &tesseract::Tesseract::word_dumper);
           break;
         case SHOW_BLN_WERD_CMD_EVENT:
           process_selected_words(current_page_res,
                                  selection_box,
                                  &tesseract::Tesseract::word_bln_display);
           break;
         case DEBUG_WERD_CMD_EVENT:
           debug_word(current_page_res, selection_box);
           break;
         case SHOW_POINT_CMD_EVENT:
           break;                 // ignore up event
 
         case RECOG_WERDS:
           image_win->AddMessage("Recogging selected words");
           this->process_selected_words(current_page_res,
                                        selection_box,
                                        &Tesseract::recog_interactive);
           break;
         case RECOG_PSEUDO:
           image_win->AddMessage("Recogging selected blobs");
           recog_pseudo_word(current_page_res, selection_box);
           break;
         case SHOW_BLOB_FEATURES:
           blob_feature_display(current_page_res, selection_box);
           break;
 
         default:
           sprintf(msg, "Mode %d not yet implemented", mode);
           image_win->AddMessage(msg);
           break;
       }
     default:
       break;
   }
 }

◆ process_selected_words()

void tesseract::Tesseract::process_selected_words	(	PAGE_RES *	page_res,
		TBOX &	selection_box,
		BOOL8(tesseract::Tesseract::)(PAGE_RES_IT pr_it)	word_processor
	)

Definition at line 30 of file pagewalk.cpp.

                                                                   {
   for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != NULL;
        page_res_it.forward()) {
     WERD* word = page_res_it.word()->word;
     if (word->bounding_box().overlap(selection_box)) {
       if (!(this->*word_processor)(&page_res_it))
         return;
     }
   }
 }

◆ ProcessTargetWord()

bool tesseract::Tesseract::ProcessTargetWord	(	const TBOX &	word_box,
		const TBOX &	target_word_box,
		const char *	word_config,
		int	pass
	)

Definition at line 121 of file control.cpp.

                                             {
   if (word_config != NULL) {
     if (word_box.major_overlap(target_word_box)) {
       if (backup_config_file_ == NULL) {
         backup_config_file_ = kBackUpConfigFile;
         FILE* config_fp = fopen(backup_config_file_, "wb");
         ParamUtils::PrintParams(config_fp, params());
         fclose(config_fp);
         ParamUtils::ReadParamsFile(word_config,
                                    SET_PARAM_CONSTRAINT_DEBUG_ONLY,
                                    params());
       }
     } else {
       if (backup_config_file_ != NULL) {
         ParamUtils::ReadParamsFile(backup_config_file_,
                                    SET_PARAM_CONSTRAINT_DEBUG_ONLY,
                                    params());
         backup_config_file_ = NULL;
       }
     }
   } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
     return false;
   }
   return true;
 }

◆ quality_based_rejection()

void tesseract::Tesseract::quality_based_rejection	(	PAGE_RES_IT &	page_res_it,
		BOOL8	good_quality_doc
	)

Definition at line 143 of file docqual.cpp.

                                                                 {
   if ((tessedit_good_quality_unrej && good_quality_doc))
     unrej_good_quality_words(page_res_it);
   doc_and_block_rejection(page_res_it, good_quality_doc);
   if (unlv_tilde_crunching) {
     tilde_crunch(page_res_it);
     tilde_delete(page_res_it);
   }
 }

◆ read_config_file()

void tesseract::Tesseract::read_config_file	(	const char *	filename,
		SetParamConstraint	constraint
	)

Definition at line 60 of file tessedit.cpp.

                                                                 {
   STRING path = datadir;
   path += "configs/";
   path += filename;
   FILE* fp;
   if ((fp = fopen(path.string(), "rb")) != NULL) {
     fclose(fp);
   } else {
     path = datadir;
     path += "tessconfigs/";
     path += filename;
     if ((fp = fopen(path.string(), "rb")) != NULL) {
       fclose(fp);
     } else {
       path = filename;
     }
   }
   ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
 }

◆ ReassignDiacritics()

bool tesseract::Tesseract::ReassignDiacritics	(	int	pass,
		PAGE_RES_IT *	pr_it,
		bool *	make_next_word_fuzzy
	)

Definition at line 927 of file control.cpp.

                                                                {
   *make_next_word_fuzzy = false;
   WERD* real_word = pr_it->word()->word;
   if (real_word->rej_cblob_list()->empty() ||
       real_word->cblob_list()->empty() ||
       real_word->rej_cblob_list()->length() > noise_maxperword)
     return false;
   real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
   // Get the noise outlines into a vector with matching bool map.
   GenericVector<C_OUTLINE*> outlines;
   real_word->GetNoiseOutlines(&outlines);
   GenericVector<bool> word_wanted;
   GenericVector<bool> overlapped_any_blob;
   GenericVector<C_BLOB*> target_blobs;
   AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
                                      &word_wanted, &overlapped_any_blob,
                                      &target_blobs);
   // Filter the outlines that overlapped any blob and put them into the word
   // now. This simplifies the remaining task and also makes it more accurate
   // as it has more completed blobs to work on.
   GenericVector<bool> wanted;
   GenericVector<C_BLOB*> wanted_blobs;
   GenericVector<C_OUTLINE*> wanted_outlines;
   int num_overlapped = 0;
   int num_overlapped_used = 0;
   for (int i = 0; i < overlapped_any_blob.size(); ++i) {
     if (overlapped_any_blob[i]) {
       ++num_overlapped;
       if (word_wanted[i]) ++num_overlapped_used;
       wanted.push_back(word_wanted[i]);
       wanted_blobs.push_back(target_blobs[i]);
       wanted_outlines.push_back(outlines[i]);
       outlines[i] = NULL;
     }
   }
   real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, NULL);
   AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
                              &target_blobs);
   int non_overlapped = 0;
   int non_overlapped_used = 0;
   for (int i = 0; i < word_wanted.size(); ++i) {
     if (word_wanted[i]) ++non_overlapped_used;
     if (outlines[i] != NULL) ++non_overlapped_used;
   }
   if (debug_noise_removal) {
     tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
             num_overlapped_used, num_overlapped, non_overlapped_used,
             non_overlapped);
     real_word->bounding_box().print();
   }
   // Now we have decided which outlines we want, put them into the real_word.
   if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
                                      make_next_word_fuzzy)) {
     pr_it->MakeCurrentWordFuzzy();
   }
   // TODO(rays) Parts of combos have a deep copy of the real word, and need
   // to have their noise outlines moved/assigned in the same way!!
   return num_overlapped_used != 0 || non_overlapped_used != 0;
 }

◆ recog_all_words()

bool tesseract::Tesseract::recog_all_words	(	PAGE_RES *	page_res,
		ETEXT_DESC *	monitor,
		const TBOX *	target_word_box,
		const char *	word_config,
		int	dopasses
	)

recog_all_words()

Walk the page_res, recognizing all the words. If monitor is not null, it is used as a progress monitor/timeout/cancel. dopasses selects which passes run: 0 runs all recognition passes, 1 runs just pass 1, and 2 runs pass 2 and higher. If target_word_box is not null, special things are done to words that overlap the target_word_box: if word_config is not null, the word config file is read for just the target word(s); otherwise, on pass 2 and beyond ONLY the target words are processed (Jetsoft modification). Returns false if we cancelled prematurely.

Parameters

page_res	page structure
monitor	progress monitor
word_config	word_config file
target_word_box	specifies just to extract a rectangle
dopasses	0 - all, 1 just pass 1, 2 passes 2 and higher

Definition at line 300 of file control.cpp.

                                               {
   // Runs the recognition passes over page_res in order. Pass 1 runs when
   // dopasses is 0 or 1; everything after the "dopasses == 1" check runs for
   // any other value. Returns false if RecogAllWordsPassN() reports failure
   // for pass 1 or pass 2, and true otherwise.
   PAGE_RES_IT page_res_it(page_res);
 
   // tessedit_minimal_rej_pass1 forces test-adaption and minimal-rejection
   // mode on for the rest of the call.
   if (tessedit_minimal_rej_pass1) {
     tessedit_test_adaption.set_value (TRUE);
     tessedit_minimal_rejection.set_value (TRUE);
   }
 
   if (dopasses==0 || dopasses==1) {
     page_res_it.restart_page();
     // ****************** Pass 1 *******************
 
     // If the adaptive classifier is full switch to one we prepared earlier,
     // ie on the previous page. If the current adaptive classifier is non-empty,
     // prepare a backup starting at this page, in case it fills up. Do all this
     // independently for each language.
     if (AdaptiveClassifierIsFull()) {
       SwitchAdaptiveClassifier();
     } else if (!AdaptiveClassifierIsEmpty()) {
       StartBackupAdaptiveClassifier();
     }
     // Now check the sub-langs as well.
     for (int i = 0; i < sub_langs_.size(); ++i) {
       if (sub_langs_[i]->AdaptiveClassifierIsFull()) {
         sub_langs_[i]->SwitchAdaptiveClassifier();
       } else if (!sub_langs_[i]->AdaptiveClassifierIsEmpty()) {
         sub_langs_[i]->StartBackupAdaptiveClassifier();
       }
     }
     // Set up all words ready for recognition, so that if parallelism is on
     // all the input and output classes are ready to run the classifier.
     GenericVector<WordData> words;
     SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
     if (tessedit_parallelize) {
       PrerecAllWordsPar(words);
     }
 
     // Reset the per-page statistics before pass 1 fills them in.
     stats_.word_count = words.size();
 
     stats_.dict_words = 0;
     stats_.doc_blob_quality = 0;
     stats_.doc_outline_errs = 0;
     stats_.doc_char_quality = 0;
     stats_.good_char_count = 0;
     stats_.doc_good_char_quality = 0;
 
     most_recently_used_ = this;
     // Run pass 1 word recognition.
     if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false;
     // Pass 1 post-processing.
     for (page_res_it.restart_page(); page_res_it.word() != NULL;
          page_res_it.forward()) {
       // Repeated-character words get their own fix-up and are excluded from
       // the counting below.
       if (page_res_it.word()->word->flag(W_REP_CHAR)) {
         fix_rep_char(&page_res_it);
         continue;
       }
 
       // Count dict words.
       if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
         ++(stats_.dict_words);
 
       // Update misadaption log (we only need to do it on pass 1, since
       // adaption only happens on this pass).
       if (page_res_it.word()->blamer_bundle != NULL &&
           page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
         page_res->misadaption_log.push_back(
             page_res_it.word()->blamer_bundle->misadaption_debug());
       }
     }
   }
 
   // Caller asked for pass 1 only.
   if (dopasses == 1) return true;
 
   // ****************** Pass 2 *******************
   // Pass 2 is skipped when adaption is disabled, when test adaption is on, or
   // when AnyTessLang() is false.
   if (tessedit_tess_adaption_mode != 0x0 && !tessedit_test_adaption &&
       AnyTessLang()) {
     page_res_it.restart_page();
     GenericVector<WordData> words;
     SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
     if (tessedit_parallelize) {
       PrerecAllWordsPar(words);
     }
     most_recently_used_ = this;
     // Run pass 2 word recognition.
     if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
   }
 
   // The next passes are only required for Tess-only.
   if (AnyTessLang() && !AnyLSTMLang()) {
     // ****************** Pass 3 *******************
     // Fix fuzzy spaces.
     set_global_loc_code(LOC_FUZZY_SPACE);
 
     if (!tessedit_test_adaption && tessedit_fix_fuzzy_spaces
         && !tessedit_word_for_word && !right_to_left())
       fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
 
     // ****************** Pass 4 *******************
     if (tessedit_enable_dict_correction) dictionary_correction_pass(page_res);
     if (tessedit_enable_bigram_correction) bigram_correction_pass(page_res);
 
     // ****************** Pass 5,6 *******************
     rejection_passes(page_res, monitor, target_word_box, word_config);
 
     // ****************** Pass 8 *******************
     font_recognition_pass(page_res);
 
     // ****************** Pass 9 *******************
     // Check the correctness of the final results.
     blamer_pass(page_res);
     script_pos_pass(page_res);
   }
 
   // Write results pass.
   set_global_loc_code(LOC_WRITE_RESULTS);
   // This is now redundant, but retained to show how to obtain
   // bounding boxes and style information.
 
   // changed by jetsoft
   // needed for dll to output memory structure
   if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
     output_pass(page_res_it, target_word_box);
   // end jetsoft
   PageSegMode pageseg_mode = static_cast<PageSegMode>(
       static_cast<int>(tessedit_pageseg_mode));
   textord_.CleanupSingleRowResult(pageseg_mode, page_res);
 
   // Remove empty words, as these mess up the result iterators.
   // A word is dropped when it has no best choice, an empty best choice, or
   // an all-space best choice in a block that has no polygon or whose polygon
   // is text.
   for (page_res_it.restart_page(); page_res_it.word() != NULL;
        page_res_it.forward()) {
     WERD_RES* word = page_res_it.word();
     POLY_BLOCK* pb = page_res_it.block()->block != NULL
                          ? page_res_it.block()->block->poly_block()
                          : NULL;
     if (word->best_choice == NULL || word->best_choice->length() == 0 ||
         (word->best_choice->IsAllSpaces() && (pb == NULL || pb->IsText()))) {
       page_res_it.DeleteCurrentWord();
     }
   }
 
   // Report completion to the progress monitor, if there is one.
   if (monitor != NULL) {
     monitor->progress = 100;
   }
   return true;
 }

◆ recog_interactive()

BOOL8 tesseract::Tesseract::recog_interactive ( PAGE_RES_IT * pr_it )

Recognize a single word in interactive mode.

Parameters

pr_it the page results iterator

Definition at line 82 of file control.cpp.

                                                      {
   inT16 char_qual;
   inT16 good_char_qual;
 
   WordData word_data(*pr_it);
   SetupWordPassN(2, &word_data);
   // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
   if (lstm_recognizer_ == NULL) {
     classify_word_and_language(2, pr_it, &word_data);
   } else {
     classify_word_and_language(1, pr_it, &word_data);
   }
   if (tessedit_debug_quality_metrics) {
     WERD_RES* word_res = pr_it->word();
     word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual);
     tprintf("\n%d chars;  word_blob_quality: %d;  outline_errs: %d; "
             "char_quality: %d; good_char_quality: %d\n",
             word_res->reject_map.length(),
             word_blob_quality(word_res, pr_it->row()->row),
             word_outline_errs(word_res), char_qual, good_char_qual);
   }
   return TRUE;
 }

◆ recog_pseudo_word()

void tesseract::Tesseract::recog_pseudo_word	(	PAGE_RES *	page_res,
		TBOX &	selection_box
	)

Definition at line 67 of file control.cpp.

                                                        {
   PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
   if (it != NULL) {
     recog_interactive(it);
     it->DeleteCurrentWord();
     delete it;
   }
 }

◆ recog_training_segmented()

void tesseract::Tesseract::recog_training_segmented	(	const STRING &	fname,
		PAGE_RES *	page_res,
		volatile ETEXT_DESC *	monitor,
		FILE *	output_file
	)

Definition at line 79 of file recogtraining.cpp.

                                                             {
   // Walks the words of page_res and the entries of a box file in step.
   // Words whose tesseract box lines up with a box-file box (within
   // kMaxBoxEdgeDiff on every edge) are passed, with the box label, to
   // ambigs_classify_and_output, which writes to output_file.
   // The `monitor` parameter is not referenced in this body.

   // Derive the box file name: fname with everything from its last '.'
   // replaced by ".box".
   STRING box_fname = fname;
   const char *lastdot = strrchr(box_fname.string(), '.');
   if (lastdot != NULL) box_fname[lastdot - box_fname.string()] = '\0';
   box_fname += ".box";
   // ReadNextBox() will close box_file
   FILE *box_file = open_file(box_fname.string(), "r");
 
   PAGE_RES_IT page_res_it;
   page_res_it.page_res = page_res;
   page_res_it.restart_page();
   STRING label;
 
   // Process all the words on this page.
   TBOX tbox;  // tesseract-identified box
   TBOX bbox;  // box from the box file
   bool keep_going;
   int line_number = 0;
   int examined_words = 0;
   do {
     // Fetch the next candidate from each source; stop as soon as either
     // one reports failure.
     keep_going = read_t(&page_res_it, &tbox);
     keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label,
                               &bbox);
     // Align bottom left points of the TBOXes.
     // First the bottoms: advance whichever source is lagging.
     while (keep_going &&
            !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
       if (bbox.bottom() < tbox.bottom()) {
         page_res_it.forward();
         keep_going = read_t(&page_res_it, &tbox);
       } else {
         keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label,
                                  &bbox);
       }
     }
     // Then the left edges, again advancing one source at a time.
     while (keep_going &&
            !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
       if (bbox.left() > tbox.left()) {
         page_res_it.forward();
         keep_going = read_t(&page_res_it, &tbox);
       } else {
         keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label,
                                  &bbox);
       }
     }
     // OCR the word if top right points of the TBOXes are similar.
     if (keep_going &&
         NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
         NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
         ambigs_classify_and_output(label.string(), &page_res_it, output_file);
         examined_words++;
     }
     // Move past the current word whether or not it was examined.
     page_res_it.forward();
   } while (keep_going);
 
   // Set up scripts on all of the words that did not get sent to
   // ambigs_classify_and_output.  They all should have, but if all the
   // werd_res's don't get uch_sets, tesseract will crash when you try
   // to iterate over them. :-(
   int total_words = 0;
   for (page_res_it.restart_page(); page_res_it.block() != NULL;
        page_res_it.forward()) {
     if (page_res_it.word()) {
       if (page_res_it.word()->uch_set == NULL)
         page_res_it.word()->SetupFake(unicharset);
       total_words++;
     }
   }
   // Warn when fewer than 85% of the page's words were matched to a box.
   if (examined_words < 0.85 * total_words) {
     tprintf("TODO(antonova): clean up recog_training_segmented; "
             " It examined only a small fraction of the ambigs image.\n");
   }
   tprintf("recog_training_segmented: examined %d / %d words.\n",
           examined_words, total_words);
 }

◆ recog_word()

void tesseract::Tesseract::recog_word ( WERD_RES * word )

Definition at line 46 of file tfacepp.cpp.

                                          {
   // Recognizes `word` via recog_word_recursive, verifies that the result is
   // consistent with the blob boxes and the ratings matrix, optionally
   // overrides the permuter from a dictionary lookup, and finally records
   // success or failure in word->tess_failed.

   // Optionally skip words that have no ground truth attached.
   if (wordrec_skip_no_truth_words && (word->blamer_bundle == NULL ||
       word->blamer_bundle->incorrect_result_reason() == IRR_NO_TRUTH)) {
     if (classify_debug_level) tprintf("No truth for word - skipping\n");
     word->tess_failed = true;
     return;
   }
   ASSERT_HOST(!word->chopped_word->blobs.empty());
   recog_word_recursive(word);
   word->SetupBoxWord();
   // The choice must have exactly one entry per box; print details before
   // the assert below fires.
   if (word->best_choice->length() != word->box_word->length()) {
     tprintf("recog_word ASSERT FAIL String:\"%s\"; "
             "Strlen=%d; #Blobs=%d\n",
             word->best_choice->debug_string().string(),
             word->best_choice->length(), word->box_word->length());
   }
   ASSERT_HOST(word->best_choice->length() == word->box_word->length());
   // Check that the ratings matrix size matches the sum of all the
   // segmentation states.
   if (!word->StatesAllValid()) {
     tprintf("Not all words have valid states relative to ratings matrix!!");
     word->DebugWordChoices(true, NULL);
     ASSERT_HOST(word->StatesAllValid());
   }
   if (tessedit_override_permuter) {
     /* Override the permuter type if a straight dictionary check disagrees. */
     uint8_t perm_type = word->best_choice->permuter();
     // Only consider an override when the current permuter is not already
     // one of the three dictionary (DAWG) permuters.
     if ((perm_type != SYSTEM_DAWG_PERM) &&
         (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
       uint8_t real_dict_perm_type = dict_word(*word->best_choice);
       // Accept the dictionary's verdict only if alpha_count reports at
       // least one character for the word.
       if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
            (real_dict_perm_type == FREQ_DAWG_PERM) ||
            (real_dict_perm_type == USER_DAWG_PERM)) &&
           (alpha_count(word->best_choice->unichar_string().string(),
                        word->best_choice->unichar_lengths().string()) > 0)) {
         word->best_choice->set_permuter(real_dict_perm_type);  // use dict perm
       }
     }
     if (tessedit_rejection_debug &&
         perm_type != word->best_choice->permuter()) {
       tprintf("Permuter Type Flipped from %d to %d\n",
               perm_type, word->best_choice->permuter());
     }
   }
   // Factored out from control.cpp
   // NOTE(review): best_choice has already been dereferenced above, so the
   // NULL test below cannot be the first line of defence - confirm whether
   // recog_word_recursive guarantees a non-NULL best_choice.
   ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL));
   // A missing, empty or all-space result counts as a recognition failure.
   if (word->best_choice == NULL || word->best_choice->length() == 0 ||
       static_cast<int>(strspn(word->best_choice->unichar_string().string(),
                               " ")) == word->best_choice->length()) {
     word->tess_failed = true;
     word->reject_map.initialise(word->box_word->length());
     word->reject_map.rej_word_tess_failure();
   } else {
     word->tess_failed = false;
   }
 }

◆ recog_word_recursive()

void tesseract::Tesseract::recog_word_recursive ( WERD_RES * word )

Definition at line 110 of file tfacepp.cpp.

                                                    {
   int word_length = word->chopped_word->NumBlobs();  // no of blobs
   if (word_length > MAX_UNDIVIDED_LENGTH) {
     return split_and_recog_word(word);
   }
   cc_recog(word);
   word_length = word->rebuild_word->NumBlobs();  // No of blobs in output.
 
   // Do sanity checks and minor fixes on best_choice.
   if (word->best_choice->length() > word_length) {
     word->best_choice->make_bad();  // should never happen
     tprintf("recog_word: Discarded long string \"%s\""
             " (%d characters vs %d blobs)\n",
             word->best_choice->unichar_string().string(),
             word->best_choice->length(), word_length);
     tprintf("Word is at:");
     word->word->bounding_box().print();
   }
   if (word->best_choice->length() < word_length) {
     UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
     while (word->best_choice->length() < word_length) {
       word->best_choice->append_unichar_id(space_id, 1, 0.0,
                                            word->best_choice->certainty());
     }
   }
 }

◆ RecogAllWordsPassN()

bool tesseract::Tesseract::RecogAllWordsPassN	(	int	pass_n,
		ETEXT_DESC *	monitor,
		PAGE_RES_IT *	pr_it,
		GenericVector< WordData > *	words
	)

Definition at line 210 of file control.cpp.

                                                                    {
   // TODO(rays) Before this loop can be parallelized (it would yield a massive
   // speed-up) all remaining member globals need to be converted to local/heap
   // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
   // added. The results will be significantly different with adaption on, and
   // deterioration will need investigation.
   pr_it->restart_page();
   for (int w = 0; w < words->size(); ++w) {
     WordData* word = &(*words)[w];
     if (w > 0) word->prev_word = &(*words)[w - 1];
     if (monitor != NULL) {
       monitor->ocr_alive = TRUE;
       if (pass_n == 1) {
         monitor->progress = 70 * w / words->size();
         if (monitor->progress_callback != NULL) {
           TBOX box = pr_it->word()->word->bounding_box();
           (*monitor->progress_callback)(monitor->progress, box.left(),
                                         box.right(), box.top(), box.bottom());
         }
       } else {
         monitor->progress = 70 + 30 * w / words->size();
         if (monitor->progress_callback != NULL) {
           (*monitor->progress_callback)(monitor->progress, 0, 0, 0, 0);
         }
       }
       if (monitor->deadline_exceeded() ||
           (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
                                                          words->size()))) {
         // Timeout. Fake out the rest of the words.
         for (; w < words->size(); ++w) {
           (*words)[w].word->SetupFake(unicharset);
         }
         return false;
       }
     }
     if (word->word->tess_failed) {
       int s;
       for (s = 0; s < word->lang_words.size() &&
            word->lang_words[s]->tess_failed; ++s) {}
       // If all are failed, skip it. Image words are skipped by this test.
       if (s > word->lang_words.size()) continue;
     }
     // Sync pr_it with the wth WordData.
     while (pr_it->word() != NULL && pr_it->word() != word->word)
       pr_it->forward();
     ASSERT_HOST(pr_it->word() != NULL);
     bool make_next_word_fuzzy = false;
     if (!AnyLSTMLang() &&
         ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
       // Needs to be setup again to see the new outlines in the chopped_word.
       SetupWordPassN(pass_n, word);
     }
 
     classify_word_and_language(pass_n, pr_it, word);
     if (tessedit_dump_choices || debug_noise_removal) {
       tprintf("Pass%d: %s [%s]\n", pass_n,
               word->word->best_choice->unichar_string().string(),
               word->word->best_choice->debug_string().string());
     }
     pr_it->forward();
     if (make_next_word_fuzzy && pr_it->word() != NULL) {
       pr_it->MakeCurrentWordFuzzy();
     }
   }
   return true;
 }

◆ recognize_page()

void tesseract::Tesseract::recognize_page ( STRING & image_name )

◆ reject_edge_blobs()

void tesseract::Tesseract::reject_edge_blobs ( WERD_RES * word )

Definition at line 263 of file reject.cpp.

                                                 {
   TBOX word_box = word->word->bounding_box();
   // Use the box_word as it is already denormed back to image coordinates.
   int blobcount = word->box_word->length();
 
   if (word_box.left() < tessedit_image_border ||
       word_box.bottom() < tessedit_image_border ||
       word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
       word_box.top() + tessedit_image_border > ImageHeight() - 1) {
     ASSERT_HOST(word->reject_map.length() == blobcount);
     for (int blobindex = 0; blobindex < blobcount; blobindex++) {
       TBOX blob_box = word->box_word->BlobBox(blobindex);
       if (blob_box.left() < tessedit_image_border ||
           blob_box.bottom() < tessedit_image_border ||
           blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
           blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
         word->reject_map[blobindex].setrej_edge_char();
         // Close to edge
       }
     }
   }
 }

◆ reject_I_1_L()

void tesseract::Tesseract::reject_I_1_L ( WERD_RES * word )

Definition at line 191 of file reject.cpp.

                                            {
   inT16 i;
   inT16 offset;
 
   for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
        offset += word->best_choice->unichar_lengths()[i], i += 1) {
     if (STRING (conflict_set_I_l_1).
     contains (word->best_choice->unichar_string()[offset])) {
                                  //rej 1Il conflict
       word->reject_map[i].setrej_1Il_conflict ();
     }
   }
 }

◆ reject_mostly_rejects()

void tesseract::Tesseract::reject_mostly_rejects ( WERD_RES * word )

Definition at line 573 of file reject.cpp.

                                                     {
   /* Reject the whole of the word if the fraction of rejects exceeds a limit */
 
   if ((float) word->reject_map.reject_count() / word->reject_map.length() >=
     rej_whole_of_mostly_reject_word_fract)
     word->reject_map.rej_word_mostly_rej();
 }

◆ rejection_passes()

void tesseract::Tesseract::rejection_passes	(	PAGE_RES *	page_res,
		ETEXT_DESC *	monitor,
		const TBOX *	target_word_box,
		const char *	word_config
	)

Definition at line 598 of file control.cpp.

                                                           {
   // Pass 5 walks every word of page_res accumulating document-level
   // quality statistics in stats_; pass 6 then calls
   // quality_based_rejection with a good/bad document verdict derived from
   // those statistics. Both passes are skipped when tessedit_test_adaption
   // is set.
   //   monitor         - optional (may be NULL); receives progress updates.
   //   target_word_box - optional; when set, words for which
   //                     ProcessTargetWord returns false are skipped.
   //   word_config     - forwarded to ProcessTargetWord.
   PAGE_RES_IT page_res_it(page_res);
   // ****************** Pass 5 *******************
   // Gather statistics on rejects.
   int word_index = 0;
   while (!tessedit_test_adaption && page_res_it.word() != NULL) {
     set_global_loc_code(LOC_MM_ADAPT);
     WERD_RES* word = page_res_it.word();
     word_index++;
     if (monitor != NULL) {
       monitor->ocr_alive = TRUE;
       // NOTE(review): integer division - assumes stats_.word_count is
       // non-zero whenever the page has words; confirm against the caller.
       monitor->progress = 95 + 5 * word_index / stats_.word_count;
     }
     if (word->rebuild_word == NULL) {
       // Word was not processed by tesseract.
       page_res_it.forward();
       continue;
     }
     check_debug_pt(word, 70);
 
     // changed by jetsoft
     // specific to its needs to extract one word when need
     if (target_word_box &&
         !ProcessTargetWord(word->word->bounding_box(),
                            *target_word_box, word_config, 4)) {
       page_res_it.forward();
       continue;
     }
     // end jetsoft
 
     page_res_it.rej_stat_word();
     int chars_in_word = word->reject_map.length();
     int rejects_in_word = word->reject_map.reject_count();
 
     // Add this word's blob quality, outline errors and character quality
     // to the document totals.
     int blob_quality = word_blob_quality(word, page_res_it.row()->row);
     stats_.doc_blob_quality += blob_quality;
     int outline_errs = word_outline_errs(word);
     stats_.doc_outline_errs += outline_errs;
     inT16 all_char_quality;
     inT16 accepted_all_char_quality;
     word_char_quality(word, page_res_it.row()->row,
                       &all_char_quality, &accepted_all_char_quality);
     stats_.doc_char_quality += all_char_quality;
     // The "good" totals only include words whose permuter is one of the
     // three dictionary (DAWG) permuters.
     uint8_t permuter_type = word->best_choice->permuter();
     if ((permuter_type == SYSTEM_DAWG_PERM) ||
         (permuter_type == FREQ_DAWG_PERM) ||
         (permuter_type == USER_DAWG_PERM)) {
       stats_.good_char_count += chars_in_word - rejects_in_word;
       stats_.doc_good_char_quality += accepted_all_char_quality;
     }
     check_debug_pt(word, 80);
     // Reject the word outright when it has zero blob quality and at least
     // as many outline errors as characters.
     if (tessedit_reject_bad_qual_wds &&
         (blob_quality == 0) && (outline_errs >= chars_in_word))
       word->reject_map.rej_word_bad_quality();
     check_debug_pt(word, 90);
     page_res_it.forward();
   }
 
   if (tessedit_debug_quality_metrics) {
     tprintf
       ("QUALITY: num_chs= %d  num_rejs= %d %5.3f blob_qual= %d %5.3f"
        " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
       page_res->char_count, page_res->rej_count,
       page_res->rej_count / static_cast<float>(page_res->char_count),
       stats_.doc_blob_quality,
       stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
       stats_.doc_outline_errs,
       stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
       stats_.doc_char_quality,
       stats_.doc_char_quality / static_cast<float>(page_res->char_count),
       stats_.doc_good_char_quality,
       (stats_.good_char_count > 0) ?
       (stats_.doc_good_char_quality /
        static_cast<float>(stats_.good_char_count)) : 0.0);
   }
   // The document counts as good quality only if all four per-character
   // ratios are on the right side of their quality_*_pc thresholds.
   BOOL8 good_quality_doc =
     ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
      quality_rej_pc) &&
     (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
      quality_blob_pc) &&
     (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
      quality_outline_pc) &&
     (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
      quality_char_pc);
 
   // ****************** Pass 6 *******************
   // Do whole document or whole block rejection pass
   if (!tessedit_test_adaption) {
     set_global_loc_code(LOC_DOC_BLK_REJ);
     quality_based_rejection(page_res_it, good_quality_doc);
   }
 }

◆ repeated_nonalphanum_wd()

BOOL8 tesseract::Tesseract::repeated_nonalphanum_wd	(	WERD_RES *	word,
		ROW *	row
	)

Definition at line 582 of file reject.cpp.

                                                                  {
   inT16 char_quality;
   inT16 accepted_char_quality;
 
   if (word->best_choice->unichar_lengths().length() <= 1)
     return FALSE;
 
   if (!STRING(ok_repeated_ch_non_alphanum_wds).
     contains(word->best_choice->unichar_string()[0]))
     return FALSE;
 
   UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
   for (int i = 1; i < word->best_choice->length(); ++i) {
     if (word->best_choice->unichar_id(i) != uch_id) return FALSE;
   }
 
   word_char_quality(word, row, &char_quality, &accepted_char_quality);
 
   if ((word->best_choice->unichar_lengths().length () == char_quality) &&
     (char_quality == accepted_char_quality))
     return TRUE;
   else
     return FALSE;
 }

◆ ReportFailedBox()

void tesseract::Tesseract::ReportFailedBox	(	int	boxfile_lineno,
		TBOX	box,
		const char *	box_ch,
		const char *	err_msg
	)

Logs a bad box by line in the box file and box coords.

Definition at line 764 of file applybox.cpp.

                                                                          {
   tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
           boxfile_lineno + 1, box_ch,
           box.left(), box.bottom(), box.right(), box.top(), err_msg);
 }

◆ ReportXhtFixResult()

void tesseract::Tesseract::ReportXhtFixResult	(	bool	accept_new_word,
		float	new_x_ht,
		WERD_RES *	word,
		WERD_RES *	new_word
	)

Definition at line 1413 of file control.cpp.

                                                                        {
   tprintf("New XHT Match:%s = %s ",
           word->best_choice->unichar_string().string(),
           word->best_choice->debug_string().string());
   word->reject_map.print(debug_fp);
   tprintf(" -> %s = %s ",
           new_word->best_choice->unichar_string().string(),
           new_word->best_choice->debug_string().string());
   new_word->reject_map.print(debug_fp);
   tprintf(" %s->%s %s %s\n",
           word->guessed_x_ht ? "GUESS" : "CERT",
           new_word->guessed_x_ht ? "GUESS" : "CERT",
           new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
           accept_new_word ? "ACCEPTED" : "");
 }

◆ ReSegmentByClassification()

void tesseract::Tesseract::ReSegmentByClassification ( PAGE_RES * page_res )

Resegments the words by running the classifier in an attempt to find the correct segmentation that produces the required string.

Definition at line 509 of file applybox.cpp.

                                                             {
   PAGE_RES_IT pr_it(page_res);
   WERD_RES* word_res;
   for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
     WERD* word = word_res->word;
     if (word->text() == NULL || word->text()[0] == '\0')
       continue;  // Ignore words that have no text.
     // Convert the correct text to a vector of UNICHAR_ID
     GenericVector<UNICHAR_ID> target_text;
     if (!ConvertStringToUnichars(word->text(), &target_text)) {
       tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
               word->text());
       pr_it.DeleteCurrentWord();
       continue;
     }
     if (!FindSegmentation(target_text, word_res)) {
       tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
               word->text());
       pr_it.DeleteCurrentWord();
       continue;
     }
   }
 }

◆ ResegmentCharBox()

bool tesseract::Tesseract::ResegmentCharBox	(	PAGE_RES *	page_res,
		const TBOX *	prev_box,
		const TBOX &	box,
		const TBOX &	next_box,
		const char *	correct_text
	)

Gather consecutive blobs that match the given box into the best_state and corresponding correct_text.

Fights over which box owns which blobs are settled by pre-chopping and applying the blobs to box or next_box with the least non-overlap.

Returns: false if the box was in error, which can only be caused by failing to find an appropriate blob for a box.

This means that occasionally, blobs may be incorrectly segmented if the chopper fails to find a suitable chop point.

Definition at line 340 of file applybox.cpp.

                                                            {
   // Finds the first word whose box_word overlaps `box`, gathers the run of
   // consecutive unclaimed blobs in it that belong to `box` rather than
   // `next_box`, merges them into one box_word entry and records the blob
   // count in best_state and `correct_text` in the word's correct_text.
   // Returns true once a run has been applied, false if no blob matched or
   // the merged box conflicts with overlapping neighbour boxes.
   if (applybox_debug > 1) {
     tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
   }
   PAGE_RES_IT page_res_it(page_res);
   WERD_RES* word_res;
   for (word_res = page_res_it.word(); word_res != NULL;
        word_res = page_res_it.forward()) {
     // Only words that substantially overlap the box are candidates.
     if (!word_res->box_word->bounding_box().major_overlap(box))
       continue;
     if (applybox_debug > 1) {
       tprintf("Checking word box:");
       word_res->box_word->bounding_box().print();
     }
     int word_len = word_res->box_word->length();
     // Try each blob index i as the start of a run of matching blobs.
     for (int i = 0; i < word_len; ++i) {
       TBOX char_box = TBOX();
       int blob_count = 0;
       // Extend the run while blobs keep overlapping the box, are still
       // unclaimed, and fit this box at least as well as the next one.
       for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
         TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
         if (!blob_box.major_overlap(box))
           break;
         if (word_res->correct_text[i + blob_count].length() > 0)
           break;  // Blob is claimed already.
         double current_box_miss_metric = BoxMissMetric(blob_box, box);
         double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
         if (applybox_debug > 2) {
           tprintf("Checking blob:");
           blob_box.print();
           tprintf("Current miss metric = %g, next = %g\n",
                   current_box_miss_metric, next_box_miss_metric);
         }
         if (current_box_miss_metric > next_box_miss_metric)
           break;  // Blob is a better match for next box.
         char_box += blob_box;
       }
       if (blob_count > 0) {
         if (applybox_debug > 1) {
           tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
         }
         // Give up if the gathered blobs do not reproduce the box (to
         // within 3) while the box overlaps its next or previous neighbour
         // by more than 3 horizontally.
         if (!char_box.almost_equal(box, 3) &&
             (box.x_gap(next_box) < -3 ||
              (prev_box != NULL && prev_box->x_gap(box) < -3))) {
           return false;
         }
         // We refine just the box_word, best_state and correct_text here.
         // The rebuild_word is made in TidyUp.
         // blob_count blobs are put together to match the box. Merge the
         // box_word boxes, save the blob_count in the state and the text.
         word_res->box_word->MergeBoxes(i, i + blob_count);
         word_res->best_state[i] = blob_count;
         word_res->correct_text[i] = correct_text;
         if (applybox_debug > 2) {
           tprintf("%d Blobs match: blob box:", blob_count);
           word_res->box_word->BlobBox(i).print();
           tprintf("Matches box:");
           box.print();
           tprintf("With next box:");
           next_box.print();
         }
         // Eliminated best_state and correct_text entries for the consumed
         // blobs.
         // Always removing index i + 1 drops the blob_count - 1 entries that
         // followed i, one per iteration.
         for (int j = 1; j < blob_count; ++j) {
           word_res->best_state.remove(i + 1);
           word_res->correct_text.remove(i + 1);
         }
         // Assume that no box spans multiple source words, so we are done with
         // this box.
         if (applybox_debug > 1) {
           tprintf("Best state = ");
           for (int j = 0; j < word_res->best_state.size(); ++j) {
             tprintf("%d ", word_res->best_state[j]);
           }
           tprintf("\n");
           tprintf("Correct text = [[ ");
           for (int j = 0; j < word_res->correct_text.size(); ++j) {
             tprintf("%s ", word_res->correct_text[j].string());
           }
           tprintf("]]\n");
         }
         return true;
       }
     }
   }
   if (applybox_debug > 0) {
     tprintf("FAIL!\n");
   }
   return false;  // Failure.
 }

◆ ResegmentWordBox()

bool tesseract::Tesseract::ResegmentWordBox	(	BLOCK_LIST *	block_list,
		const TBOX &	box,
		const TBOX &	next_box,
		const char *	correct_text
	)

Consume all source blobs that strongly overlap the given box, putting them into a new word, with the correct_text label. Fights over which box owns which blobs are settled by applying the blobs to box or next_box with the least non-overlap.

Returns: false if the box was in error, which can only be caused by failing to find an overlapping blob for a box.

Definition at line 438 of file applybox.cpp.

                                                            {
   // Scans every block, row and not-yet-labelled word that substantially
   // overlaps `box`, and moves each blob that overlaps `box` and fits it at
   // least as well as `next_box` into a single new word labelled with
   // `correct_text`. Returns true if such a word was created.
   if (applybox_debug > 1) {
     tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
   }
   // Created lazily on the first matching blob and then shared by all later
   // matches.
   WERD* new_word = NULL;
   BLOCK_IT b_it(block_list);
   for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
     BLOCK* block = b_it.data();
     if (!box.major_overlap(block->bounding_box()))
       continue;
     ROW_IT r_it(block->row_list());
     for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
       ROW* row = r_it.data();
       if (!box.major_overlap(row->bounding_box()))
         continue;
       WERD_IT w_it(row->word_list());
       for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
         WERD* word = w_it.data();
         if (applybox_debug > 2) {
           tprintf("Checking word:");
           word->bounding_box().print();
         }
         if (word->text() != NULL && word->text()[0] != '\0')
           continue;  // Ignore words that are already done.
         if (!box.major_overlap(word->bounding_box()))
           continue;
         C_BLOB_IT blob_it(word->cblob_list());
         for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
              blob_it.forward()) {
           C_BLOB* blob = blob_it.data();
           TBOX blob_box = blob->bounding_box();
           if (!blob_box.major_overlap(box))
             continue;
           double current_box_miss_metric = BoxMissMetric(blob_box, box);
           double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
           if (applybox_debug > 2) {
             tprintf("Checking blob:");
             blob_box.print();
             tprintf("Current miss metric = %g, next = %g\n",
                     current_box_miss_metric, next_box_miss_metric);
           }
           if (current_box_miss_metric > next_box_miss_metric)
             continue;  // Blob is a better match for next box.
           if (applybox_debug > 2) {
             tprintf("Blob match: blob:");
             blob_box.print();
             tprintf("Matches box:");
             box.print();
             tprintf("With next box:");
             next_box.print();
           }
           if (new_word == NULL) {
             // Make a new word with a single blob.
             // The copy is appended to the word list of the row currently
             // being scanned.
             new_word = word->shallow_copy();
             new_word->set_text(correct_text);
             w_it.add_to_end(new_word);
           }
           // Move the blob out of its source word into the new word.
           C_BLOB_IT new_blob_it(new_word->cblob_list());
           new_blob_it.add_to_end(blob_it.extract());
         }
       }
     }
   }
   if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n");
   return new_word != NULL;
 }

◆ ResetAdaptiveClassifier()

void tesseract::Tesseract::ResetAdaptiveClassifier ( )

Definition at line 658 of file tesseractclass.cpp.

                                         {
   ResetAdaptiveClassifierInternal();
   for (int i = 0; i < sub_langs_.size(); ++i) {
     sub_langs_[i]->ResetAdaptiveClassifierInternal();
   }
 }

◆ ResetDocumentDictionary()

void tesseract::Tesseract::ResetDocumentDictionary ( )

Definition at line 666 of file tesseractclass.cpp.

                                         {
   getDict().ResetDocumentDictionary();
   for (int i = 0; i < sub_langs_.size(); ++i) {
     sub_langs_[i]->getDict().ResetDocumentDictionary();
   }
 }

◆ reskew()

const FCOORD& tesseract::Tesseract::reskew ( ) const

inline

Definition at line 181 of file tesseractclass.h.

                                {
     // Read-only accessor: returns the reskew_ member by const reference.
     return reskew_;
   }

◆ RetryWithLanguage()

int tesseract::Tesseract::RetryWithLanguage	(	const WordData &	word_data,
		WordRecognizer	recognizer,
		bool	debug,
		WERD_RES **	in_word,
		PointerVector< WERD_RES > *	best_words
	)

Definition at line 888 of file control.cpp.

                                                                       {
   if (debug) {
     tprintf("Trying word using lang %s, oem %d\n",
             lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
   }
   // Run the recognizer on the word.
   PointerVector<WERD_RES> new_words;
   (this->*recognizer)(word_data, in_word, &new_words);
   if (new_words.empty()) {
     // Transfer input word to new_words, as the classifier must have put
     // the result back in the input.
     new_words.push_back(*in_word);
     *in_word = NULL;
   }
   if (debug) {
     for (int i = 0; i < new_words.size(); ++i)
       new_words[i]->DebugTopChoice("Lang result");
   }
   // Initial version is a bit of a hack based on better certainty and rating
   // or a dictionary vs non-dictionary word.
   return SelectBestWords(classify_max_rating_ratio,
                          classify_max_certainty_margin,
                          debug, &new_words, best_words);
 }

◆ right_to_left()

bool tesseract::Tesseract::right_to_left ( ) const

inline

Definition at line 250 of file tesseractclass.h.

                              {
     // Read-only accessor: returns the right_to_left_ member.
     return right_to_left_;
   }

◆ RunOldFixXht()

bool tesseract::Tesseract::RunOldFixXht	(	WERD_RES *	word,
		BLOCK *	block,
		ROW *	row
	)

◆ safe_dict_word()

inT16 tesseract::Tesseract::safe_dict_word ( const WERD_RES * werd_res )

Definition at line 607 of file reject.cpp.

                                                         {
   const WERD_CHOICE &word = *werd_res->best_choice;
   int dict_word_type = werd_res->tesseract->dict_word(word);
   return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
 }

◆ scaled_color()

Pix* tesseract::Tesseract::scaled_color ( ) const

inline

Definition at line 233 of file tesseractclass.h.

                             {
     // Read-only accessor: returns the scaled_color_ pointer member.
     return scaled_color_;
   }

◆ scaled_factor()

int tesseract::Tesseract::scaled_factor ( ) const

inline

Definition at line 236 of file tesseractclass.h.

                             {
     // Read-only accessor: returns the scaled_factor_ member.
     return scaled_factor_;
   }

◆ script_pos_pass()

void tesseract::Tesseract::script_pos_pass ( PAGE_RES * page_res )

Definition at line 718 of file control.cpp.

                                                   {
   PAGE_RES_IT page_res_it(page_res);
   for (page_res_it.restart_page(); page_res_it.word() != NULL;
       page_res_it.forward()) {
     WERD_RES* word = page_res_it.word();
      if (word->word->flag(W_REP_CHAR)) {
       page_res_it.forward();
       continue;
     }
     float x_height = page_res_it.block()->block->x_height();
     float word_x_height = word->x_height;
     if (word_x_height < word->best_choice->min_x_height() ||
         word_x_height > word->best_choice->max_x_height()) {
       word_x_height = (word->best_choice->min_x_height() +
           word->best_choice->max_x_height()) / 2.0f;
     }
     // Test for small caps. Word capheight must be close to block xheight,
     // and word must contain no lower case letters, and at least one upper case.
     double small_cap_xheight = x_height * kXHeightCapRatio;
     double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
     if (word->uch_set->script_has_xheight() &&
         small_cap_xheight - small_cap_delta <= word_x_height &&
         word_x_height <= small_cap_xheight + small_cap_delta) {
       // Scan for upper/lower.
       int num_upper = 0;
       int num_lower = 0;
       for (int i = 0; i < word->best_choice->length(); ++i) {
         if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
           ++num_upper;
         else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
           ++num_lower;
       }
       if (num_upper > 0 && num_lower == 0)
         word->small_caps = true;
     }
     word->SetScriptPositions();
   }
 }

◆ SearchForText()

void tesseract::Tesseract::SearchForText	(	const GenericVector< BLOB_CHOICE_LIST * > *	choices,
		int	choices_pos,
		int	choices_length,
		const GenericVector< UNICHAR_ID > &	target_text,
		int	text_index,
		float	rating,
		GenericVector< int > *	segmentation,
		float *	best_rating,
		GenericVector< int > *	best_segmentation
	)

Recursive helper to find a match to the target_text (from text_index position) in the choices (from choices_pos position).

Parameters

choices	is an array of GenericVectors, of length choices_length, with each element representing a starting position in the word, and the GenericVector holding classification results for a sequence of consecutive blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
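choices_pos	is the index in choices at which matching starts.
choices_length	is the number of entries in choices.
target_text	is the sequence of unichar ids to be matched.
text_index	is the index in target_text of the unichar to match at choices_pos.
rating	is the sum of the ratings of the choices matched so far.
segmentation	holds the blob-group lengths chosen so far; it is extended and restored as the search proceeds.
best_rating	receives the rating of the best complete match found.
best_segmentation	receives the segmentation of the best complete match found.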

Definition at line 629 of file applybox.cpp.

                                                                      {
   // Tries every blob-group length available at choices_pos. A length is
   // accepted when its classifier results contain target_text[text_index],
   // either directly or through a 1-1 entry of the dang_ambigs() table.
   // An accepted length is appended to segmentation; the search then either
   // records a complete match or recurses on the position that follows.
   // The appended length is removed again before the next length is tried.
   const UnicharAmbigsVector& table = getDict().getUnicharAmbigs().dang_ambigs();
   for (int length = 1; length <= choices[choices_pos].size(); ++length) {
     // Rating of matching choice or worst choice if no match.
     float choice_rating = 0.0f;
     // Find the corresponding best BLOB_CHOICE.
     BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
     for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
          choice_it.forward()) {
       BLOB_CHOICE* choice = choice_it.data();
       choice_rating = choice->rating();
       UNICHAR_ID class_id = choice->unichar_id();
       if (class_id == target_text[text_index]) {
         // Direct hit: leave choice_it on the matching choice.
         break;
       }
       // Search ambigs table.
       if (class_id < table.size() && table[class_id] != NULL) {
         AmbigSpec_IT spec_it(table[class_id]);
         for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
              spec_it.forward()) {
           const AmbigSpec *ambig_spec = spec_it.data();
           // We'll only do 1-1.
           if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
               ambig_spec->correct_ngram_id == target_text[text_index])
             break;
         }
         // The inner loop only stops before cycling when it broke out on a
         // matching ambiguity.
         if (!spec_it.cycled_list())
           break;  // Found an ambig.
       }
     }
     // Running off the end of the choice list means nothing matched.
     if (choice_it.cycled_list())
       continue;  // No match.
     segmentation->push_back(length);
     if (choices_pos + length == choices_length &&
         text_index + 1 == target_text.size()) {
       // This is a complete match. If the rating is good record a new best.
       if (applybox_debug > 2) {
         tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
                 rating + choice_rating, *best_rating, segmentation->size(),
                 best_segmentation->size());
       }
       // Lower total rating wins; the first complete match is always taken.
       if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
         *best_segmentation = *segmentation;
         *best_rating = rating + choice_rating;
       }
     } else if (choices_pos + length < choices_length &&
                text_index + 1 < target_text.size()) {
       // Both blobs and text remain, so extend the partial match recursively.
       // If only one of the two is exhausted the candidate is dropped.
       if (applybox_debug > 3) {
         tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
                 target_text[text_index],
                 unicharset.id_to_unichar(target_text[text_index]),
                 choice_it.data()->unichar_id() == target_text[text_index]
                      ? "Match" : "Ambig",
                 choices_pos, length);
       }
       SearchForText(choices, choices_pos + length, choices_length, target_text,
                     text_index + 1, rating + choice_rating, segmentation,
                     best_rating, best_segmentation);
       if (applybox_debug > 3) {
         tprintf("End recursion for %d=%s\n", target_text[text_index],
                 unicharset.id_to_unichar(target_text[text_index]));
       }
     }
     // Undo the push_back above before trying the next length.
     segmentation->truncate(segmentation->size() - 1);
   }
 }

◆ SearchWords()

void tesseract::Tesseract::SearchWords ( PointerVector< WERD_RES > * words )

Definition at line 253 of file linerec.cpp.

                                                           {
   // Run the segmentation search on the network outputs and make a BoxWord
   // for each of the output words.
   // If we drop a word as junk, then there is always a space in front of the
   // next.
   // Acceptance is judged by the LSTM recognizer's dictionary when it has
   // one, otherwise by this instance's own dictionary.
   const Dict* stopper_dict = lstm_recognizer_->GetDict();
   if (stopper_dict == nullptr) stopper_dict = &getDict();
   // First pass: find out whether any word that already has a best choice
   // contains non-space-delimited content. The flag is used below to keep
   // low-certainty words.
   bool any_nonspace_delimited = false;
   for (int w = 0; w < words->size(); ++w) {
     WERD_RES* word = (*words)[w];
     if (word->best_choice != nullptr &&
         word->best_choice->ContainsAnyNonSpaceDelimited()) {
       any_nonspace_delimited = true;
       break;
     }
   }
   // Second pass: finish each word or replace it with a fake one.
   for (int w = 0; w < words->size(); ++w) {
     WERD_RES* word = (*words)[w];
     if (word->best_choice == NULL) {
       // If we are using the beam search, the unicharset had better match!
       word->SetupWordScript(unicharset);
       WordSearch(word);
     } else if (word->best_choice->unicharset() == &unicharset &&
                !lstm_recognizer_->IsRecoding()) {
       // We set up the word without using the dictionary, so set the permuter
       // now, but we can only do it because the unicharsets match.
       word->best_choice->set_permuter(
           getDict().valid_word(*word->best_choice, true));
     }
     // best_choice is tested again because WordSearch may or may not have
     // produced one.
     if (word->best_choice == NULL) {
       // It is a dud.
       word->SetupFake(lstm_recognizer_->GetUnicharset());
     } else {
       // Set the best state.
       for (int i = 0; i < word->best_choice->length(); ++i) {
         int length = word->best_choice->state(i);
         word->best_state.push_back(length);
       }
       word->reject_map.initialise(word->best_choice->length());
       word->tess_failed = false;
       word->tess_accepted = true;
       word->tess_would_adapt = false;
       word->done = true;
       word->tesseract = this;
       // The word certainty is the lower of the space certainty and the
       // best-choice certainty, scaled by kCertaintyScale.
       float word_certainty = MIN(word->space_certainty,
                                  word->best_choice->certainty());
       word_certainty *= kCertaintyScale;
       // Arbitrary ding factor for non-dictionary words.
       if (!lstm_recognizer_->IsRecoding() &&
           !Dict::valid_word_permuter(word->best_choice->permuter(), true))
         word_certainty -= kNonDictionaryPenalty;
       if (getDict().stopper_debug_level >= 1) {
         tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
                 word->best_choice->certainty(), word->space_certainty,
                 MIN(word->space_certainty, word->best_choice->certainty()) *
                     kCertaintyScale,
                 word_certainty);
         word->best_choice->print();
       }
       word->best_choice->set_certainty(word_certainty);
       // Discard words that are impossibly bad, but allow a bit more for
       // dictionary words, and keep bad words in non-space-delimited langs.
       if (word_certainty >= RecodeBeamSearch::kMinCertainty ||
           any_nonspace_delimited ||
           (word_certainty >= kWorstDictCertainty &&
            Dict::valid_word_permuter(word->best_choice->permuter(), true))) {
         // Overrides the provisional tess_accepted = true set above.
         word->tess_accepted = stopper_dict->AcceptableResult(word);
       } else {
         if (getDict().stopper_debug_level >= 1) {
           tprintf("Deleting word with certainty %g\n", word_certainty);
           word->best_choice->print();
         }
         // It is a dud.
         word->SetupFake(lstm_recognizer_->GetUnicharset());
       }
     }
   }
 }

◆ SegmentPage()

int tesseract::Tesseract::SegmentPage	(	const STRING *	input_file,
		BLOCK_LIST *	blocks,
		Tesseract *	osd_tess,
		OSResults *	osr
	)

Segment the page according to the current value of tessedit_pageseg_mode. pix_binary_ is used as the source image and should not be NULL. On return the blocks list owns all the constructed page layout.

Definition at line 103 of file pagesegmain.cpp.

                                                                 {
   // Builds the page layout in blocks from pix_binary_.
   // Returns -1 if AutoPageSeg reported a negative value, 0 for an empty
   // page, and otherwise the value returned by AutoPageSeg (0 when
   // AutoPageSeg was not run).
   ASSERT_HOST(pix_binary_ != NULL);
   int width = pixGetWidth(pix_binary_);
   int height = pixGetHeight(pix_binary_);
   // Get page segmentation mode.
   PageSegMode pageseg_mode = static_cast<PageSegMode>(
       static_cast<int>(tessedit_pageseg_mode));
   // If a UNLV zone file can be found, use that instead of segmentation.
   if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
       input_file != NULL && input_file->length() > 0) {
     STRING name = *input_file;
     // Cut the name at its last '.' before handing it to read_unlv_file.
     const char* lastdot = strrchr(name.string(), '.');
     if (lastdot != NULL)
       name[lastdot - name.string()] = '\0';
     read_unlv_file(name, width, height, blocks);
   }
   if (blocks->empty()) {
     // No UNLV file present. Work according to the PageSegMode.
     // First make a single block covering the whole image.
     BLOCK_IT block_it(blocks);
     BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
     block->set_right_to_left(right_to_left());
     block_it.add_to_end(block);
   } else {
     // UNLV file present. Use PSM_SINGLE_BLOCK.
     pageseg_mode = PSM_SINGLE_BLOCK;
   }
   // The diacritic_blobs holds noise blobs that may be diacritics. They
   // are separated out on areas of the image that seem noisy and short-circuit
   // the layout process, going straight from the initial partition creation
   // right through to after word segmentation, where they are added to the
   // rej_cblobs list of the most appropriate word. From there classification
   // will determine whether they are used.
   BLOBNBOX_LIST diacritic_blobs;
   int auto_page_seg_ret_val = 0;
   TO_BLOCK_LIST to_blocks;
   if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
       PSM_SPARSE(pageseg_mode)) {
     // diacritic_blobs is only passed on when enable_noise_removal is set.
     auto_page_seg_ret_val = AutoPageSeg(
         pageseg_mode, blocks, &to_blocks,
         enable_noise_removal ? &diacritic_blobs : NULL, osd_tess, osr);
     // In OSD-only mode the AutoPageSeg result is returned without running
     // TextordPage.
     if (pageseg_mode == PSM_OSD_ONLY)
       return auto_page_seg_ret_val;
     // To create blobs from the image region bounds uncomment this line:
     //  to_blocks.clear();  // Uncomment to go back to the old mode.
   } else {
     // No automatic segmentation: deskew_ and reskew_ are both set to (1, 0).
     deskew_ = FCOORD(1.0f, 0.0f);
     reskew_ = FCOORD(1.0f, 0.0f);
     if (pageseg_mode == PSM_CIRCLE_WORD) {
       // Swap pix_binary_ for the cleaned image when one is produced.
       Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
       if (pixcleaned != NULL) {
         pixDestroy(&pix_binary_);
         pix_binary_ = pixcleaned;
       }
     }
   }
 
   if (auto_page_seg_ret_val < 0) {
     return -1;
   }
 
   if (blocks->empty()) {
     if (textord_debug_tabfind)
       tprintf("Empty page\n");
     return 0;  // AutoPageSeg found an empty page.
   }
   bool splitting =
       pageseg_devanagari_split_strategy != ShiroRekhaSplitter::NO_SPLIT;
   bool cjk_mode = textord_use_cjk_fp_model;
 
   textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
                        pix_thresholds_, pix_grey_, splitting || cjk_mode,
                        &diacritic_blobs, blocks, &to_blocks);
   return auto_page_seg_ret_val;
 }

◆ SelectGoodDiacriticOutlines()

bool tesseract::Tesseract::SelectGoodDiacriticOutlines	(	int	pass,
		float	certainty_threshold,
		PAGE_RES_IT *	pr_it,
		C_BLOB *	blob,
		const GenericVector< C_OUTLINE *> &	outlines,
		int	num_outlines,
		GenericVector< bool > *	ok_outlines
	)

Definition at line 1122 of file control.cpp.

                                       {
   STRING best_str;
   float target_cert = certainty_threshold;
   if (blob != NULL) {
     float target_c2;
     target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
     if (debug_noise_removal) {
       tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(),
               target_cert, target_c2);
       blob->bounding_box().print();
     }
     target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
   }
   GenericVector<bool> test_outlines = *ok_outlines;
   // Start with all the outlines in.
   STRING all_str;
   GenericVector<bool> best_outlines = *ok_outlines;
   float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
                                              pr_it, blob, &all_str);
   if (debug_noise_removal) {
     TBOX ol_box;
     for (int i = 0; i < test_outlines.size(); ++i) {
       if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
     }
     tprintf("All Noise blob classified as %s=%g, delta=%g at:",
             all_str.string(), best_cert, best_cert - target_cert);
     ol_box.print();
   }
   // Iteratively zero out the bit that improves the certainty the most, until
   // we get past the threshold, have zero bits, or fail to improve.
   int best_index = 0;  // To zero out.
   while (num_outlines > 1 && best_index >= 0 &&
          (blob == NULL || best_cert < target_cert || blob != NULL)) {
     // Find the best bit to zero out.
     best_index = -1;
     for (int i = 0; i < outlines.size(); ++i) {
       if (test_outlines[i]) {
         test_outlines[i] = false;
         STRING str;
         float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
                                               pr_it, blob, &str);
         if (debug_noise_removal) {
           TBOX ol_box;
           for (int j = 0; j < outlines.size(); ++j) {
             if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
             tprintf("%d", test_outlines[j]);
           }
           tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(),
                   cert, cert - target_cert);
           ol_box.print();
         }
         if (cert > best_cert) {
           best_cert = cert;
           best_index = i;
           best_outlines = test_outlines;
         }
         test_outlines[i] = true;
       }
     }
     if (best_index >= 0) {
       test_outlines[best_index] = false;
       --num_outlines;
     }
   }
   if (best_cert >= target_cert) {
     // Save the best combination.
     *ok_outlines = best_outlines;
     if (debug_noise_removal) {
       tprintf("%s noise combination ", blob ? "Adding" : "New");
       for (int i = 0; i < best_outlines.size(); ++i) {
         tprintf("%d", best_outlines[i]);
       }
       tprintf(" yields certainty %g, beating target of %g\n", best_cert,
               target_cert);
     }
     return true;
   }
   return false;
 }

◆ set_done()

void tesseract::Tesseract::set_done	(	WERD_RES *	word,
		inT16	pass
	)

◆ set_pix_grey()

void tesseract::Tesseract::set_pix_grey ( Pix * grey_pix )

inline

Definition at line 195 of file tesseractclass.h.

                                    {
     // Destroy the image currently held in pix_grey_, then store grey_pix
     // as given (the pointer is stored without cloning).
     pixDestroy(&pix_grey_);
     pix_grey_ = grey_pix;
   }

◆ set_pix_original()

void tesseract::Tesseract::set_pix_original ( Pix * original_pix )

inline

Definition at line 201 of file tesseractclass.h.

                                            {
     pixDestroy(&pix_original_);
     pix_original_ = original_pix;
     // Clone to sublangs as well.
     for (int i = 0; i < sub_langs_.size(); ++i)
       sub_langs_[i]->set_pix_original(original_pix ? pixClone(original_pix)
                                                    : nullptr);
   }

◆ set_pix_thresholds()

void tesseract::Tesseract::set_pix_thresholds ( Pix * thresholds )

inline

Definition at line 217 of file tesseractclass.h.

                                            {
     // Destroy the image currently held in pix_thresholds_, then store
     // thresholds as given (the pointer is stored without cloning).
     pixDestroy(&pix_thresholds_);
     pix_thresholds_ = thresholds;
   }

◆ set_source_resolution()

void tesseract::Tesseract::set_source_resolution ( int ppi )

inline

Definition at line 224 of file tesseractclass.h.

                                       {
     // Stores ppi in source_resolution_ without any validation.
     source_resolution_ = ppi;
   }

◆ set_unlv_suspects()

void tesseract::Tesseract::set_unlv_suspects ( WERD_RES * word_res )

Definition at line 305 of file output.cpp.

                                                     {
   // Marks rejected characters of word_res as accepted (through
   // setrej_minimal_rej_accept) according to suspect_level:
   //   0     - every rejected character is accepted.
   //   >= 3  - nothing is changed.
   //   1, 2  - characters are accepted by the rules below; level 2 stops
   //           before the 1/I/l and acceptable-string rules.
   int len = word_res->reject_map.length();
   const WERD_CHOICE &word = *(word_res->best_choice);
   const UNICHARSET &uchset = *word.unicharset();
   int i;
   float rating_per_ch;
 
   if (suspect_level == 0) {
     for (i = 0; i < len; i++) {
       if (word_res->reject_map[i].rejected())
         word_res->reject_map[i].setrej_minimal_rej_accept();
     }
     return;
   }
 
   if (suspect_level >= 3)
     return;                      //Use defaults
 
   /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
 
   if (safe_dict_word(word_res) &&
       (count_alphas(word) > suspect_short_words)) {
     /* Unreject alphas in dictionary words */
     for (i = 0; i < len; ++i) {
       if (word_res->reject_map[i].rejected() &&
           uchset.get_isalpha(word.unichar_id(i)))
         word_res->reject_map[i].setrej_minimal_rej_accept();
     }
   }
 
   // NOTE(review): the divisor is the reject map length and is not checked
   // for zero here.
   rating_per_ch = word.rating() / word_res->reject_map.length();
 
   if (rating_per_ch >= suspect_rating_per_ch)
     return;  // Don't touch bad ratings
 
   if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
     /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
     for (i = 0; i < len; ++i) {
       if (word_res->reject_map[i].rejected() &&
           (!uchset.eq(word.unichar_id(i), " ")))
         word_res->reject_map[i].setrej_minimal_rej_accept();
     }
   }
 
   // Accept characters that were rejected at document, block or row level.
   for (i = 0; i < len; i++) {
     if (word_res->reject_map[i].rejected()) {
       if (word_res->reject_map[i].flag(R_DOC_REJ))
         word_res->reject_map[i].setrej_minimal_rej_accept();
       if (word_res->reject_map[i].flag(R_BLOCK_REJ))
         word_res->reject_map[i].setrej_minimal_rej_accept();
       if (word_res->reject_map[i].flag(R_ROW_REJ))
         word_res->reject_map[i].setrej_minimal_rej_accept();
     }
   }
 
   if (suspect_level == 2)
     return;
 
   // Level 1 only from here on.
   // Accept 1/I/l conflicts unless suspect_constrain_1Il restricts this to
   // words no longer than suspect_short_words.
   if (!suspect_constrain_1Il ||
       (word_res->reject_map.length() <= suspect_short_words)) {
     for (i = 0; i < len; i++) {
       if (word_res->reject_map[i].rejected()) {
         if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
           word_res->reject_map[i].flag(R_POSTNN_1IL)))
           word_res->reject_map[i].setrej_minimal_rej_accept();
 
         if (!suspect_constrain_1Il &&
           word_res->reject_map[i].flag(R_MM_REJECT))
           word_res->reject_map[i].setrej_minimal_rej_accept();
       }
     }
   }
 
   // For acceptable word or number strings longer than suspect_short_words,
   // accept rejections that are not permanent, plus those carrying one of the
   // three listed flags.
   if (acceptable_word_string(*word_res->uch_set,
                              word.unichar_string().string(),
                              word.unichar_lengths().string()) !=
                                  AC_UNACCEPTABLE ||
       acceptable_number_string(word.unichar_string().string(),
                                word.unichar_lengths().string())) {
     if (word_res->reject_map.length() > suspect_short_words) {
       for (i = 0; i < len; i++) {
         if (word_res->reject_map[i].rejected() &&
           (!word_res->reject_map[i].perm_rejected() ||
            word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
            word_res->reject_map[i].flag (R_POSTNN_1IL) ||
            word_res->reject_map[i].flag (R_MM_REJECT))) {
           word_res->reject_map[i].setrej_minimal_rej_accept();
         }
       }
     }
   }
 }

◆ set_word_fonts()

void tesseract::Tesseract::set_word_fonts ( WERD_RES * word )

set_word_fonts

Get the fonts for the word.

Definition at line 1907 of file control.cpp.

                                              {
   // Sums the per-font scores over all blob choices of the word, picks the
   // two fonts with the highest totals and stores them in word together with
   // their vote counts and the derived italic/bold values.
   // Don't try to set the word fonts for an lstm word, as the configs
   // will be meaningless.
   if (word->chopped_word == NULL) return;
   ASSERT_HOST(word->best_choice != NULL);
 
   int fontinfo_size = get_fontinfo_table().size();
   if (fontinfo_size == 0) return;
   // One accumulator per entry of the font table, all starting at 0.
   GenericVector<int> font_total_score;
   font_total_score.init_to_size(fontinfo_size, 0);
 
   word->italic = 0;
   word->bold = 0;
   // Compute the font scores for the word
   if (tessedit_debug_fonts) {
     tprintf("Examining fonts in %s\n",
             word->best_choice->debug_string().string());
   }
   for (int b = 0; b < word->best_choice->length(); ++b) {
     BLOB_CHOICE* choice = word->GetBlobChoice(b);
     if (choice == NULL) continue;
     const GenericVector<ScoredFont>& fonts = choice->fonts();
     for (int f = 0; f < fonts.size(); ++f) {
       int fontinfo_id = fonts[f].fontinfo_id;
       // Ids outside the font table are ignored.
       if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
         font_total_score[fontinfo_id] += fonts[f].score;
       }
     }
   }
   // Find the top and 2nd choice for the word.
   int score1 = 0, score2 = 0;
   inT16 font_id1 = -1, font_id2 = -1;
   for (int f = 0; f < fontinfo_size; ++f) {
     if (tessedit_debug_fonts && font_total_score[f] > 0) {
       tprintf("Font %s, total score = %d\n",
               fontinfo_table_.get(f).name, font_total_score[f]);
     }
     if (font_total_score[f] > score1) {
       // New leader: the previous leader becomes the runner-up.
       score2 = score1;
       font_id2 = font_id1;
       score1 = font_total_score[f];
       font_id1 = f;
     } else if (font_total_score[f] > score2) {
       score2 = font_total_score[f];
       font_id2 = f;
     }
   }
   // The ids stay at -1 when no font has a positive total.
   word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : NULL;
   word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL;
   // Each score has a limit of MAX_UINT16, so divide by that to get the number
   // of "votes" for that font, ie number of perfect scores.
   // NOTE(review): the first count is clipped with a lower bound of 1 and
   // the second with 0 - presumably ClipToRange(value, low, high); confirm.
   word->fontinfo_id_count = ClipToRange(score1 / MAX_UINT16, 1, MAX_INT8);
   word->fontinfo_id2_count = ClipToRange(score2 / MAX_UINT16, 0, MAX_INT8);
   if (score1 > 0) {
     FontInfo fi = fontinfo_table_.get(font_id1);
     if (tessedit_debug_fonts) {
       if (word->fontinfo_id2_count > 0) {
         tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
                 fi.name, word->fontinfo_id_count,
                 fontinfo_table_.get(font_id2).name,
                 word->fontinfo_id2_count);
       } else {
         tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
                 fi.name, word->fontinfo_id_count);
       }
     }
     // The sign gives the property of the top font and the magnitude is its
     // vote count.
     word->italic = (fi.is_italic() ? 1 : -1) * word->fontinfo_id_count;
     word->bold = (fi.is_bold() ? 1 : -1) * word->fontinfo_id_count;
   }
 }

◆ SetBlackAndWhitelist()

void tesseract::Tesseract::SetBlackAndWhitelist ( )

Definition at line 673 of file tesseractclass.cpp.

                                      {
   // Set the white and blacklists (if any)
   unicharset.set_black_and_whitelist(tessedit_char_blacklist.string(),
                                      tessedit_char_whitelist.string(),
                                      tessedit_char_unblacklist.string());
   // Black and white lists should apply to all loaded classifiers.
   for (int i = 0; i < sub_langs_.size(); ++i) {
     sub_langs_[i]->unicharset.set_black_and_whitelist(
         tessedit_char_blacklist.string(), tessedit_char_whitelist.string(),
         tessedit_char_unblacklist.string());
   }
 }

◆ SetEquationDetect()

void tesseract::Tesseract::SetEquationDetect ( EquationDetect * detector )

Definition at line 652 of file tesseractclass.cpp.

                                                           {
   equ_detect_ = detector;
   equ_detect_->SetLangTesseract(this);
 }

◆ SetScaledColor()

void tesseract::Tesseract::SetScaledColor	(	int	factor,
		Pix *	color
	)

inline

Definition at line 239 of file tesseractclass.h.

                                               {
     // Stores both values as given. Unlike set_pix_grey, any image already
     // held in scaled_color_ is not destroyed here.
     scaled_factor_ = factor;
     scaled_color_ = color;
   }

◆ SetupAllWordsPassN()

void tesseract::Tesseract::SetupAllWordsPassN	(	int	pass_n,
		const TBOX *	target_word_box,
		const char *	word_config,
		PAGE_RES *	page_res,
		GenericVector< WordData > *	words
	)

If tesseract is to be run, sets the words up ready for it.

Definition at line 151 of file control.cpp.

                                                                    {
   // Prepare all the words.
   PAGE_RES_IT page_res_it(page_res);
   for (page_res_it.restart_page(); page_res_it.word() != NULL;
        page_res_it.forward()) {
     if (target_word_box == NULL ||
         ProcessTargetWord(page_res_it.word()->word->bounding_box(),
                           *target_word_box, word_config, 1)) {
       words->push_back(WordData(page_res_it));
     }
   }
   // Setup all the words for recognition with polygonal approximation.
   for (int w = 0; w < words->size(); ++w) {
     SetupWordPassN(pass_n, &(*words)[w]);
     if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];
   }
 }

◆ SetupApplyBoxes()

PAGE_RES * tesseract::Tesseract::SetupApplyBoxes	(	const GenericVector< TBOX > &	boxes,
		BLOCK_LIST *	block_list
	)

Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: All fuzzy spaces are removed, and all the words are maximally chopped.

Definition at line 217 of file applybox.cpp.

                                                              {
   PreenXHeights(block_list);
   // Strip all fuzzy space markers to simplify the PAGE_RES.
   BLOCK_IT b_it(block_list);
   for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
     BLOCK* block = b_it.data();
     ROW_IT r_it(block->row_list());
     for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
       ROW* row = r_it.data();
       WERD_IT w_it(row->word_list());
       for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
         WERD* word = w_it.data();
         if (word->cblob_list()->empty()) {
           delete w_it.extract();
         } else {
           word->set_flag(W_FUZZY_SP, false);
           word->set_flag(W_FUZZY_NON, false);
         }
       }
     }
   }
   PAGE_RES* page_res = new PAGE_RES(false, block_list, NULL);
   PAGE_RES_IT pr_it(page_res);
   WERD_RES* word_res;
   while ((word_res = pr_it.word()) != NULL) {
     MaximallyChopWord(boxes, pr_it.block()->block,
                       pr_it.row()->row, word_res);
     pr_it.forward();
   }
   return page_res;
 }

◆ SetupPageSegAndDetectOrientation()

ColumnFinder * tesseract::Tesseract::SetupPageSegAndDetectOrientation	(	PageSegMode	pageseg_mode,
		BLOCK_LIST *	blocks,
		Tesseract *	osd_tess,
		OSResults *	osr,
		TO_BLOCK_LIST *	to_blocks,
		Pix **	photo_mask_pix,
		Pix **	music_mask_pix
	)

Sets up auto page segmentation, determines the orientation, and corrects it. Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to facilitate testing. photo_mask_pix is a pointer to a NULL pointer that will be filled on return with the leptonica photo mask, which must be pixDestroyed by the caller. to_blocks is an empty list that will be filled with (usually a single) block that is used during layout analysis. This ugly API is required because of the possibility of a unlv zone file. TODO(rays) clean this up. See AutoPageSeg for other arguments. The returned ColumnFinder must be deleted after use.

Definition at line 274 of file pagesegmain.cpp.

                           {
   // Removes lines and finds image regions in pix_binary_, extracts the
   // connected components and, when the block's line_size is at least 2,
   // creates a ColumnFinder and corrects the page orientation with it.
   // Returns the new ColumnFinder, or NULL when line_size is below 2 or the
   // mode is PSM_OSD_ONLY with OSD having been run.
   int vertical_x = 0;
   int vertical_y = 1;
   TabVector_LIST v_lines;
   TabVector_LIST h_lines;
   ICOORD bleft(0, 0);
 
   ASSERT_HOST(pix_binary_ != NULL);
   if (tessedit_dump_pageseg_images) {
     pixa_debug_.AddPix(pix_binary_, "PageSegInput");
   }
   // Leptonica is used to find the rule/separator lines in the input.
   LineFinder::FindAndRemoveLines(source_resolution_,
                                  textord_tabfind_show_vlines, pix_binary_,
                                  &vertical_x, &vertical_y, music_mask_pix,
                                  &v_lines, &h_lines);
   if (tessedit_dump_pageseg_images) {
     pixa_debug_.AddPix(pix_binary_, "NoLines");
   }
   // Leptonica is used to find a mask of the photo regions in the input.
   *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
   if (tessedit_dump_pageseg_images) {
     pixa_debug_.AddPix(pix_binary_, "NoImages");
   }
   // Vertical lines are discarded when column finding is disabled.
   if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear();
 
   // The rest of the algorithm uses the usual connected components.
   textord_.find_components(pix_binary_, blocks, to_blocks);
 
   TO_BLOCK_IT to_block_it(to_blocks);
   // There must be exactly one input block.
   // TODO(rays) handle new textline finding with a UNLV zone file.
   ASSERT_HOST(to_blocks->singleton());
   TO_BLOCK* to_block = to_block_it.data();
   TBOX blkbox = to_block->block->bounding_box();
   ColumnFinder* finder = NULL;
 
   // Blocks whose line_size is below 2 get no ColumnFinder at all.
   if (to_block->line_size >= 2) {
     finder = new ColumnFinder(static_cast<int>(to_block->line_size),
                               blkbox.botleft(), blkbox.topright(),
                               source_resolution_, textord_use_cjk_fp_model,
                               textord_tabfind_aligned_gap_fraction,
                               &v_lines, &h_lines, vertical_x, vertical_y);
 
     finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
 
     if (equ_detect_) {
       equ_detect_->LabelSpecialText(to_block);
     }
 
     BLOBNBOX_CLIST osd_blobs;
     // osd_orientation is the number of 90 degree rotations to make the
     // characters upright. (See osdetect.h for precise definition.)
     // We want the text lines horizontal, (vertical text indicates vertical
     // textlines) which may conflict (eg vertically written CJK).
     int osd_orientation = 0;
     bool vertical_text = textord_tabfind_force_vertical_text ||
                          pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
     if (!vertical_text && textord_tabfind_vertical_text &&
         PSM_ORIENTATION_ENABLED(pageseg_mode)) {
       vertical_text =
           finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
                                           to_block, &osd_blobs);
     }
     if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != NULL && osr != NULL) {
       GenericVector<int> osd_scripts;
       if (osd_tess != this) {
         // We are running osd as part of layout analysis, so constrain the
         // scripts to those allowed by *this.
         AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
         for (int s = 0; s < sub_langs_.size(); ++s) {
           AddAllScriptsConverted(sub_langs_[s]->unicharset,
                                  osd_tess->unicharset, &osd_scripts);
         }
       }
       os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
       if (pageseg_mode == PSM_OSD_ONLY) {
         // Only the OSD results in osr are wanted, so the finder is dropped.
         delete finder;
         return NULL;
       }
       osd_orientation = osr->best_result.orientation_id;
       double osd_score = osr->orientations[osd_orientation];
       // osd_margin becomes the smallest gap between the best orientation's
       // score and any other orientation's score, capped at twice
       // min_orientation_margin.
       double osd_margin = min_orientation_margin * 2;
       for (int i = 0; i < 4; ++i) {
         if (i != osd_orientation &&
             osd_score - osr->orientations[i] < osd_margin) {
           osd_margin = osd_score - osr->orientations[i];
         }
       }
       int best_script_id = osr->best_result.script_id;
       const char* best_script_str =
           osd_tess->unicharset.get_script_from_script_id(best_script_id);
       // The script counts as CJK if it matches one of the han, hiragana or
       // katakana script ids, or one of the three script names below.
       bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
           best_script_id == osd_tess->unicharset.hiragana_sid() ||
           best_script_id == osd_tess->unicharset.katakana_sid() ||
           strcmp("Japanese", best_script_str) == 0 ||
           strcmp("Korean", best_script_str) == 0 ||
           strcmp("Hangul", best_script_str) == 0;
       if (cjk) {
         finder->set_cjk_script(true);
       }
       if (osd_margin < min_orientation_margin) {
         // The margin is weak.
         if (!cjk && !vertical_text && osd_orientation == 2) {
           // upside down latin text is improbable with such a weak margin.
           tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
                   "Don't rotate.\n", osd_margin);
           osd_orientation = 0;
         } else {
           tprintf(
               "OSD: Weak margin (%.2f) for %d blob text block, "
               "but using orientation anyway: %d\n",
               osd_margin, osd_blobs.length(), osd_orientation);
         }
       }
     }
     osd_blobs.shallow_clear();
     finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
   }
 
   return finder;
 }

◆ SetupUniversalFontIds()

void tesseract::Tesseract::SetupUniversalFontIds ( )

Definition at line 436 of file tessedit.cpp.

                                       {
   // Note that we can get away with bitwise copying FontInfo in
   // all_fonts, as it is a temporary structure and we avoid setting the
   // delete callback.
   UnicityTable<FontInfo> all_fonts;
   all_fonts.set_compare_callback(NewPermanentTessCallback(CompareFontInfo));
 
   // Create the universal ID table.
   CollectFonts(get_fontinfo_table(), &all_fonts);
   for (int i = 0; i < sub_langs_.size(); ++i) {
     CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
   }
   // Assign ids from the table to each font table.
   AssignIds(all_fonts, &get_fontinfo_table());
   for (int i = 0; i < sub_langs_.size(); ++i) {
     AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
   }
   font_table_size_ = all_fonts.size();
 }

◆ SetupWordPassN()

void tesseract::Tesseract::SetupWordPassN	(	int	pass_n,
		WordData *	word
	)

Definition at line 174 of file control.cpp.

                                                          {
   if (pass_n == 1 || !word->word->done) {
     if (pass_n == 1) {
       word->word->SetupForRecognition(unicharset, this, BestPix(),
                                       tessedit_ocr_engine_mode, NULL,
                                       classify_bln_numeric_mode,
                                       textord_use_cjk_fp_model,
                                       poly_allow_detailed_fx,
                                       word->row, word->block);
     } else if (pass_n == 2) {
       // TODO(rays) Should we do this on pass1 too?
       word->word->caps_height = 0.0;
       if (word->word->x_height == 0.0f)
         word->word->x_height = word->row->x_height();
     }
     word->lang_words.truncate(0);
     for (int s = 0; s <= sub_langs_.size(); ++s) {
       // The sub_langs_.size() entry is for the master language.
       Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
       WERD_RES* word_res = new WERD_RES;
       word_res->InitForRetryRecognition(*word->word);
       word->lang_words.push_back(word_res);
       // LSTM doesn't get setup for pass2.
       if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
         word_res->SetupForRecognition(
               lang_t->unicharset, lang_t, BestPix(),
               lang_t->tessedit_ocr_engine_mode, NULL,
               lang_t->classify_bln_numeric_mode,
               lang_t->textord_use_cjk_fp_model,
               lang_t->poly_allow_detailed_fx, word->row, word->block);
       }
     }
   }
 }

◆ SetupWordScripts()

void tesseract::Tesseract::SetupWordScripts ( BLOCK_LIST * blocks )

◆ source_resolution()

int tesseract::Tesseract::source_resolution ( ) const

inline

Definition at line 221 of file tesseractclass.h.

                                 {
     // Accessor: returns the stored source_resolution_ member unchanged.
     return source_resolution_;
   }

◆ split_and_recog_word()

void tesseract::Tesseract::split_and_recog_word ( WERD_RES * word )

Definition at line 144 of file tfacepp.cpp.

                                                    {
   // Find the biggest blob gap in the chopped_word.
   int bestgap = -MAX_INT32;
   int split_index = 0;
   for (int b = 1; b < word->chopped_word->NumBlobs(); ++b) {
     TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
     TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
     int gap = blob_box.left() - prev_box.right();
     if (gap > bestgap) {
       bestgap = gap;
       split_index = b;
     }
   }
   ASSERT_HOST(split_index > 0);
 
   WERD_RES *word2 = NULL;
   BlamerBundle *orig_bb = NULL;
   split_word(word, split_index, &word2, &orig_bb);
 
   // Recognize the first part of the word.
   recog_word_recursive(word);
   // Recognize the second part of the word.
   recog_word_recursive(word2);
 
   join_words(word, word2, orig_bb);
 }

◆ split_word()

void tesseract::Tesseract::split_word	(	WERD_RES *	word,
		int	split_pt,
		WERD_RES **	right_piece,
		BlamerBundle **	orig_blamer_bundle
	)		const

Definition at line 182 of file tfacepp.cpp.

                                                                     {
   // Splits word in two at chopped-blob index split_pt. Blobs [0, split_pt)
   // stay in word; blobs [split_pt, NumBlobs) move to a newly allocated
   // WERD_RES returned through right_piece. A copy of the original blamer
   // bundle (or NULL if word had none) is returned through
   // orig_blamer_bundle.
   // The split point must leave at least one blob on each side.
   ASSERT_HOST(split_pt >0 && split_pt < word->chopped_word->NumBlobs());
 
   // Save a copy of the blamer bundle so we can try to reconstruct it below.
   BlamerBundle *orig_bb =
       word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : NULL;
 
   // word2 starts as a full copy of word; its chopped_word is replaced below.
   WERD_RES *word2 = new WERD_RES(*word);
 
   // blow away the copied chopped_word, as we want to work with
   // the blobs from the input chopped_word so seam_arrays can be merged.
   TWERD *chopped = word->chopped_word;
   TWERD *chopped2 = new TWERD;
   chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
   // Move the blob pointers from split_pt onwards into the right-hand piece.
   for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
     chopped2->blobs.push_back(chopped->blobs[i]);
   }
   chopped->blobs.truncate(split_pt);
   // Detach the chopped words before ClearResults() is called below.
   // NOTE(review): presumably so that ClearResults() cannot free them --
   // confirm against WERD_RES::ClearResults.
   word->chopped_word = NULL;
   delete word2->chopped_word;
   word2->chopped_word = NULL;
 
   // Take the unicharset reference before the results are cleared.
   const UNICHARSET &unicharset = *word->uch_set;
   word->ClearResults();
   word2->ClearResults();
   // Re-attach the (now split) chopped words and rebuild the basic setup.
   word->chopped_word = chopped;
   word2->chopped_word = chopped2;
   word->SetupBasicsFromChoppedWord(unicharset);
   word2->SetupBasicsFromChoppedWord(unicharset);
 
   // Try to adjust the blamer bundle.
   if (orig_bb != NULL) {
     // TODO(rays) Looks like a leak to me.
     // orig_bb should take, rather than copy.
     word->blamer_bundle = new BlamerBundle();
     word2->blamer_bundle = new BlamerBundle();
     // Split at the boundary between the right edge of the last left-hand
     // blob and the left edge of the first right-hand blob.
     orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
                          word2->chopped_word->blobs[0]->bounding_box().left(),
                          wordrec_debug_blamer,
                          word->blamer_bundle, word2->blamer_bundle);
   }
 
   // Hand the new piece and the saved bundle copy back to the caller.
   *right_piece = word2;
   *orig_blamer_bundle = orig_bb;
 }

◆ SubAndSuperscriptFix()

bool tesseract::Tesseract::SubAndSuperscriptFix ( WERD_RES * word )

Attempt to split off any high (or low) bits at the ends of the word with poor certainty and recognize them separately. If the certainty gets much better and other sanity checks pass, accept.

This superscript fix is meant to be called in the second pass of recognition when we have tried once and already have a preliminary answer for word.

Returns: Whether we modified the given word.

Definition at line 101 of file superscript.cpp.

                                                    {
   if (word->tess_failed || word->word->flag(W_REP_CHAR) ||
       !word->best_choice) {
     return false;
   }
   int num_leading, num_trailing;
   ScriptPos sp_leading, sp_trailing;
   float leading_certainty, trailing_certainty;
   float avg_certainty, unlikely_threshold;
 
   // Calculate the number of whole suspicious characters at the edges.
   GetSubAndSuperscriptCandidates(
           word, &num_leading, &sp_leading, &leading_certainty,
           &num_trailing, &sp_trailing, &trailing_certainty,
           &avg_certainty, &unlikely_threshold);
 
   const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
   const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
 
   int num_blobs = word->best_choice->length();
 
   // Calculate the remainder (partial characters) at the edges.
   // This accounts for us having classified the best version of
   // a word as [speaker?'] when it was instead [speaker.^{21}]
   // (that is we accidentally thought the 2 was attached to the period).
   int num_remainder_leading = 0, num_remainder_trailing = 0;
   if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
     int super_y_bottom =
         kBlnBaselineOffset + kBlnXHeight * superscript_min_y_bottom;
     int sub_y_top =
         kBlnBaselineOffset + kBlnXHeight * subscript_max_y_top;
     int last_word_char = num_blobs - 1 - num_trailing;
     float last_char_certainty = word->best_choice->certainty(last_word_char);
     if (word->best_choice->unichar_id(last_word_char) != 0 &&
         last_char_certainty <= unlikely_threshold) {
       ScriptPos rpos;
       YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top,
                      NULL, NULL, &rpos, &num_remainder_trailing);
       if (num_trailing > 0 && rpos != sp_trailing) num_remainder_trailing = 0;
       if (num_remainder_trailing > 0 &&
           last_char_certainty < trailing_certainty) {
         trailing_certainty = last_char_certainty;
       }
     }
     bool another_blob_available = (num_remainder_trailing == 0) ||
         num_leading + num_trailing + 1 < num_blobs;
     int first_char_certainty = word->best_choice->certainty(num_leading);
     if (another_blob_available &&
         word->best_choice->unichar_id(num_leading) != 0 &&
         first_char_certainty <= unlikely_threshold) {
       ScriptPos lpos;
       YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top,
                      &lpos, &num_remainder_leading, NULL, NULL);
       if (num_leading > 0 && lpos != sp_leading) num_remainder_leading = 0;
       if (num_remainder_leading > 0 &&
           first_char_certainty < leading_certainty) {
         leading_certainty = first_char_certainty;
       }
     }
   }
 
   // If nothing to do, bail now.
   if (num_leading + num_trailing +
       num_remainder_leading + num_remainder_trailing == 0) {
     return false;
   }
 
   if (superscript_debug >= 1) {
     tprintf("Candidate for superscript detection: %s (",
             word->best_choice->unichar_string().string());
     if (num_leading || num_remainder_leading) {
       tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading,
               leading_pos);
     }
     if (num_trailing || num_remainder_trailing) {
       tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing,
               trailing_pos);
     }
     tprintf(")\n");
   }
   if (superscript_debug >= 3) {
     word->best_choice->print();
   }
   if (superscript_debug >= 2) {
     tprintf(" Certainties -- Average: %.2f  Unlikely thresh: %.2f  ",
             avg_certainty, unlikely_threshold);
     if (num_leading)
       tprintf("Orig. leading (min): %.2f  ", leading_certainty);
     if (num_trailing)
       tprintf("Orig. trailing (min): %.2f  ", trailing_certainty);
     tprintf("\n");
   }
 
   // We've now calculated the number of rebuilt blobs we want to carve off.
   // However, split_word() works from TBLOBs in chopped_word, so we need to
   // convert to those.
   int num_chopped_leading =
       LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
   int num_chopped_trailing =
       TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
 
   int retry_leading = 0;
   int retry_trailing = 0;
   bool is_good = false;
   WERD_RES *revised = TrySuperscriptSplits(
       num_chopped_leading, leading_certainty, sp_leading,
       num_chopped_trailing, trailing_certainty, sp_trailing,
       word, &is_good, &retry_leading, &retry_trailing);
   if (is_good) {
     word->ConsumeWordResults(revised);
   } else if (retry_leading || retry_trailing) {
     int retry_chopped_leading =
         LeadingUnicharsToChopped(revised, retry_leading);
     int retry_chopped_trailing =
         TrailingUnicharsToChopped(revised, retry_trailing);
     WERD_RES *revised2 = TrySuperscriptSplits(
         retry_chopped_leading, leading_certainty, sp_leading,
         retry_chopped_trailing, trailing_certainty, sp_trailing,
         revised, &is_good, &retry_leading, &retry_trailing);
     if (is_good) {
       word->ConsumeWordResults(revised2);
     }
     delete revised2;
   }
   delete revised;
   return is_good;
 }

◆ terrible_word_crunch()

BOOL8 tesseract::Tesseract::terrible_word_crunch	(	WERD_RES *	word,
		GARBAGE_LEVEL	garbage_level
	)

Definition at line 508 of file docqual.cpp.

                                                                    {
   float rating_per_ch;
   int adjusted_len;
   int crunch_mode = 0;
 
   if ((word->best_choice->unichar_string().length () == 0) ||
     (strspn (word->best_choice->unichar_string().string(), " ") ==
     word->best_choice->unichar_string().unsigned_size ()))
     crunch_mode = 1;
   else {
     adjusted_len = word->reject_map.length ();
     if (adjusted_len > crunch_rating_max)
       adjusted_len = crunch_rating_max;
     rating_per_ch = word->best_choice->rating () / adjusted_len;
 
     if (rating_per_ch > crunch_terrible_rating)
       crunch_mode = 2;
     else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
       crunch_mode = 3;
     else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
       (garbage_level != G_OK))
       crunch_mode = 4;
     else if ((rating_per_ch > crunch_poor_garbage_rate) &&
       (garbage_level != G_OK))
       crunch_mode = 5;
   }
   if (crunch_mode > 0) {
     if (crunch_debug > 2) {
       tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
         crunch_mode, word->best_choice->unichar_string().string());
     }
     return TRUE;
   }
   else
     return FALSE;
 }

◆ tess_acceptable_word()

bool tesseract::Tesseract::tess_acceptable_word ( WERD_RES * word )

Definition at line 69 of file tessbox.cpp.

                                                      {
   // Delegates the decision to the dictionary's AcceptableResult().
   return getDict().AcceptableResult(word);
 }

◆ tess_add_doc_word()

void tesseract::Tesseract::tess_add_doc_word ( WERD_CHOICE * word_choice )

Definition at line 79 of file tessbox.cpp.

                                                             {
   // Forwards the word choice to the dictionary's add_document_word().
   getDict().add_document_word(*word_choice);
 }

◆ tess_segment_pass_n()

void tesseract::Tesseract::tess_segment_pass_n	(	int	pass_n,
		WERD_RES *	word
	)

Definition at line 39 of file tessbox.cpp.

                                                               {
   int saved_enable_assoc = 0;
   int saved_chop_enable = 0;
 
   if (word->word->flag(W_DONT_CHOP)) {
     saved_enable_assoc = wordrec_enable_assoc;
     saved_chop_enable = chop_enable;
     wordrec_enable_assoc.set_value(0);
     chop_enable.set_value(0);
   }
   if (pass_n == 1)
     set_pass1();
   else
     set_pass2();
   recog_word(word);
   if (word->best_choice == NULL)
     word->SetupFake(*word->uch_set);
   if (word->word->flag(W_DONT_CHOP)) {
     wordrec_enable_assoc.set_value(saved_enable_assoc);
     chop_enable.set_value(saved_chop_enable);
   }
 }

◆ TestNewNormalization()

bool tesseract::Tesseract::TestNewNormalization	(	int	original_misfits,
		float	baseline_shift,
		float	new_x_ht,
		WERD_RES *	word,
		BLOCK *	block,
		ROW *	row
	)

Definition at line 1468 of file control.cpp.

                                                                              {
   bool accept_new_x_ht = false;
   WERD_RES new_x_ht_word(word->word);
   if (word->blamer_bundle != NULL) {
     new_x_ht_word.blamer_bundle = new BlamerBundle();
     new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
   }
   new_x_ht_word.x_height = new_x_ht;
   new_x_ht_word.baseline_shift = baseline_shift;
   new_x_ht_word.caps_height = 0.0;
   new_x_ht_word.SetupForRecognition(
         unicharset, this, BestPix(), tessedit_ocr_engine_mode, NULL,
         classify_bln_numeric_mode, textord_use_cjk_fp_model,
       poly_allow_detailed_fx, row, block);
   match_word_pass_n(2, &new_x_ht_word, row, block);
   if (!new_x_ht_word.tess_failed) {
     int new_misfits = CountMisfitTops(&new_x_ht_word);
     if (debug_x_ht_level >= 1) {
       tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
               original_misfits, word->x_height,
               new_misfits, new_x_ht);
       tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
               word->best_choice->rating(), word->best_choice->certainty(),
               new_x_ht_word.best_choice->rating(),
               new_x_ht_word.best_choice->certainty());
     }
     // The misfits must improve and either the rating or certainty.
     accept_new_x_ht = new_misfits < original_misfits &&
                       (new_x_ht_word.best_choice->certainty() >
                           word->best_choice->certainty() ||
                        new_x_ht_word.best_choice->rating() <
                           word->best_choice->rating());
     if (debug_x_ht_level >= 1) {
       ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
     }
   }
   if (accept_new_x_ht) {
     word->ConsumeWordResults(&new_x_ht_word);
     return true;
   }
   return false;
 }

◆ textord()

const Textord& tesseract::Tesseract::textord ( ) const

inline

Definition at line 243 of file tesseractclass.h.

                                  {
     // Read-only accessor for the textord_ member.
     return textord_;
   }

◆ TidyUp()

void tesseract::Tesseract::TidyUp ( PAGE_RES * page_res )

Counts up the labelled words and the blobs within.
Deletes all unused or emptied words, counting the unused ones.
Resets W_BOL and W_EOL flags correctly.
Builds the rebuild_word and rebuilds the box_word and the best_choice.

Definition at line 706 of file applybox.cpp.

                                          {
   int ok_blob_count = 0;
   int bad_blob_count = 0;
   int ok_word_count = 0;
   int unlabelled_words = 0;
   PAGE_RES_IT pr_it(page_res);
   WERD_RES* word_res;
   for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
     int ok_in_word = 0;
     int blob_count = word_res->correct_text.size();
     WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
     word_choice->set_permuter(TOP_CHOICE_PERM);
     for (int c = 0; c < blob_count; ++c) {
       if (word_res->correct_text[c].length() > 0) {
         ++ok_in_word;
       }
       // Since we only need a fake word_res->best_choice, the actual
       // unichar_ids do not matter. Which is fortunate, since TidyUp()
       // can be called while training Tesseract, at the stage where
       // unicharset is not meaningful yet.
       word_choice->append_unichar_id_space_allocated(
           INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
     }
     if (ok_in_word > 0) {
       ok_blob_count += ok_in_word;
       bad_blob_count += word_res->correct_text.size() - ok_in_word;
       word_res->LogNewRawChoice(word_choice);
       word_res->LogNewCookedChoice(1, false, word_choice);
     } else {
       ++unlabelled_words;
       if (applybox_debug > 0) {
         tprintf("APPLY_BOXES: Unlabelled word at :");
         word_res->word->bounding_box().print();
       }
       pr_it.DeleteCurrentWord();
       delete word_choice;
     }
   }
   pr_it.restart_page();
   for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
     // Denormalize back to a BoxWord.
     word_res->RebuildBestState();
     word_res->SetupBoxWord();
     word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
     word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
   }
   if (applybox_debug > 0) {
     tprintf("   Found %d good blobs.\n", ok_blob_count);
     if (bad_blob_count > 0) {
       tprintf("   Leaving %d unlabelled blobs in %d words.\n",
               bad_blob_count, ok_word_count);
     }
     if (unlabelled_words > 0)
       tprintf("   %d remaining unlabelled words deleted.\n", unlabelled_words);
   }
 }

◆ tilde_crunch()

void tesseract::Tesseract::tilde_crunch ( PAGE_RES_IT & page_res_it )

Definition at line 422 of file docqual.cpp.

                                                       {
   // Walks every word on the page and sets unlv_crunch_mode = CR_KEEP_SPACE
   // on words judged to be garbage. "Terrible" words are always marked;
   // "potential" words are marked only when they are adjacent (in reading
   // order, with no accepted/clean word in between) to a terrible word.
   WERD_RES *word;
   GARBAGE_LEVEL garbage_level;
   // Remembers the first word of the current run of potential crunches.
   PAGE_RES_IT copy_it;
   // TRUE while copy_it marks the start of an unresolved potential run.
   BOOL8 prev_potential_marked = FALSE;
   // TRUE while the most recent non-potential word was a terrible one.
   BOOL8 found_terrible_word = FALSE;
   BOOL8 ok_dict_word;
 
   page_res_it.restart_page();
   while (page_res_it.word() != NULL) {
     // Skip words in blocks that have a poly_block which is not text.
     POLY_BLOCK* pb = page_res_it.block()->block->poly_block();
     if (pb != NULL && !pb->IsText()) {
       page_res_it.forward();
       continue;
     }
     word = page_res_it.word();
 
     if (crunch_early_convert_bad_unlv_chs)
       convert_bad_unlv_chs(word);
 
     if (crunch_early_merge_tess_fails)
       word->merge_tess_fails();
 
     if (word->reject_map.accept_count () != 0) {
       // The word has at least one accepted character: it ends any run.
       found_terrible_word = FALSE;
                                  //Forget earlier potential crunches
       prev_potential_marked = FALSE;
     }
     else {
       ok_dict_word = safe_dict_word(word);
       garbage_level = garbage_word (word, ok_dict_word);
 
       if ((garbage_level != G_NEVER_CRUNCH) &&
       (terrible_word_crunch (word, garbage_level))) {
         if (crunch_debug > 0) {
           tprintf ("T CRUNCHING: \"%s\"\n",
             word->best_choice->unichar_string().string());
         }
         word->unlv_crunch_mode = CR_KEEP_SPACE;
         if (prev_potential_marked) {
           // A terrible word follows a run of potential words: crunch the
           // whole run, from the remembered start up to (not including)
           // the current word.
           while (copy_it.word () != word) {
             if (crunch_debug > 0) {
               tprintf ("P1 CRUNCHING: \"%s\"\n",
                 copy_it.word()->best_choice->unichar_string().string());
             }
             copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
             copy_it.forward ();
           }
           prev_potential_marked = FALSE;
         }
         found_terrible_word = TRUE;
       }
       else if ((garbage_level != G_NEVER_CRUNCH) &&
         (potential_word_crunch (word,
       garbage_level, ok_dict_word))) {
         if (found_terrible_word) {
           // A potential word directly after a terrible one is crunched.
           if (crunch_debug > 0) {
             tprintf ("P2 CRUNCHING: \"%s\"\n",
               word->best_choice->unichar_string().string());
           }
           word->unlv_crunch_mode = CR_KEEP_SPACE;
         }
         else if (!prev_potential_marked) {
           // Start of a new potential run: remember where it begins in
           // case a terrible word turns up later.
           copy_it = page_res_it;
           prev_potential_marked = TRUE;
           if (crunch_debug > 1) {
             tprintf ("P3 CRUNCHING: \"%s\"\n",
               word->best_choice->unichar_string().string());
           }
         }
       }
       else {
         // Neither terrible nor potential: the word ends any run.
         found_terrible_word = FALSE;
                                  //Forget earlier potential crunches
         prev_potential_marked = FALSE;
         if (crunch_debug > 2) {
           tprintf ("NO CRUNCH: \"%s\"\n",
             word->best_choice->unichar_string().string());
         }
       }
     }
     page_res_it.forward ();
   }
 }

◆ tilde_delete()

void tesseract::Tesseract::tilde_delete ( PAGE_RES_IT & page_res_it )

Definition at line 594 of file docqual.cpp.

                                                       {
   // Walks every word on the page and sets unlv_crunch_mode on deletable
   // words that form a run touching the beginning of a line (W_BOL) or the
   // end of a line (W_EOL). Deletable words in the middle of a line are
   // only remembered, and are marked if the run later reaches a W_EOL word.
   WERD_RES *word;
   // Remembers the first word of the current mid-line deletable run.
   PAGE_RES_IT copy_it;
   // TRUE while an unbroken deletable run that started at a W_BOL word
   // is being extended.
   BOOL8 deleting_from_bol = FALSE;
   // TRUE while copy_it marks the start of an unresolved mid-line run.
   BOOL8 marked_delete_point = FALSE;
   // Passed to word_deletable() as a second argument and printed in the
   // debug output below.
   inT16 debug_delete_mode;
   CRUNCH_MODE delete_mode;
   inT16 x_debug_delete_mode;
   CRUNCH_MODE x_delete_mode;
 
   page_res_it.restart_page();
   while (page_res_it.word() != NULL) {
     word = page_res_it.word();
 
     delete_mode = word_deletable (word, debug_delete_mode);
     if (delete_mode != CR_NONE) {
       if (word->word->flag (W_BOL) || deleting_from_bol) {
         // Deletable word at, or contiguous with, the start of the line.
         if (crunch_debug > 0) {
           tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
             debug_delete_mode,
             word->best_choice->unichar_string().string());
         }
         word->unlv_crunch_mode = delete_mode;
         deleting_from_bol = TRUE;
       } else if (word->word->flag(W_EOL)) {
         // Deletable word at the end of the line: first mark the
         // remembered mid-line run leading up to it, if there is one.
         if (marked_delete_point) {
           while (copy_it.word() != word) {
             x_delete_mode = word_deletable (copy_it.word (),
               x_debug_delete_mode);
             if (crunch_debug > 0) {
               tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
                 x_debug_delete_mode,
                 copy_it.word()->best_choice->unichar_string().string());
             }
             copy_it.word ()->unlv_crunch_mode = x_delete_mode;
             copy_it.forward ();
           }
         }
         if (crunch_debug > 0) {
           tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
             debug_delete_mode,
             word->best_choice->unichar_string().string());
         }
         word->unlv_crunch_mode = delete_mode;
         deleting_from_bol = FALSE;
         marked_delete_point = FALSE;
       }
       else {
         // Deletable word in the middle of a line: remember the start of
         // the run but do not mark anything yet.
         if (!marked_delete_point) {
           copy_it = page_res_it;
           marked_delete_point = TRUE;
         }
       }
     }
     else {
       // A non-deletable word breaks any run in progress.
       deleting_from_bol = FALSE;
                                  //Forget earlier potential crunches
       marked_delete_point = FALSE;
     }
     /*
       The following step has been left till now as the tess fails are used to
       determine if the word is deletable.
     */
     if (!crunch_early_merge_tess_fails)
       word->merge_tess_fails();
     page_res_it.forward ();
   }
 }

◆ TrainedXheightFix()

bool tesseract::Tesseract::TrainedXheightFix	(	WERD_RES *	word,
		BLOCK *	block,
		ROW *	row
	)

Definition at line 1434 of file control.cpp.

                                                                         {
   int original_misfits = CountMisfitTops(word);
   if (original_misfits == 0)
     return false;
   float baseline_shift = 0.0f;
   float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
   if (baseline_shift != 0.0f) {
     // Try the shift on its own first.
     if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height,
                               word, block, row))
       return false;
     original_misfits = CountMisfitTops(word);
     if (original_misfits > 0) {
       float new_baseline_shift;
       // Now recompute the new x_height.
       new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
       if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
         // No test of return value here, as we are definitely making a change
         // to the word by shifting the baseline.
         TestNewNormalization(original_misfits, baseline_shift, new_x_ht,
                              word, block, row);
       }
     }
     return true;
   } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
     return TestNewNormalization(original_misfits, 0.0f, new_x_ht,
                                 word, block, row);
   } else {
     return false;
   }
 }

◆ TrainFromBoxes()

void tesseract::Tesseract::TrainFromBoxes	(	const GenericVector< TBOX > &	boxes,
		const GenericVector< STRING > &	texts,
		BLOCK_LIST *	block_list,
		DocumentData *	training_data
	)

Definition at line 76 of file linerec.cpp.

                                                             {
   int box_count = boxes.size();
   // Process all the text lines in this page, as defined by the boxes.
   int end_box = 0;
   // Don't let \t, which marks newlines in the box file, get into the line
   // content, as that makes the line unusable in training.
   while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
   for (int start_box = end_box; start_box < box_count; start_box = end_box) {
     // Find the textline of boxes starting at start and their bounding box.
     TBOX line_box = boxes[start_box];
     STRING line_str = texts[start_box];
     for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t";
          ++end_box) {
       line_box += boxes[end_box];
       line_str += texts[end_box];
     }
     // Find the most overlapping block.
     BLOCK* best_block = NULL;
     int best_overlap = 0;
     BLOCK_IT b_it(block_list);
     for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
       BLOCK* block = b_it.data();
       if (block->poly_block() != NULL && !block->poly_block()->IsText())
         continue;  // Not a text block.
       TBOX block_box = block->bounding_box();
       block_box.rotate(block->re_rotation());
       if (block_box.major_overlap(line_box)) {
         TBOX overlap_box = line_box.intersection(block_box);
         if (overlap_box.area() > best_overlap) {
           best_overlap = overlap_box.area();
           best_block = block;
         }
       }
     }
     ImageData* imagedata = NULL;
     if (best_block == NULL) {
       tprintf("No block overlapping textline: %s\n", line_str.string());
     } else {
       imagedata = GetLineData(line_box, boxes, texts, start_box, end_box,
                               *best_block);
     }
     if (imagedata != NULL)
       training_data->AddPageToDocument(imagedata);
     // Don't let \t, which marks newlines in the box file, get into the line
     // content, as that makes the line unusable in training.
     while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
   }
 }

◆ TrainLineRecognizer()

void tesseract::Tesseract::TrainLineRecognizer	(	const STRING &	input_imagename,
		const STRING &	output_basename,
		BLOCK_LIST *	block_list
	)

Definition at line 45 of file linerec.cpp.

                                                             {
   STRING lstmf_name = output_basename + ".lstmf";
   DocumentData images(lstmf_name);
   if (applybox_page > 0) {
     // Load existing document for the previous pages.
     if (!images.LoadDocument(lstmf_name.string(), 0, 0, nullptr)) {
       tprintf("Failed to read training data from %s!\n", lstmf_name.string());
       return;
     }
   }
   GenericVector<TBOX> boxes;
   GenericVector<STRING> texts;
   // Get the boxes for this page, if there are any.
   if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, NULL,
                     NULL) ||
       boxes.empty()) {
     tprintf("Failed to read boxes from %s\n", input_imagename.string());
     return;
   }
   TrainFromBoxes(boxes, texts, block_list, &images);
   images.Shuffle();
   if (!images.SaveDocument(lstmf_name.string(), NULL)) {
     tprintf("Failed to write training data to %s!\n", lstmf_name.string());
   }
 }

◆ TrySuperscriptSplits()

WERD_RES * tesseract::Tesseract::TrySuperscriptSplits	(	int	num_chopped_leading,
		float	leading_certainty,
		ScriptPos	leading_pos,
		int	num_chopped_trailing,
		float	trailing_certainty,
		ScriptPos	trailing_pos,
		WERD_RES *	word,
		bool *	is_good,
		int *	retry_rebuild_leading,
		int *	retry_rebuild_trailing
	)

Try splitting off the given number of (chopped) blobs from the front and back of the given word and recognizing the pieces.

Parameters

[in]	num_chopped_leading	how many chopped blobs from the left end of the word to chop off and try recognizing as a superscript (or subscript)
[in]	leading_certainty	the (minimum) certainty had by the characters in the original leading section.
[in]	leading_pos	"super" or "sub" (for debugging)
[in]	num_chopped_trailing	how many chopped blobs from the right end of the word to chop off and try recognizing as a superscript (or subscript)
[in]	trailing_certainty	the (minimum) certainty had by the characters in the original trailing section.
[in]	trailing_pos	"super" or "sub" (for debugging)
[in]	word	the word to try to chop up.
[out]	is_good	do we believe our result?
[out]	retry_rebuild_leading,retry_rebuild_trailing	If non-zero, and !is_good, then the caller may have luck trying to split the returned word with this number of (rebuilt) leading and trailing blobs / unichars.

Returns: A word which is the result of re-recognizing as asked.

Definition at line 382 of file superscript.cpp.

                                                              {
   int num_chopped = word->chopped_word->NumBlobs();
 
   *retry_rebuild_leading = *retry_rebuild_trailing = 0;
 
   // Chop apart the word into up to three pieces.
 
   BlamerBundle *bb0 = NULL;
   BlamerBundle *bb1 = NULL;
   WERD_RES *prefix = NULL;
   WERD_RES *core = NULL;
   WERD_RES *suffix = NULL;
   if (num_chopped_leading > 0) {
     prefix = new WERD_RES(*word);
     split_word(prefix, num_chopped_leading, &core, &bb0);
   } else {
     core = new WERD_RES(*word);
   }
 
   if (num_chopped_trailing > 0) {
     int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
     split_word(core, split_pt, &suffix, &bb1);
   }
 
   //  Recognize the pieces in turn.
   int saved_cp_multiplier = classify_class_pruner_multiplier;
   int saved_im_multiplier = classify_integer_matcher_multiplier;
   if (prefix) {
     // Turn off Tesseract's y-position penalties for the leading superscript.
     classify_class_pruner_multiplier.set_value(0);
     classify_integer_matcher_multiplier.set_value(0);
 
     // Adjust our expectations about the baseline for this prefix.
     if (superscript_debug >= 3) {
       tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
     }
     recog_word_recursive(prefix);
     if (superscript_debug >= 2) {
       tprintf(" The leading bits look like %s %s\n",
               ScriptPosToString(leading_pos),
               prefix->best_choice->unichar_string().string());
     }
 
     // Restore the normal y-position penalties.
     classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
     classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
   }
 
   if (superscript_debug >= 3) {
     tprintf(" recognizing middle %d chopped blobs\n",
             num_chopped - num_chopped_leading - num_chopped_trailing);
   }
 
   if (suffix) {
     // Turn off Tesseract's y-position penalties for the trailing superscript.
     classify_class_pruner_multiplier.set_value(0);
     classify_integer_matcher_multiplier.set_value(0);
 
     if (superscript_debug >= 3) {
       tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
     }
     recog_word_recursive(suffix);
     if (superscript_debug >= 2) {
       tprintf(" The trailing bits look like %s %s\n",
               ScriptPosToString(trailing_pos),
               suffix->best_choice->unichar_string().string());
     }
 
     // Restore the normal y-position penalties.
     classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
     classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
   }
 
   // Evaluate whether we think the results are believably better
   // than what we already had.
   bool good_prefix = !prefix || BelievableSuperscript(
       superscript_debug >= 1, *prefix,
       superscript_bettered_certainty * leading_certainty,
       retry_rebuild_leading, NULL);
   bool good_suffix = !suffix || BelievableSuperscript(
       superscript_debug >= 1, *suffix,
       superscript_bettered_certainty * trailing_certainty,
       NULL, retry_rebuild_trailing);
 
   *is_good = good_prefix && good_suffix;
   if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
     // None of it is any good. Quit now.
     delete core;
     delete prefix;
     delete suffix;
     return NULL;
   }
   recog_word_recursive(core);
 
   // Now paste the results together into core.
   if (suffix) {
     suffix->SetAllScriptPositions(trailing_pos);
     join_words(core, suffix, bb1);
   }
   if (prefix) {
     prefix->SetAllScriptPositions(leading_pos);
     join_words(prefix, core, bb0);
     core = prefix;
     prefix = NULL;
   }
 
   if (superscript_debug >= 1) {
     tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
             core->best_choice->unichar_string().string());
   }
   return core;
 }

◆ unrej_good_chs()

void tesseract::Tesseract::unrej_good_chs	(	WERD_RES *	word,
		ROW *	row
	)

Definition at line 120 of file docqual.cpp.

                                                        {
   if (word->bln_boxes == NULL ||
       word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
     return;
 
   DocQualCallbacks cb(word);
   word->bln_boxes->ProcessMatchedBlobs(
       *word->rebuild_word,
       NewPermanentTessCallback(&cb, &DocQualCallbacks::AcceptIfGoodQuality));
 }

◆ unrej_good_quality_words()

void tesseract::Tesseract::unrej_good_quality_words ( PAGE_RES_IT & page_res_it )

Definition at line 165 of file docqual.cpp.

                                                                    {
   WERD_RES *word;
   ROW_RES *current_row;
   BLOCK_RES *current_block;
   int i;
 
   page_res_it.restart_page ();
   while (page_res_it.word () != NULL) {
     check_debug_pt (page_res_it.word (), 100);
     if (bland_unrej) {
       word = page_res_it.word ();
       for (i = 0; i < word->reject_map.length (); i++) {
         if (word->reject_map[i].accept_if_good_quality ())
           word->reject_map[i].setrej_quality_accept ();
       }
       page_res_it.forward ();
     }
     else if ((page_res_it.row ()->char_count > 0) &&
       ((page_res_it.row ()->rej_count /
       (float) page_res_it.row ()->char_count) <=
     quality_rowrej_pc)) {
       word = page_res_it.word ();
       if (word->reject_map.quality_recoverable_rejects() &&
           (tessedit_unrej_any_wd ||
            acceptable_word_string(*word->uch_set,
                                   word->best_choice->unichar_string().string(),
                                   word->best_choice->unichar_lengths().string())
                != AC_UNACCEPTABLE)) {
         unrej_good_chs(word, page_res_it.row ()->row);
       }
       page_res_it.forward ();
     }
     else {
       /* Skip to end of dodgy row */
       current_row = page_res_it.row ();
       while ((page_res_it.word () != NULL) &&
         (page_res_it.row () == current_row))
         page_res_it.forward ();
     }
     check_debug_pt (page_res_it.word (), 110);
   }
   page_res_it.restart_page ();
   page_res_it.page_res->char_count = 0;
   page_res_it.page_res->rej_count = 0;
   current_block = NULL;
   current_row = NULL;
   while (page_res_it.word () != NULL) {
     if (current_block != page_res_it.block ()) {
       current_block = page_res_it.block ();
       current_block->char_count = 0;
       current_block->rej_count = 0;
     }
     if (current_row != page_res_it.row ()) {
       current_row = page_res_it.row ();
       current_row->char_count = 0;
       current_row->rej_count = 0;
       current_row->whole_word_rej_count = 0;
     }
     page_res_it.rej_stat_word ();
     page_res_it.forward ();
   }
 }

◆ word_adaptable()

BOOL8 tesseract::Tesseract::word_adaptable	(	WERD_RES *	word,
		uinT16	mode
	)

Definition at line 45 of file adaptions.cpp.

                                              {
   if (tessedit_adaption_debug) {
     tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
           word->best_choice == NULL ? "" :
           word->best_choice->unichar_string().string(),
           word->best_choice->rating(), word->best_choice->certainty());
   }
 
   BOOL8 status = FALSE;
   BITS16 flags(mode);
 
   enum MODES
   {
     ADAPTABLE_WERD,
     ACCEPTABLE_WERD,
     CHECK_DAWGS,
     CHECK_SPACES,
     CHECK_ONE_ELL_CONFLICT,
     CHECK_AMBIG_WERD
   };
 
   /*
   0: NO adaption
   */
   if (mode == 0) {
     if (tessedit_adaption_debug) tprintf("adaption disabled\n");
     return FALSE;
   }
 
   if (flags.bit (ADAPTABLE_WERD)) {
     status |= word->tess_would_adapt;  // result of Classify::AdaptableWord()
     if (tessedit_adaption_debug && !status) {
       tprintf("tess_would_adapt bit is false\n");
     }
   }
 
   if (flags.bit (ACCEPTABLE_WERD)) {
     status |= word->tess_accepted;
     if (tessedit_adaption_debug && !status) {
       tprintf("tess_accepted bit is false\n");
     }
   }
 
   if (!status) {                  // If not set then
     return FALSE;                // ignore other checks
   }
 
   if (flags.bit (CHECK_DAWGS) &&
     (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
     (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
     (word->best_choice->permuter () != USER_DAWG_PERM) &&
     (word->best_choice->permuter () != NUMBER_PERM)) {
     if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
     return FALSE;
   }
 
   if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, FALSE)) {
     if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
     return FALSE;
   }
 
   if (flags.bit (CHECK_SPACES) &&
     (strchr(word->best_choice->unichar_string().string(), ' ') != NULL)) {
     if (tessedit_adaption_debug) tprintf("word contains spaces\n");
     return FALSE;
   }
 
   if (flags.bit (CHECK_AMBIG_WERD) &&
       word->best_choice->dangerous_ambig_found()) {
     if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
     return FALSE;
   }
 
   if (tessedit_adaption_debug) {
     tprintf("returning status %d\n", status);
   }
   return status;
 }

◆ word_blank_and_set_display()

BOOL8 tesseract::Tesseract::word_blank_and_set_display ( PAGE_RES_IT * pr_it )

Definition at line 716 of file pgedit.cpp.

                                                                 {
   // Plot the current word's bounding box on image_win with both colour
   // arguments set to BLACK.
   pr_it->word()->word->bounding_box().plot(image_win, ScrollView::BLACK,
                                            ScrollView::BLACK);
   // Then hand the word to word_set_display() and return its result.
   return word_set_display(pr_it);
 }

◆ word_bln_display()

BOOL8 tesseract::Tesseract::word_bln_display ( PAGE_RES_IT * pr_it )

word_bln_display()

Normalize word and display in word window

Definition at line 728 of file pgedit.cpp.

                                                     {
   WERD_RES* word_res = pr_it->word();
   if (word_res->chopped_word == NULL) {
     // Setup word normalization parameters.
     word_res->SetupForRecognition(unicharset, this, BestPix(),
                                   tessedit_ocr_engine_mode, NULL,
                                   classify_bln_numeric_mode,
                                   textord_use_cjk_fp_model,
                                   poly_allow_detailed_fx,
                                   pr_it->row()->row, pr_it->block()->block);
   }
   bln_word_window_handle()->Clear();
   display_bln_lines(bln_word_window_handle(), ScrollView::CYAN,
                      1.0, 0.0f, -1000.0f, 1000.0f);
   C_BLOB_IT it(word_res->word->cblob_list());
   ScrollView::Color color = WERD::NextColor(ScrollView::BLACK);
   for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
     it.data()->plot_normed(word_res->denorm, color, ScrollView::BROWN,
                            bln_word_window_handle());
     color = WERD::NextColor(color);
   }
   bln_word_window_handle()->Update();
   return TRUE;
 }

◆ word_blob_quality()

inT16 tesseract::Tesseract::word_blob_quality	(	WERD_RES *	word,
		ROW *	row
	)

Definition at line 65 of file docqual.cpp.

                                                            {
   if (word->bln_boxes == NULL ||
       word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
     return 0;
 
   DocQualCallbacks cb(word);
   word->bln_boxes->ProcessMatchedBlobs(
       *word->rebuild_word,
       NewPermanentTessCallback(&cb, &DocQualCallbacks::CountMatchingBlobs));
   return cb.match_count;
 }

◆ word_char_quality()

void tesseract::Tesseract::word_char_quality	(	WERD_RES *	word,
		ROW *	row,
		inT16 *	match_count,
		inT16 *	accepted_match_count
	)

Definition at line 97 of file docqual.cpp.

                                                                {
   if (word->bln_boxes == NULL || word->rebuild_word == NULL ||
       word->rebuild_word->blobs.empty()) {
     *match_count = 0;
     *accepted_match_count = 0;
     return;
   }
 
   DocQualCallbacks cb(word);
   word->bln_boxes->ProcessMatchedBlobs(
       *word->rebuild_word,
       NewPermanentTessCallback(&cb, &DocQualCallbacks::CountAcceptedBlobs));
   *match_count = cb.match_count;
   *accepted_match_count = cb.accepted_match_count;
 }

◆ word_contains_non_1_digit()

BOOL8 tesseract::Tesseract::word_contains_non_1_digit	(	const char *	word,
		const char *	word_lengths
	)

Definition at line 509 of file reject.cpp.

                                                                      {
   inT16 i;
   inT16 offset;
 
   for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
     if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
         (word_lengths[i] != 1 || word[offset] != '1'))
       return TRUE;
   }
   return FALSE;
 }

◆ word_deletable()

CRUNCH_MODE tesseract::Tesseract::word_deletable	(	WERD_RES *	word,
		inT16 &	delete_mode
	)

Definition at line 899 of file docqual.cpp.

                                                                         {
   int word_len = word->reject_map.length ();
   float rating_per_ch;
   TBOX box;                       //BB of word
 
   if (word->unlv_crunch_mode == CR_NONE) {
     delete_mode = 0;
     return CR_NONE;
   }
 
   if (word_len == 0) {
     delete_mode = 1;
     return CR_DELETE;
   }
 
   if (word->rebuild_word != NULL) {
     // Cube leaves rebuild_word NULL.
     box = word->rebuild_word->bounding_box();
     if (box.height () < crunch_del_min_ht * kBlnXHeight) {
       delete_mode = 4;
       return CR_DELETE;
     }
 
     if (noise_outlines(word->rebuild_word)) {
       delete_mode = 5;
       return CR_DELETE;
     }
   }
 
   if ((failure_count (word) * 1.5) > word_len) {
     delete_mode = 2;
     return CR_LOOSE_SPACE;
   }
 
   if (word->best_choice->certainty () < crunch_del_cert) {
     delete_mode = 7;
     return CR_LOOSE_SPACE;
   }
 
   rating_per_ch = word->best_choice->rating () / word_len;
 
   if (rating_per_ch > crunch_del_rating) {
     delete_mode = 8;
     return CR_LOOSE_SPACE;
   }
 
   if (box.top () < kBlnBaselineOffset - crunch_del_low_word * kBlnXHeight) {
     delete_mode = 9;
     return CR_LOOSE_SPACE;
   }
 
   if (box.bottom () >
   kBlnBaselineOffset + crunch_del_high_word * kBlnXHeight) {
     delete_mode = 10;
     return CR_LOOSE_SPACE;
   }
 
   if (box.height () > crunch_del_max_ht * kBlnXHeight) {
     delete_mode = 11;
     return CR_LOOSE_SPACE;
   }
 
   if (box.width () < crunch_del_min_width * kBlnXHeight) {
     delete_mode = 3;
     return CR_LOOSE_SPACE;
   }
 
   delete_mode = 0;
   return CR_NONE;
 }

◆ word_display()

BOOL8 tesseract::Tesseract::word_display ( PAGE_RES_IT * pr_it )

word_display() Word Processor

Display a word according to its display modes

Definition at line 760 of file pgedit.cpp.

                                                   {
   // Draws the current word of pr_it on image_win.
   //
   // When color_mode is not CM_RAINBOW and the word has a box_word, every
   // blob box is outlined in GREEN, or in RED when the attribute selected by
   // color_mode applies to it; that path returns false if the word has no
   // fontinfo and true otherwise.  In all other cases the word is drawn
   // according to its display flags and TRUE is returned.
   WERD_RES* word_res = pr_it->word();
   WERD* word = word_res->word;
   TBOX word_bb;                   // word bounding box
   int word_height;               // ht of word BB
   BOOL8 displayed_something = FALSE;
   float shift;                   // from bot left
   C_BLOB_IT c_it;                // cblob iterator
 
   if (color_mode != CM_RAINBOW && word_res->box_word != NULL) {
     BoxWord* box_word = word_res->box_word;
     WERD_CHOICE* best_choice = word_res->best_choice;
     int length = box_word->length();
     // Without font information nothing is drawn in this mode.
     if (word_res->fontinfo == NULL) return false;
     const FontInfo& font_info = *word_res->fontinfo;
     for (int i = 0; i < length; ++i) {
       // GREEN unless the attribute picked by color_mode holds.
       ScrollView::Color color = ScrollView::GREEN;
       switch (color_mode) {
         case CM_SUBSCRIPT:
           if (best_choice->BlobPosition(i) == SP_SUBSCRIPT)
             color = ScrollView::RED;
           break;
         case CM_SUPERSCRIPT:
           if (best_choice->BlobPosition(i) == SP_SUPERSCRIPT)
             color = ScrollView::RED;
           break;
         case CM_ITALIC:
           if (font_info.is_italic())
             color = ScrollView::RED;
           break;
         case CM_BOLD:
           if (font_info.is_bold())
             color = ScrollView::RED;
           break;
         case CM_FIXEDPITCH:
           if (font_info.is_fixed_pitch())
             color = ScrollView::RED;
           break;
         case CM_SERIF:
           if (font_info.is_serif())
             color = ScrollView::RED;
           break;
         case CM_SMALLCAPS:
           if (word_res->small_caps)
             color = ScrollView::RED;
           break;
         case CM_DROPCAPS:
           if (best_choice->BlobPosition(i) == SP_DROPCAP)
             color = ScrollView::RED;
           break;
           // TODO(rays) underline is currently completely unsupported.
         case CM_UNDERLINE:
         default:
           break;
       }
       image_win->Pen(color);
       TBOX box = box_word->BlobBox(i);
       image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
     }
     return true;
   }
   /*
     Note the double coercions of(COLOUR)((inT32)editor_image_word_bb_color)
     etc. are to keep the compiler happy.
   */
                                  // display bounding box
   if (word->display_flag(DF_BOX)) {
     word->bounding_box().plot(image_win,
      (ScrollView::Color)((inT32)
       editor_image_word_bb_color),
      (ScrollView::Color)((inT32)
       editor_image_word_bb_color));
 
     // Then the box of every blob in the word, in the blob box colour.
     ScrollView::Color c = (ScrollView::Color)
        ((inT32) editor_image_blob_bb_color);
     image_win->Pen(c);
     c_it.set_to_list(word->cblob_list());
     for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward())
       c_it.data()->bounding_box().plot(image_win);
     displayed_something = TRUE;
   }
 
                                  // display edge steps
   if (word->display_flag(DF_EDGE_STEP)) {     // edgesteps available
     word->plot(image_win);      // rainbow colors
     displayed_something = TRUE;
   }
 
                                  // display poly approx
   if (word->display_flag(DF_POLYGONAL)) {
                                  // need to convert
     // The polygonal copy is temporary: plotted, then deleted right away.
     TWERD* tword = TWERD::PolygonalCopy(poly_allow_detailed_fx, word);
     tword->plot(image_win);
     delete tword;
     displayed_something = TRUE;
   }
 
   // Display correct text and blamer information.
   STRING text;
   STRING blame;
   if (word->display_flag(DF_TEXT) && word->text() != NULL) {
     text = word->text();
   }
   // Blamer output is produced unless a bundle exists and says the result
   // was correct.  It replaces any text set above with
   // "<truth or NULL> -> <best choice or NULL>" and fills `blame` with the
   // reason name in brackets.
   if (word->display_flag(DF_BLAMER) &&
       !(word_res->blamer_bundle != NULL &&
         word_res->blamer_bundle->incorrect_result_reason() == IRR_CORRECT)) {
     text = "";
     const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
     if (blamer_bundle == NULL) {
       text += "NULL";
     } else {
       text = blamer_bundle->TruthString();
     }
     text += " -> ";
     STRING best_choice_str;
     if (word_res->best_choice == NULL) {
       best_choice_str = "NULL";
     } else {
       word_res->best_choice->string_and_lengths(&best_choice_str, NULL);
     }
     text += best_choice_str;
     // A missing bundle is reported as IRR_PAGE_LAYOUT.
     IncorrectResultReason reason = (blamer_bundle == NULL) ?
         IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason();
     ASSERT_HOST(reason < IRR_NUM_REASONS)
     blame += " [";
     blame += BlamerBundle::IncorrectReasonName(reason);
     blame += "]";
   }
   if (text.length() > 0) {
     word_bb = word->bounding_box();
     image_win->Pen(ScrollView::RED);
     word_height = word_bb.height();
     // Text height is half the word height, capped at 20.
     int text_height = 0.50 * word_height;
     if (text_height > 20) text_height = 20;
     image_win->TextAttributes("Arial", text_height, false, false, false);
     // Shift right by a quarter of the height only when the box is wider
     // than it is tall.
     shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
     image_win->Text(word_bb.left() + shift,
                     word_bb.bottom() + 0.25 * word_height, text.string());
     if (blame.length() > 0) {
       // The blame string goes one text_height below the text.
       image_win->Text(word_bb.left() + shift,
                       word_bb.bottom() + 0.25 * word_height - text_height,
                       blame.string());
     }
 
     displayed_something = TRUE;
   }
 
   if (!displayed_something)      // display BBox anyway
     word->bounding_box().plot(image_win,
      (ScrollView::Color)((inT32) editor_image_word_bb_color),
      (ScrollView::Color)((inT32)
       editor_image_word_bb_color));
   return TRUE;
 }

◆ word_dumper()

BOOL8 tesseract::Tesseract::word_dumper ( PAGE_RES_IT * pr_it )

word_dumper()

Dump members to the debug window

Definition at line 921 of file pgedit.cpp.

                                                {
   if (pr_it->block()->block != NULL) {
     tprintf("\nBlock data...\n");
     pr_it->block()->block->print(NULL, FALSE);
   }
   tprintf("\nRow data...\n");
   pr_it->row()->row->print(NULL);
   tprintf("\nWord data...\n");
   WERD_RES* word_res = pr_it->word();
   word_res->word->print();
   if (word_res->blamer_bundle != NULL && wordrec_debug_blamer &&
       word_res->blamer_bundle->incorrect_result_reason() != IRR_CORRECT) {
     tprintf("Current blamer debug: %s\n",
             word_res->blamer_bundle->debug().string());
   }
   return TRUE;
 }

◆ word_outline_errs()

inT16 tesseract::Tesseract::word_outline_errs ( WERD_RES * word )

Definition at line 77 of file docqual.cpp.

                                                  {
   inT16 i = 0;
   inT16 err_count = 0;
 
   if (word->rebuild_word != NULL) {
     for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
       TBLOB* blob = word->rebuild_word->blobs[b];
       err_count += count_outline_errs(word->best_choice->unichar_string()[i],
                                       blob->NumOutlines());
       i++;
     }
   }
   return err_count;
 }

◆ word_set_display()

BOOL8 tesseract::Tesseract::word_set_display ( PAGE_RES_IT * pr_it )

word_set_display() Word processor

Display word according to current display mode settings

Definition at line 945 of file pgedit.cpp.

                                                     {
   WERD* word = pr_it->word()->word;
   word->set_display_flag(DF_BOX, word_display_mode.bit(DF_BOX));
   word->set_display_flag(DF_TEXT, word_display_mode.bit(DF_TEXT));
   word->set_display_flag(DF_POLYGONAL, word_display_mode.bit(DF_POLYGONAL));
   word->set_display_flag(DF_EDGE_STEP, word_display_mode.bit(DF_EDGE_STEP));
   word->set_display_flag(DF_BN_POLYGONAL,
     word_display_mode.bit(DF_BN_POLYGONAL));
   word->set_display_flag(DF_BLAMER, word_display_mode.bit(DF_BLAMER));
   return word_display(pr_it);
 }

◆ worst_noise_blob()

inT16 tesseract::Tesseract::worst_noise_blob	(	WERD_RES *	word_res,
		float *	worst_noise_score
	)

Definition at line 680 of file fixspace.cpp.

                                                             {
   // Returns the index of the blob with the lowest noise score, considering
   // only blobs that have at least fixsp_non_noise_limit non-noise blobs on
   // each side, and stores that score in *worst_noise_score.  Returns -1 when
   // there is no rebuild_word, fewer than 5 blobs, not enough non-noise blobs
   // at either end, or no score below small_limit.  *worst_noise_score is
   // only written once all of the early -1 returns have been passed.
   float noise_score[512];
   int i;
   int min_noise_blob;            // 1st contender
   int max_noise_blob;            // last contender
   int non_noise_count;
   int worst_noise_blob;          // Worst blob
   float small_limit = kBlnXHeight * fixsp_small_outlines_size;
   float non_noise_limit = kBlnXHeight * 0.8;
 
   if (word_res->rebuild_word == NULL)
     return -1;  // Can't handle cube words.
 
   // Normalised.
   int blob_count = word_res->box_word->length();
   // noise_score has room for 512 entries only.
   ASSERT_HOST(blob_count <= 512);
   if (blob_count < 5)
     return -1;                   // too short to split
 
   /* Get the noise scores for all blobs */
 
   // NOTE(review): only this first tprintf is compiled out by SECURE_NAMES;
   // the score and newline prints below are not - confirm that is intended.
   #ifndef SECURE_NAMES
   if (debug_fix_space_level > 5)
     tprintf("FP fixspace Noise metrics for \"%s\": ",
             word_res->best_choice->unichar_string().string());
   #endif
 
   // Accepted blobs are pinned at non_noise_limit so they always count as
   // non-noise; the rest get their score from blob_noise_score().
   // NOTE(review): this loop stops at rebuild_word->NumBlobs(), but the
   // loops below read noise_score[] up to blob_count.  If the two counts can
   // differ, the extra entries are never written - confirm they are always
   // equal.
   for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
     TBLOB* blob = word_res->rebuild_word->blobs[i];
     if (word_res->reject_map[i].accepted())
       noise_score[i] = non_noise_limit;
     else
       noise_score[i] = blob_noise_score(blob);
 
     if (debug_fix_space_level > 5)
       tprintf("%1.1f ", noise_score[i]);
   }
   if (debug_fix_space_level > 5)
     tprintf("\n");
 
   /* Now find the worst one which is far enough away from the end of the word */
 
   // Scan from the left until fixsp_non_noise_limit non-noise blobs have
   // been seen; i is then one past the last of them.
   non_noise_count = 0;
   for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
     if (noise_score[i] >= non_noise_limit) {
       non_noise_count++;
     }
   }
   if (non_noise_count < fixsp_non_noise_limit)
     return -1;
 
   min_noise_blob = i;
 
   // Same scan from the right; i ends one before the last non-noise blob
   // counted.
   non_noise_count = 0;
   for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
        i--) {
     if (noise_score[i] >= non_noise_limit) {
       non_noise_count++;
     }
   }
   if (non_noise_count < fixsp_non_noise_limit)
     return -1;
 
   max_noise_blob = i;
 
   // The two protected end regions meet or overlap: no candidates remain.
   if (min_noise_blob > max_noise_blob)
     return -1;
 
   // Pick the lowest score in [min_noise_blob, max_noise_blob] that is
   // strictly below small_limit; -1 when none is.
   *worst_noise_score = small_limit;
   worst_noise_blob = -1;
   for (i = min_noise_blob; i <= max_noise_blob; i++) {
     if (noise_score[i] < *worst_noise_score) {
       worst_noise_blob = i;
       *worst_noise_score = noise_score[i];
     }
   }
   return worst_noise_blob;
 }

◆ write_results()

void tesseract::Tesseract::write_results	(	PAGE_RES_IT &	page_res_it,
		char	newline_type,
		BOOL8	force_eol
	)

Definition at line 130 of file output.cpp.

                                                {  // override tilde crunch?
   // Updates the tilde/newline bookkeeping in stats_ for the current word of
   // page_res_it and adjusts the word's reject_map according to the
   // rejection-mode settings.  Nothing visible in this block emits text;
   // only stats_ and the word are modified.
   WERD_RES *word = page_res_it.word();
   const UNICHARSET &uchset = *word->uch_set;
   int i;
   BOOL8 need_reject = FALSE;
   UNICHAR_ID space = uchset.unichar_to_id(" ");
 
   // Crunched path: the word is marked for crunching or has an empty best
   // choice, and neither zero-kelvin rejection nor word-for-word mode is on.
   // Only the stats_ flags change, then the function returns.
   if ((word->unlv_crunch_mode != CR_NONE ||
        word->best_choice->length() == 0) &&
       !tessedit_zero_kelvin_rejection && !tessedit_word_for_word) {
     // A reject is needed for a non-deleted word when no tilde crunch has
     // been written yet, or when it is a CR_KEEP_SPACE word preceded by a
     // definite (non-fuzzy) space.
     if ((word->unlv_crunch_mode != CR_DELETE) &&
         (!stats_.tilde_crunch_written ||
          ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
           (word->word->space () > 0) &&
           !word->word->flag (W_FUZZY_NON) &&
           !word->word->flag (W_FUZZY_SP)))) {
       // A definite space before a word that is not at the beginning of a
       // line breaks the run of tildes.
       if (!word->word->flag (W_BOL) &&
           (word->word->space () > 0) &&
           !word->word->flag (W_FUZZY_NON) &&
           !word->word->flag (W_FUZZY_SP)) {
         stats_.last_char_was_tilde = false;
       }
       need_reject = TRUE;
     }
     if ((need_reject && !stats_.last_char_was_tilde) ||
         (force_eol && stats_.write_results_empty_block)) {
       /* Write a reject char - mark as rejected unless zero_rejection mode */
       stats_.last_char_was_tilde = TRUE;
       stats_.tilde_crunch_written = true;
       stats_.last_char_was_newline = false;
       stats_.write_results_empty_block = false;
     }
 
     // End of line (or a forced one): reset the tilde state.
     if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
       stats_.tilde_crunch_written = false;
       stats_.last_char_was_newline = true;
       stats_.last_char_was_tilde = false;
     }
 
     if (force_eol)
       stats_.write_results_empty_block = true;
     return;
   }
 
   /* NORMAL PROCESSING of non tilde crunched words */
 
   stats_.tilde_crunch_written = false;
   if (newline_type)
     stats_.last_char_was_newline = true;
   else
     stats_.last_char_was_newline = false;
   stats_.write_results_empty_block = force_eol;  // about to write a real word
 
   // The previous word ended in a tilde, this one starts with a space
   // unichar and no gap separates them.
   if (unlv_tilde_crunching &&
       stats_.last_char_was_tilde &&
       (word->word->space() == 0) &&
       !(word->word->flag(W_REP_CHAR) && tessedit_write_rep_codes) &&
       (word->best_choice->unichar_id(0) == space)) {
     /* Prevent adjacent tilde across words - we know that adjacent tildes within
        words have been removed */
     word->MergeAdjacentBlobs(0);
   }
   // Record whether this word ends in a tilde, i.e. whether its last unichar
   // is the space id.
   if (newline_type ||
     (word->word->flag (W_REP_CHAR) && tessedit_write_rep_codes))
     stats_.last_char_was_tilde = false;
   else {
     if (word->reject_map.length () > 0) {
       if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
         stats_.last_char_was_tilde = true;
       else
         stats_.last_char_was_tilde = false;
     }
     else if (word->word->space () > 0)
       stats_.last_char_was_tilde = false;
     /* else it is unchanged as there are no output chars */
   }
 
   // The loops below index reject_map by best_choice position, so the two
   // lengths must agree.
   ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
 
   set_unlv_suspects(word);
   check_debug_pt (word, 120);
   if (tessedit_rejection_debug) {
     tprintf ("Dict word: \"%s\": %d\n",
              word->best_choice->debug_string().string(),
              dict_word(*(word->best_choice)));
   }
   if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
     if (tessedit_zero_rejection) {
       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
       // Zero rejection: every rejected position is accepted.
       for (i = 0; i < word->best_choice->length(); ++i) {
         if (word->reject_map[i].rejected())
           word->reject_map[i].setrej_minimal_rej_accept();
       }
     }
     if (tessedit_minimal_rejection) {
       /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
       // Minimal rejection: as above, except positions holding the space
       // unichar stay rejected.
       for (i = 0; i < word->best_choice->length(); ++i) {
         if ((word->best_choice->unichar_id(i) != space) &&
             word->reject_map[i].rejected())
           word->reject_map[i].setrej_minimal_rej_accept();
       }
     }
   }
 }

Member Data Documentation

◆ applybox_debug

int tesseract::Tesseract::applybox_debug = 1

"Debug level"

Definition at line 830 of file tesseractclass.h.

◆ applybox_exposure_pattern

char* tesseract::Tesseract::applybox_exposure_pattern = ".exp"

"Exposure value follows this pattern in the image filename. The names of the image files are expected to be in the form [lang].[fontname].exp[num].tif"

Definition at line 835 of file tesseractclass.h.

◆ applybox_learn_chars_and_char_frags_mode

bool tesseract::Tesseract::applybox_learn_chars_and_char_frags_mode = false

"Learn both character fragments (as is done in the special low exposure mode) as well as unfragmented characters."

Definition at line 839 of file tesseractclass.h.

◆ applybox_learn_ngrams_mode

bool tesseract::Tesseract::applybox_learn_ngrams_mode = false

"Each bounding box is assumed to contain ngrams. Only learn the ngrams whose outlines overlap horizontally."

Definition at line 842 of file tesseractclass.h.

◆ applybox_page

int tesseract::Tesseract::applybox_page = 0

"Page number to apply boxes from"

Definition at line 831 of file tesseractclass.h.

◆ bestrate_pruning_factor

double tesseract::Tesseract::bestrate_pruning_factor = 2.0

"Multiplying factor of current best rate to prune other hypotheses"

Definition at line 1117 of file tesseractclass.h.

◆ bidi_debug

int tesseract::Tesseract::bidi_debug = 0

"Debug level for BiDi"

Definition at line 829 of file tesseractclass.h.

◆ bland_unrej

bool tesseract::Tesseract::bland_unrej = false

"unrej potential with no checks"

Definition at line 943 of file tesseractclass.h.

◆ chs_leading_punct

char* tesseract::Tesseract::chs_leading_punct = "('`\""

"Leading punctuation"

Definition at line 882 of file tesseractclass.h.

◆ chs_trailing_punct1

char* tesseract::Tesseract::chs_trailing_punct1 = ").,;:?!"

"1st Trailing punctuation"

Definition at line 883 of file tesseractclass.h.

◆ chs_trailing_punct2

char* tesseract::Tesseract::chs_trailing_punct2 = ")'`\""

"2nd Trailing punctuation"

Definition at line 884 of file tesseractclass.h.

◆ conflict_set_I_l_1

char* tesseract::Tesseract::conflict_set_I_l_1 = "Il1[]"

"Il1 conflict set"

Definition at line 1059 of file tesseractclass.h.

◆ crunch_accept_ok

bool tesseract::Tesseract::crunch_accept_ok = true

"Use acceptability in okstring"

Definition at line 972 of file tesseractclass.h.

◆ crunch_debug

int tesseract::Tesseract::crunch_debug = 0

"As it says"

Definition at line 981 of file tesseractclass.h.

◆ crunch_del_cert

double tesseract::Tesseract::crunch_del_cert = -10.0

"POTENTIAL crunch cert lt this"

Definition at line 961 of file tesseractclass.h.

◆ crunch_del_high_word

double tesseract::Tesseract::crunch_del_high_word = 1.5

"Del if word gt xht x this above bl"

Definition at line 966 of file tesseractclass.h.

◆ crunch_del_low_word

double tesseract::Tesseract::crunch_del_low_word = 0.5

"Del if word gt xht x this below bl"

Definition at line 967 of file tesseractclass.h.

◆ crunch_del_max_ht

double tesseract::Tesseract::crunch_del_max_ht = 3.0

"Del if word ht gt xht x this"

Definition at line 963 of file tesseractclass.h.

◆ crunch_del_min_ht

double tesseract::Tesseract::crunch_del_min_ht = 0.7

"Del if word ht lt xht x this"

Definition at line 962 of file tesseractclass.h.

◆ crunch_del_min_width

double tesseract::Tesseract::crunch_del_min_width = 3.0

"Del if word width lt xht x this"

Definition at line 964 of file tesseractclass.h.

◆ crunch_del_rating

double tesseract::Tesseract::crunch_del_rating = 60

"POTENTIAL crunch rating lt this"

Definition at line 960 of file tesseractclass.h.

◆ crunch_early_convert_bad_unlv_chs

bool tesseract::Tesseract::crunch_early_convert_bad_unlv_chs = false

"Take out ~^ early?"

Definition at line 951 of file tesseractclass.h.

◆ crunch_early_merge_tess_fails

bool tesseract::Tesseract::crunch_early_merge_tess_fails = true

"Before word crunch?"

Definition at line 950 of file tesseractclass.h.

◆ crunch_include_numerals

bool tesseract::Tesseract::crunch_include_numerals = false

"Fiddle alpha figures"

Definition at line 975 of file tesseractclass.h.

◆ crunch_leave_accept_strings

bool tesseract::Tesseract::crunch_leave_accept_strings = false

"Don't pot crunch sensible strings"

Definition at line 974 of file tesseractclass.h.

◆ crunch_leave_lc_strings

int tesseract::Tesseract::crunch_leave_lc_strings = 4

"Don't crunch words with long lower case strings"

Definition at line 977 of file tesseractclass.h.

◆ crunch_leave_ok_strings

bool tesseract::Tesseract::crunch_leave_ok_strings = true

"Don't touch sensible strings"

Definition at line 971 of file tesseractclass.h.

◆ crunch_leave_uc_strings

int tesseract::Tesseract::crunch_leave_uc_strings = 4

"Don't crunch words with long upper case strings"

Definition at line 979 of file tesseractclass.h.

◆ crunch_long_repetitions

int tesseract::Tesseract::crunch_long_repetitions = 3

"Crunch words with long repetitions"

Definition at line 980 of file tesseractclass.h.

◆ crunch_poor_garbage_cert

double tesseract::Tesseract::crunch_poor_garbage_cert = -9.0

"crunch garbage cert lt this"

Definition at line 955 of file tesseractclass.h.

◆ crunch_poor_garbage_rate

double tesseract::Tesseract::crunch_poor_garbage_rate = 60

"crunch garbage rating lt this"

Definition at line 956 of file tesseractclass.h.

◆ crunch_pot_garbage

bool tesseract::Tesseract::crunch_pot_garbage = true

"POTENTIAL crunch garbage"

Definition at line 959 of file tesseractclass.h.

◆ crunch_pot_indicators

int tesseract::Tesseract::crunch_pot_indicators = 1

"How many potential indicators needed"

Definition at line 970 of file tesseractclass.h.

◆ crunch_pot_poor_cert

double tesseract::Tesseract::crunch_pot_poor_cert = -8.0

"POTENTIAL crunch cert lt this"

Definition at line 958 of file tesseractclass.h.

◆ crunch_pot_poor_rate

double tesseract::Tesseract::crunch_pot_poor_rate = 40

"POTENTIAL crunch rating lt this"

Definition at line 957 of file tesseractclass.h.

◆ crunch_rating_max

int tesseract::Tesseract::crunch_rating_max = 10

"For adj length in rating per ch"

Definition at line 969 of file tesseractclass.h.

◆ crunch_small_outlines_size

double tesseract::Tesseract::crunch_small_outlines_size = 0.6

"Small if lt xht x this"

Definition at line 968 of file tesseractclass.h.

◆ crunch_terrible_garbage

bool tesseract::Tesseract::crunch_terrible_garbage = true

"As it says"

Definition at line 953 of file tesseractclass.h.

◆ crunch_terrible_rating

double tesseract::Tesseract::crunch_terrible_rating = 80.0

"crunch rating lt this"

Definition at line 952 of file tesseractclass.h.

◆ debug_acceptable_wds

bool tesseract::Tesseract::debug_acceptable_wds = false

"Dump word pass/fail chk"

Definition at line 881 of file tesseractclass.h.

◆ debug_fix_space_level

int tesseract::Tesseract::debug_fix_space_level = 0

"Contextual fixspace debug"

Definition at line 987 of file tesseractclass.h.

◆ debug_noise_removal

int tesseract::Tesseract::debug_noise_removal = 0

"Debug reassignment of small outlines"

Definition at line 865 of file tesseractclass.h.

◆ debug_x_ht_level

int tesseract::Tesseract::debug_x_ht_level = 0

"Reestimate debug"

Definition at line 880 of file tesseractclass.h.

◆ docqual_excuse_outline_errs

bool tesseract::Tesseract::docqual_excuse_outline_errs = false

"Allow outline errs in unrejection?"

Definition at line 911 of file tesseractclass.h.

◆ enable_new_segsearch

bool tesseract::Tesseract::enable_new_segsearch = false

"Enable new segmentation search path."

Definition at line 1157 of file tesseractclass.h.

◆ enable_noise_removal

bool tesseract::Tesseract::enable_noise_removal = true

"Remove and conditionally reassign small outlines when they" " confuse layout analysis, determining diacritics vs noise"

Definition at line 864 of file tesseractclass.h.

◆ file_type

char* tesseract::Tesseract::file_type = ".tif"

"Filename extension"

Definition at line 1066 of file tesseractclass.h.

◆ fixsp_done_mode

int tesseract::Tesseract::fixsp_done_mode = 1

"What constitutes done for spacing"

Definition at line 986 of file tesseractclass.h.

◆ fixsp_non_noise_limit

int tesseract::Tesseract::fixsp_non_noise_limit = 1

"How many non-noise blbs either side?"

Definition at line 983 of file tesseractclass.h.

◆ fixsp_small_outlines_size

double tesseract::Tesseract::fixsp_small_outlines_size = 0.28

"Small if lt xht x this"

Definition at line 984 of file tesseractclass.h.

◆ heuristic_max_char_wh_ratio

double tesseract::Tesseract::heuristic_max_char_wh_ratio = 2.0

"max char width-to-height ratio allowed in segmentation"

Definition at line 1155 of file tesseractclass.h.

◆ heuristic_segcost_rating_base

double tesseract::Tesseract::heuristic_segcost_rating_base = 1.25

"base factor for adding segmentation cost into word rating. " "It's a multiplying factor, the larger the value above 1, " "the bigger the effect of segmentation cost."

Definition at line 1146 of file tesseractclass.h.

◆ heuristic_weight_rating

double tesseract::Tesseract::heuristic_weight_rating = 1

"weight associated with char rating in combined cost of state"

Definition at line 1148 of file tesseractclass.h.

◆ heuristic_weight_seamcut

double tesseract::Tesseract::heuristic_weight_seamcut = 0

"weight associated with seam cut in combined cost of state"

Definition at line 1153 of file tesseractclass.h.

◆ heuristic_weight_width

double tesseract::Tesseract::heuristic_weight_width = 1000.0

"weight associated with width evidence in combined cost of" " state"

Definition at line 1151 of file tesseractclass.h.

◆ hocr_font_info

bool tesseract::Tesseract::hocr_font_info = false

"Add font info to hocr output"

Definition at line 949 of file tesseractclass.h.

◆ include_page_breaks

bool tesseract::Tesseract::include_page_breaks = false

"Include page separator string in output text after each " "image/page."

Definition at line 1097 of file tesseractclass.h.

◆ interactive_display_mode

bool tesseract::Tesseract::interactive_display_mode = false

"Run interactively?"

Definition at line 1065 of file tesseractclass.h.

◆ language_model_fixed_length_choices_depth

int tesseract::Tesseract::language_model_fixed_length_choices_depth = 3

"Depth of blob choice lists to explore" " when fixed length dawgs are on"

Definition at line 1140 of file tesseractclass.h.

◆ load_fixed_length_dawgs

bool tesseract::Tesseract::load_fixed_length_dawgs = true

"Load fixed length" " dawgs (e.g. for non-space delimited languages)"

Definition at line 1113 of file tesseractclass.h.

◆ lstm_use_matrix

bool tesseract::Tesseract::lstm_use_matrix = 1

"Use ratings matrix/beam search with lstm"

Definition at line 907 of file tesseractclass.h.

◆ min_orientation_margin

double tesseract::Tesseract::min_orientation_margin = 7.0

"Min acceptable orientation margin"

Definition at line 1075 of file tesseractclass.h.

◆ min_sane_x_ht_pixels

int tesseract::Tesseract::min_sane_x_ht_pixels = 8

"Reject any x-ht lt or eq to this"

Definition at line 1060 of file tesseractclass.h.

◆ multilang_debug_level

int tesseract::Tesseract::multilang_debug_level = 0

"Print multilang debug info."

Definition at line 902 of file tesseractclass.h.

◆ ngram_permuter_activated

bool tesseract::Tesseract::ngram_permuter_activated = false

"Activate character-level n-gram-based permuter"

Definition at line 1136 of file tesseractclass.h.

◆ noise_cert_basechar

double tesseract::Tesseract::noise_cert_basechar = -8.0

"Hingepoint for base char certainty"

Definition at line 868 of file tesseractclass.h.

◆ noise_cert_disjoint

double tesseract::Tesseract::noise_cert_disjoint = -2.5

"Hingepoint for disjoint certainty"

Definition at line 871 of file tesseractclass.h.

◆ noise_cert_factor

double tesseract::Tesseract::noise_cert_factor = 0.375

"Scaling on certainty diff from Hingepoint"

Definition at line 877 of file tesseractclass.h.

◆ noise_cert_punc

double tesseract::Tesseract::noise_cert_punc = -2.5

"Threshold for new punc char certainty"

Definition at line 874 of file tesseractclass.h.

◆ noise_maxperblob

int tesseract::Tesseract::noise_maxperblob = 8

"Max diacritics to apply to a blob"

Definition at line 878 of file tesseractclass.h.

◆ noise_maxperword

int tesseract::Tesseract::noise_maxperword = 16

"Max diacritics to apply to a word"

Definition at line 879 of file tesseractclass.h.

◆ numeric_punctuation

char* tesseract::Tesseract::numeric_punctuation = ".,"

"Punct. chs expected WITHIN numbers"

Definition at line 989 of file tesseractclass.h.

◆ ocr_devanagari_split_strategy

int tesseract::Tesseract::ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

"Whether to use the top-line splitting process for Devanagari " "documents while performing ocr."

Definition at line 824 of file tesseractclass.h.

◆ ok_repeated_ch_non_alphanum_wds

char* tesseract::Tesseract::ok_repeated_ch_non_alphanum_wds = "-?*\075"

"Allow NN to unrej"

Definition at line 1058 of file tesseractclass.h.

◆ outlines_2

char* tesseract::Tesseract::outlines_2 = "ij!?%\":;"

"Non standard number of outlines"

Definition at line 909 of file tesseractclass.h.

◆ outlines_odd

char* tesseract::Tesseract::outlines_odd = "%| "

"Non standard number of outlines"

Definition at line 908 of file tesseractclass.h.

◆ page_separator

char* tesseract::Tesseract::page_separator = "\f"

"Page separator (default is form feed control character)"

Definition at line 1099 of file tesseractclass.h.

◆ pageseg_devanagari_split_strategy

int tesseract::Tesseract::pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

"Whether to use the top-line splitting process for Devanagari " "documents while performing page-segmentation."

Definition at line 820 of file tesseractclass.h.

◆ paragraph_debug_level

int tesseract::Tesseract::paragraph_debug_level = 0

"Print paragraph debug info."

Definition at line 903 of file tesseractclass.h.

◆ paragraph_text_based

bool tesseract::Tesseract::paragraph_text_based = true

"Run paragraph detection on the post-text-recognition " "(more accurate)"

Definition at line 906 of file tesseractclass.h.

◆ permute_chartype_word

bool tesseract::Tesseract::permute_chartype_word = 0

"Turn on character type (property) consistency permuter"

Definition at line 1129 of file tesseractclass.h.

◆ permute_debug

bool tesseract::Tesseract::permute_debug = 0

"char permutation debug"

Definition at line 1115 of file tesseractclass.h.

◆ permute_fixed_length_dawg

bool tesseract::Tesseract::permute_fixed_length_dawg = 0

"Turn on fixed-length phrasebook search permuter"

Definition at line 1127 of file tesseractclass.h.

◆ permute_only_top

bool tesseract::Tesseract::permute_only_top = false

"Run only the top choice permuter"

Definition at line 1137 of file tesseractclass.h.

◆ permute_script_word

bool tesseract::Tesseract::permute_script_word = 0

"Turn on word script consistency permuter"

Definition at line 1119 of file tesseractclass.h.

◆ poly_allow_detailed_fx

bool tesseract::Tesseract::poly_allow_detailed_fx = false

"Allow feature extractors to see the original outline"

Definition at line 1079 of file tesseractclass.h.

◆ preserve_interword_spaces

bool tesseract::Tesseract::preserve_interword_spaces = false

"Preserve multiple interword spaces"

Definition at line 1094 of file tesseractclass.h.

◆ quality_blob_pc

double tesseract::Tesseract::quality_blob_pc = 0.0

"good_quality_doc gte good blobs limit"

Definition at line 886 of file tesseractclass.h.

◆ quality_char_pc

double tesseract::Tesseract::quality_char_pc = 0.95

"good_quality_doc gte good char limit"

Definition at line 889 of file tesseractclass.h.

◆ quality_min_initial_alphas_reqd

int tesseract::Tesseract::quality_min_initial_alphas_reqd = 2

"alphas in a good word"

Definition at line 890 of file tesseractclass.h.

◆ quality_outline_pc

double tesseract::Tesseract::quality_outline_pc = 1.0

"good_quality_doc lte outline error limit"

Definition at line 888 of file tesseractclass.h.

◆ quality_rej_pc

double tesseract::Tesseract::quality_rej_pc = 0.08

"good_quality_doc lte rejection limit"

Definition at line 885 of file tesseractclass.h.

◆ quality_rowrej_pc

double tesseract::Tesseract::quality_rowrej_pc = 1.1

"good_quality_doc gte good char limit"

Definition at line 945 of file tesseractclass.h.

◆ rej_1Il_trust_permuter_type

bool tesseract::Tesseract::rej_1Il_trust_permuter_type = true

"Don't double check"

Definition at line 1049 of file tesseractclass.h.

◆ rej_1Il_use_dict_word

bool tesseract::Tesseract::rej_1Il_use_dict_word = false

"Use dictword test"

Definition at line 1048 of file tesseractclass.h.

◆ rej_alphas_in_number_perm

bool tesseract::Tesseract::rej_alphas_in_number_perm = false

"Extend permuter check"

Definition at line 1054 of file tesseractclass.h.

◆ rej_trust_doc_dawg

bool tesseract::Tesseract::rej_trust_doc_dawg = false

"Use DOC dawg in 1Il conf. detector"

Definition at line 1047 of file tesseractclass.h.

◆ rej_use_good_perm

bool tesseract::Tesseract::rej_use_good_perm = true

"Individual rejection control"

Definition at line 1052 of file tesseractclass.h.

◆ rej_use_sensible_wd

bool tesseract::Tesseract::rej_use_sensible_wd = false

"Extend permuter check"

Definition at line 1053 of file tesseractclass.h.

◆ rej_use_tess_accepted

bool tesseract::Tesseract::rej_use_tess_accepted = true

"Individual rejection control"

Definition at line 1050 of file tesseractclass.h.

◆ rej_use_tess_blanks

bool tesseract::Tesseract::rej_use_tess_blanks = true

"Individual rejection control"

Definition at line 1051 of file tesseractclass.h.

◆ rej_whole_of_mostly_reject_word_fract

double tesseract::Tesseract::rej_whole_of_mostly_reject_word_fract = 0.85

"if >this fract"

Definition at line 1055 of file tesseractclass.h.

◆ segment_debug

int tesseract::Tesseract::segment_debug = 0

"Debug the whole segmentation process"

Definition at line 1114 of file tesseractclass.h.

◆ segment_reward_chartype

double tesseract::Tesseract::segment_reward_chartype = 0.97

"Score multiplier for char type consistency within a word. "

Definition at line 1131 of file tesseractclass.h.

◆ segment_reward_ngram_best_choice

double tesseract::Tesseract::segment_reward_ngram_best_choice = 0.99

"Score multiplier for ngram permuter's best choice" " (only used in the Han script path)."

Definition at line 1134 of file tesseractclass.h.

◆ segment_reward_script

double tesseract::Tesseract::segment_reward_script = 0.95

"Score multiplier for script consistency within a word. " "Being a 'reward' factor, it should be <= 1. " "Smaller value implies bigger reward."

Definition at line 1125 of file tesseractclass.h.

◆ segment_segcost_rating

bool tesseract::Tesseract::segment_segcost_rating = 0

"incorporate segmentation cost in word rating?"

Definition at line 1121 of file tesseractclass.h.

◆ segsearch_max_fixed_pitch_char_wh_ratio

double tesseract::Tesseract::segsearch_max_fixed_pitch_char_wh_ratio = 2.0

"Maximum character width-to-height ratio for " "fixed pitch fonts"

Definition at line 1160 of file tesseractclass.h.

◆ subscript_max_y_top

double tesseract::Tesseract::subscript_max_y_top = 0.5

"Maximum top of a character measured as a multiple of x-height " "above the baseline for us to reconsider whether it's a " "subscript."

Definition at line 1008 of file tesseractclass.h.

◆ superscript_bettered_certainty

double tesseract::Tesseract::superscript_bettered_certainty = 0.97

"What reduction in " "badness do we think sufficient to choose a superscript over " "what we'd thought. For example, a value of 0.6 means we want " "to reduce badness of certainty by 40%"

Definition at line 1000 of file tesseractclass.h.

◆ superscript_debug

int tesseract::Tesseract::superscript_debug = 0

"Debug level for sub & superscript fixer"

Definition at line 993 of file tesseractclass.h.

◆ superscript_min_y_bottom

double tesseract::Tesseract::superscript_min_y_bottom = 0.3

"Minimum bottom of a character measured as a multiple of " "x-height above the baseline for us to reconsider whether it's " "a superscript."

Definition at line 1012 of file tesseractclass.h.

◆ superscript_scaledown_ratio

double tesseract::Tesseract::superscript_scaledown_ratio = 0.4

"A superscript scaled down more than this is unbelievably " "small. For example, 0.3 means we expect the font size to " "be no smaller than 30% of the text line font size."

Definition at line 1004 of file tesseractclass.h.

◆ superscript_worse_certainty

double tesseract::Tesseract::superscript_worse_certainty = 2.0

"How many times worse " "certainty does a superscript position glyph need to be for us " "to try classifying it as a char with a different baseline?"

Definition at line 996 of file tesseractclass.h.

◆ suspect_accept_rating

double tesseract::Tesseract::suspect_accept_rating = -999.9

"Accept good rating limit"

Definition at line 1032 of file tesseractclass.h.

◆ suspect_constrain_1Il

bool tesseract::Tesseract::suspect_constrain_1Il = false

"UNLV keep 1Il chars rejected"

Definition at line 1030 of file tesseractclass.h.

◆ suspect_level

int tesseract::Tesseract::suspect_level = 99

"Suspect marker level"

Definition at line 1026 of file tesseractclass.h.

◆ suspect_rating_per_ch

double tesseract::Tesseract::suspect_rating_per_ch = 999.9

"Don't touch bad rating limit"

Definition at line 1031 of file tesseractclass.h.

◆ suspect_short_words

int tesseract::Tesseract::suspect_short_words = 2

"Don't Suspect dict wds longer than this"

Definition at line 1029 of file tesseractclass.h.

◆ suspect_space_level

int tesseract::Tesseract::suspect_space_level = 100

"Min suspect level for rejecting spaces"

Definition at line 1028 of file tesseractclass.h.

◆ tessedit_adaption_debug

bool tesseract::Tesseract::tessedit_adaption_debug = false

"Generate and print debug information for adaption"

Definition at line 828 of file tesseractclass.h.

◆ tessedit_ambigs_training

bool tesseract::Tesseract::tessedit_ambigs_training = false

"Perform training for ambiguities"

Definition at line 816 of file tesseractclass.h.

◆ tessedit_bigram_debug

int tesseract::Tesseract::tessedit_bigram_debug = 0

"Amount of debug output for bigram " "correction."

Definition at line 861 of file tesseractclass.h.

◆ tessedit_char_blacklist

char* tesseract::Tesseract::tessedit_char_blacklist = ""

"Blacklist of chars not to recognize"

Definition at line 810 of file tesseractclass.h.

◆ tessedit_char_unblacklist

char* tesseract::Tesseract::tessedit_char_unblacklist = ""

"List of chars to override tessedit_char_blacklist"

Definition at line 814 of file tesseractclass.h.

◆ tessedit_char_whitelist

char* tesseract::Tesseract::tessedit_char_whitelist = ""

"Whitelist of chars to recognize"

Definition at line 812 of file tesseractclass.h.

◆ tessedit_consistent_reps

bool tesseract::Tesseract::tessedit_consistent_reps = true

"Force all rep chars the same"

Definition at line 1039 of file tesseractclass.h.

◆ tessedit_create_boxfile

bool tesseract::Tesseract::tessedit_create_boxfile = false

"Output text with boxes"

Definition at line 1061 of file tesseractclass.h.

◆ tessedit_create_hocr

bool tesseract::Tesseract::tessedit_create_hocr = false

"Write .html hOCR output file"

Definition at line 1019 of file tesseractclass.h.

◆ tessedit_create_pdf

bool tesseract::Tesseract::tessedit_create_pdf = false

"Write .pdf output file"

Definition at line 1021 of file tesseractclass.h.

◆ tessedit_create_tsv

bool tesseract::Tesseract::tessedit_create_tsv = false

"Write .tsv output file"

Definition at line 1020 of file tesseractclass.h.

◆ tessedit_create_txt

bool tesseract::Tesseract::tessedit_create_txt = false

"Write .txt output file"

Definition at line 1018 of file tesseractclass.h.

◆ tessedit_debug_block_rejection

bool tesseract::Tesseract::tessedit_debug_block_rejection = false

"Block and Row stats"

Definition at line 855 of file tesseractclass.h.

◆ tessedit_debug_doc_rejection

bool tesseract::Tesseract::tessedit_debug_doc_rejection = false

"Page stats"

Definition at line 940 of file tesseractclass.h.

◆ tessedit_debug_fonts

bool tesseract::Tesseract::tessedit_debug_fonts = false

"Output font info per char"

Definition at line 854 of file tesseractclass.h.

◆ tessedit_debug_quality_metrics

bool tesseract::Tesseract::tessedit_debug_quality_metrics = false

"Output data to debug file"

Definition at line 942 of file tesseractclass.h.

◆ tessedit_display_outwords

bool tesseract::Tesseract::tessedit_display_outwords = false

"Draw output words"

Definition at line 843 of file tesseractclass.h.

◆ tessedit_dont_blkrej_good_wds

bool tesseract::Tesseract::tessedit_dont_blkrej_good_wds = false

"Use word segmentation quality metric"

Definition at line 929 of file tesseractclass.h.

◆ tessedit_dont_rowrej_good_wds

bool tesseract::Tesseract::tessedit_dont_rowrej_good_wds = false

"Use word segmentation quality metric"

Definition at line 931 of file tesseractclass.h.

◆ tessedit_dump_choices

bool tesseract::Tesseract::tessedit_dump_choices = false

"Dump char choices"

Definition at line 844 of file tesseractclass.h.

◆ tessedit_dump_pageseg_images

bool tesseract::Tesseract::tessedit_dump_pageseg_images = false

"Dump intermediate images made during page segmentation"

Definition at line 801 of file tesseractclass.h.

◆ tessedit_enable_bigram_correction

bool tesseract::Tesseract::tessedit_enable_bigram_correction = true

"Enable correction based on the word bigram dictionary."

Definition at line 857 of file tesseractclass.h.

◆ tessedit_enable_dict_correction

bool tesseract::Tesseract::tessedit_enable_dict_correction = false

"Enable single word correction based on the dictionary."

Definition at line 859 of file tesseractclass.h.

◆ tessedit_enable_doc_dict

bool tesseract::Tesseract::tessedit_enable_doc_dict = true

"Add words to the document dictionary"

Definition at line 853 of file tesseractclass.h.

◆ tessedit_fix_fuzzy_spaces

bool tesseract::Tesseract::tessedit_fix_fuzzy_spaces = true

"Try to improve fuzzy spaces"

Definition at line 847 of file tesseractclass.h.

◆ tessedit_fix_hyphens

bool tesseract::Tesseract::tessedit_fix_hyphens = true

"Crunch double hyphens?"

Definition at line 850 of file tesseractclass.h.

◆ tessedit_flip_0O

bool tesseract::Tesseract::tessedit_flip_0O = true

"Contextual 0O O0 flips"

Definition at line 1042 of file tesseractclass.h.

◆ tessedit_good_doc_still_rowrej_wd

double tesseract::Tesseract::tessedit_good_doc_still_rowrej_wd = 1.1

"rej good doc wd if more than this fraction rejected"

Definition at line 937 of file tesseractclass.h.

◆ tessedit_good_quality_unrej

bool tesseract::Tesseract::tessedit_good_quality_unrej = true

"Reduce rejection on good docs"

Definition at line 913 of file tesseractclass.h.

◆ tessedit_image_border

int tesseract::Tesseract::tessedit_image_border = 2

"Rej blbs near image edge limit"

Definition at line 1056 of file tesseractclass.h.

◆ tessedit_init_config_only

bool tesseract::Tesseract::tessedit_init_config_only = false

"Only initialize with the config file. Useful if the instance is " "not going to be used for OCR but say only for layout analysis."

Definition at line 1082 of file tesseractclass.h.

◆ tessedit_load_sublangs

char* tesseract::Tesseract::tessedit_load_sublangs = ""

"List of languages to load with this one"

Definition at line 1069 of file tesseractclass.h.

◆ tessedit_lower_flip_hyphen

double tesseract::Tesseract::tessedit_lower_flip_hyphen = 1.5

"Aspect ratio dot/hyphen test"

Definition at line 1044 of file tesseractclass.h.

◆ tessedit_make_boxes_from_boxes

bool tesseract::Tesseract::tessedit_make_boxes_from_boxes = false

"Generate more boxes from boxed chars"

Definition at line 797 of file tesseractclass.h.

◆ tessedit_matcher_log

bool tesseract::Tesseract::tessedit_matcher_log = false

"Log matcher activity"

Definition at line 896 of file tesseractclass.h.

◆ tessedit_minimal_rej_pass1

bool tesseract::Tesseract::tessedit_minimal_rej_pass1 = false

"Do minimal rejection on pass 1 output"

Definition at line 894 of file tesseractclass.h.

◆ tessedit_minimal_rejection

bool tesseract::Tesseract::tessedit_minimal_rejection = false

"Only reject tess failures"

Definition at line 1033 of file tesseractclass.h.

◆ tessedit_ocr_engine_mode

int tesseract::Tesseract::tessedit_ocr_engine_mode = tesseract::OEM_DEFAULT

"Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults" " to loading and running the most accurate available."

Definition at line 808 of file tesseractclass.h.

◆ tessedit_ok_mode

int tesseract::Tesseract::tessedit_ok_mode = 5

"Acceptance decision algorithm"

Definition at line 1111 of file tesseractclass.h.

◆ tessedit_override_permuter

bool tesseract::Tesseract::tessedit_override_permuter = true

"According to dict_word"

Definition at line 1067 of file tesseractclass.h.

◆ tessedit_page_number

int tesseract::Tesseract::tessedit_page_number = -1

"-1 -> All pages, else specific page to process"

Definition at line 1063 of file tesseractclass.h.

◆ tessedit_pageseg_mode

int tesseract::Tesseract::tessedit_pageseg_mode = PSM_SINGLE_BLOCK

"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," " 5=line, 6=word, 7=char" " (Values from PageSegMode enum in publictypes.h)"

Definition at line 805 of file tesseractclass.h.

◆ tessedit_parallelize

int tesseract::Tesseract::tessedit_parallelize = 0

"Run in parallel where possible"

Definition at line 1092 of file tesseractclass.h.

◆ tessedit_prefer_joined_punct

bool tesseract::Tesseract::tessedit_prefer_joined_punct = false

"Reward punctuation joins"

Definition at line 985 of file tesseractclass.h.

◆ tessedit_preserve_blk_rej_perfect_wds

bool tesseract::Tesseract::tessedit_preserve_blk_rej_perfect_wds = true

"Only rej partially rejected words in block rejection"

Definition at line 925 of file tesseractclass.h.

◆ tessedit_preserve_min_wd_len

int tesseract::Tesseract::tessedit_preserve_min_wd_len = 2

"Only preserve wds longer than this"

Definition at line 933 of file tesseractclass.h.

◆ tessedit_preserve_row_rej_perfect_wds

bool tesseract::Tesseract::tessedit_preserve_row_rej_perfect_wds = true

"Only rej partially rejected words in row rejection"

Definition at line 927 of file tesseractclass.h.

◆ tessedit_redo_xheight

bool tesseract::Tesseract::tessedit_redo_xheight = true

"Check/Correct x-height"

Definition at line 851 of file tesseractclass.h.

◆ tessedit_reject_bad_qual_wds

bool tesseract::Tesseract::tessedit_reject_bad_qual_wds = true

"Reject all bad quality wds"

Definition at line 939 of file tesseractclass.h.

◆ tessedit_reject_block_percent

double tesseract::Tesseract::tessedit_reject_block_percent = 45.00

"%rej allowed before rej whole block"

Definition at line 918 of file tesseractclass.h.

◆ tessedit_reject_doc_percent

double tesseract::Tesseract::tessedit_reject_doc_percent = 65.00

"%rej allowed before rej whole doc"

Definition at line 916 of file tesseractclass.h.

◆ tessedit_reject_mode

int tesseract::Tesseract::tessedit_reject_mode = 0

"Rejection algorithm"

Definition at line 1040 of file tesseractclass.h.

◆ tessedit_reject_row_percent

double tesseract::Tesseract::tessedit_reject_row_percent = 40.00

"%rej allowed before rej whole row"

Definition at line 920 of file tesseractclass.h.

◆ tessedit_rejection_debug

bool tesseract::Tesseract::tessedit_rejection_debug = false

"Adaption debug"

Definition at line 1041 of file tesseractclass.h.

◆ tessedit_resegment_from_boxes

bool tesseract::Tesseract::tessedit_resegment_from_boxes = false

"Take segmentation and labeling from box file"

Definition at line 791 of file tesseractclass.h.

◆ tessedit_resegment_from_line_boxes

bool tesseract::Tesseract::tessedit_resegment_from_line_boxes = false

"Conversion of word/line box file to char box file"

Definition at line 793 of file tesseractclass.h.

◆ tessedit_row_rej_good_docs

bool tesseract::Tesseract::tessedit_row_rej_good_docs = true

"Apply row rejection to good docs"

Definition at line 935 of file tesseractclass.h.

◆ tessedit_tess_adaption_mode

int tesseract::Tesseract::tessedit_tess_adaption_mode = 0x27

"Adaptation decision algorithm for tess"

Definition at line 892 of file tesseractclass.h.

◆ tessedit_test_adaption

bool tesseract::Tesseract::tessedit_test_adaption = false

"Test adaption criteria"

Definition at line 895 of file tesseractclass.h.

◆ tessedit_test_adaption_mode

int tesseract::Tesseract::tessedit_test_adaption_mode = 3

"Adaptation decision algorithm for tess"

Definition at line 898 of file tesseractclass.h.

◆ tessedit_timing_debug

bool tesseract::Tesseract::tessedit_timing_debug = false

"Print timing stats"

Definition at line 845 of file tesseractclass.h.

◆ tessedit_train_from_boxes

bool tesseract::Tesseract::tessedit_train_from_boxes = false

"Generate training data from boxed chars"

Definition at line 795 of file tesseractclass.h.

◆ tessedit_train_line_recognizer

bool tesseract::Tesseract::tessedit_train_line_recognizer = false

"Break input into lines and remap boxes if present"

Definition at line 799 of file tesseractclass.h.

◆ tessedit_unrej_any_wd

bool tesseract::Tesseract::tessedit_unrej_any_wd = false

"Don't bother with word plausibility"

Definition at line 849 of file tesseractclass.h.

◆ tessedit_upper_flip_hyphen

double tesseract::Tesseract::tessedit_upper_flip_hyphen = 1.8

"Aspect ratio dot/hyphen test"

Definition at line 1046 of file tesseractclass.h.

◆ tessedit_use_primary_params_model

bool tesseract::Tesseract::tessedit_use_primary_params_model = false

"In multilingual mode use params model of the primary language"

Definition at line 1071 of file tesseractclass.h.

◆ tessedit_use_reject_spaces

bool tesseract::Tesseract::tessedit_use_reject_spaces = true

"Reject spaces?"

Definition at line 914 of file tesseractclass.h.

◆ tessedit_whole_wd_rej_row_percent

double tesseract::Tesseract::tessedit_whole_wd_rej_row_percent = 70.00

"Number of row rejects in whole word rejects " "which prevents whole row rejection"

Definition at line 923 of file tesseractclass.h.

◆ tessedit_word_for_word

bool tesseract::Tesseract::tessedit_word_for_word = false

"Make output have exactly one word per WERD"

Definition at line 1036 of file tesseractclass.h.

◆ tessedit_write_block_separators

bool tesseract::Tesseract::tessedit_write_block_separators = false

"Write block separators in output"

Definition at line 1014 of file tesseractclass.h.

◆ tessedit_write_images

bool tesseract::Tesseract::tessedit_write_images = false

"Capture the image from the IPE"

Definition at line 1064 of file tesseractclass.h.

◆ tessedit_write_params_to_file

char* tesseract::Tesseract::tessedit_write_params_to_file = ""

"Write all parameters to the given file."

Definition at line 826 of file tesseractclass.h.

◆ tessedit_write_rep_codes

bool tesseract::Tesseract::tessedit_write_rep_codes = false

"Write repetition char code"

Definition at line 1016 of file tesseractclass.h.

◆ tessedit_write_unlv

bool tesseract::Tesseract::tessedit_write_unlv = false

"Write .unlv output file"

Definition at line 1017 of file tesseractclass.h.

◆ tessedit_zero_kelvin_rejection

bool tesseract::Tesseract::tessedit_zero_kelvin_rejection = false

"Don't reject ANYTHING AT ALL"

Definition at line 1038 of file tesseractclass.h.

◆ tessedit_zero_rejection

bool tesseract::Tesseract::tessedit_zero_rejection = false

"Don't reject ANYTHING"

Definition at line 1034 of file tesseractclass.h.

◆ test_pt

bool tesseract::Tesseract::test_pt = false

"Test for point"

Definition at line 899 of file tesseractclass.h.

◆ test_pt_x

double tesseract::Tesseract::test_pt_x = 99999.99

"xcoord"

Definition at line 900 of file tesseractclass.h.

◆ test_pt_y

double tesseract::Tesseract::test_pt_y = 99999.99

"ycoord"

Definition at line 901 of file tesseractclass.h.

◆ textonly_pdf

bool tesseract::Tesseract::textonly_pdf = false

"Create PDF with only one invisible text layer"

Definition at line 1023 of file tesseractclass.h.

◆ textord_equation_detect

bool tesseract::Tesseract::textord_equation_detect = false

"Turn on equation detector"

Definition at line 1083 of file tesseractclass.h.

◆ textord_tabfind_aligned_gap_fraction

double tesseract::Tesseract::textord_tabfind_aligned_gap_fraction = 0.75

"Fraction of height used as a minimum gap for aligned blobs."

Definition at line 1091 of file tesseractclass.h.

◆ textord_tabfind_force_vertical_text

bool tesseract::Tesseract::textord_tabfind_force_vertical_text = false

"Force using vertical text page mode"

Definition at line 1086 of file tesseractclass.h.

◆ textord_tabfind_show_vlines

bool tesseract::Tesseract::textord_tabfind_show_vlines = false

"Debug line finding"

Definition at line 1076 of file tesseractclass.h.

◆ textord_tabfind_vertical_horizontal_mix

bool tesseract::Tesseract::textord_tabfind_vertical_horizontal_mix = true

"find horizontal lines such as headers in vertical page mode"

Definition at line 1110 of file tesseractclass.h.

◆ textord_tabfind_vertical_text

bool tesseract::Tesseract::textord_tabfind_vertical_text = true

"Enable vertical detection"

Definition at line 1084 of file tesseractclass.h.

◆ textord_tabfind_vertical_text_ratio

double tesseract::Tesseract::textord_tabfind_vertical_text_ratio = 0.5

"Fraction of textlines deemed vertical to use vertical page " "mode"

Definition at line 1089 of file tesseractclass.h.

◆ textord_use_cjk_fp_model

bool tesseract::Tesseract::textord_use_cjk_fp_model = FALSE

"Use CJK fixed pitch model"

Definition at line 1077 of file tesseractclass.h.

◆ unlv_tilde_crunching

bool tesseract::Tesseract::unlv_tilde_crunching = true

"Mark v.bad words for tilde crunch"

Definition at line 947 of file tesseractclass.h.

◆ unrecognised_char

char* tesseract::Tesseract::unrecognised_char = "|"

"Output char for unidentified blobs"

Definition at line 1025 of file tesseractclass.h.

◆ use_new_state_cost

bool tesseract::Tesseract::use_new_state_cost = FALSE

"use new state cost heuristics for segmentation state evaluation"

Definition at line 1142 of file tesseractclass.h.

◆ x_ht_acceptance_tolerance

int tesseract::Tesseract::x_ht_acceptance_tolerance = 8

"Max allowed deviation of blob top outside of font data"

Definition at line 991 of file tesseractclass.h.

◆ x_ht_min_change

int tesseract::Tesseract::x_ht_min_change = 8

"Min change in xht before actually trying it"

Definition at line 992 of file tesseractclass.h.

The documentation for this class was generated from the following files:

/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/tesseractclass.h
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/adaptions.cpp
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/applybox.cpp
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/control.cpp
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/docqual.cpp
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/fixspace.cpp
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/fixxht.cpp
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/linerec.cpp
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/output.cpp
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/pagesegmain.cpp
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/pagewalk.cpp
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/par_control.cpp
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/pgedit.cpp
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/recogtraining.cpp
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/reject.cpp
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/superscript.cpp
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/tessbox.cpp
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/tessedit.cpp
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/tesseractclass.cpp
/home/stefan/src/github/tesseract-ocr/tesseract/ccmain/tfacepp.cpp

Public Member Functions

Public Attributes

Additional Inherited Members

Detailed Description

Constructor & Destructor Documentation

◆ Tesseract()

◆ ~Tesseract()

Member Function Documentation

◆ acceptable_number_string()

◆ acceptable_word_string()

◆ alpha_count()

◆ ambigs_classify_and_output()

◆ AnyLSTMLang()

◆ AnyTessLang()

◆ ApplyBoxes()

◆ ApplyBoxTraining()

◆ AssignDiacriticsToNewBlobs()

◆ AssignDiacriticsToOverlappingBlobs()

◆ AutoPageSeg()

◆ BelievableSuperscript()

◆ BestPix()

◆ bigram_correction_pass()

◆ blamer_pass()

◆ blob_feature_display()

◆ blob_noise_score()

◆ break_noisiest_blob_word()

◆ build_menu_new()

◆ check_debug_pt()

◆ classify_word_and_language()

◆ classify_word_pass1()

◆ classify_word_pass2()

◆ ClassifyBlobAsWord()

◆ ClassifyBlobPlusOutlines()

◆ Clear()

◆ ComputeCompatibleXheight()

◆ convert_bad_unlv_chs()

◆ ConvertStringToUnichars()

◆ CorrectClassifyWords()

◆ count_alphanums() [1/2]

◆ count_alphanums() [2/2]

◆ count_alphas()

◆ count_outline_errs()

◆ CountMisfitTops()

◆ debug_word()

◆ dictionary_correction_pass()

◆ digit_or_numeric_punct()

◆ do_re_display()

◆ doc_and_block_rejection()

◆ dont_allow_1Il()

◆ dump_words()

◆ end_tesseract()

◆ eval_word_spacing()

◆ failure_count()

◆ FindSegmentation()

◆ first_alphanum_index()

◆ first_alphanum_offset()

◆ fix_fuzzy_space_list()

◆ fix_fuzzy_spaces()

◆ fix_noisy_space_list()

◆ fix_rep_char()

◆ fix_sp_fp_word()

◆ fixspace_thinks_word_done()

◆ flip_0O()

◆ flip_hyphens()

◆ font_recognition_pass()

◆ fp_eval_word_spacing()

◆ garbage_word()

◆ get_rep_char()

◆ get_sub_lang()

◆ GetLineData()

◆ GetRectImage()

◆ GetSubAndSuperscriptCandidates()

◆ ImageHeight()

◆ ImageWidth()

◆ init_recog_training()

◆ init_tesseract() [1/2]

◆ init_tesseract() [2/2]

◆ init_tesseract_internal()

◆ init_tesseract_lang_data()

◆ init_tesseract_lm()