tesseract  4.00.00dev
tesseract::Tesseract Class Reference

#include <tesseractclass.h>

Inheritance diagram for tesseract::Tesseract:
tesseract::Wordrec tesseract::Classify tesseract::CCStruct tesseract::CUtil tesseract::CCUtil

Public Member Functions

 Tesseract ()
 
 ~Tesseract ()
 
void Clear ()
 
void ResetAdaptiveClassifier ()
 
void ResetDocumentDictionary ()
 
void SetEquationDetect (EquationDetect *detector)
 
const FCOORD & reskew () const
 
Pix ** mutable_pix_binary ()
 
Pix * pix_binary () const
 
Pix * pix_grey () const
 
void set_pix_grey (Pix *grey_pix)
 
Pix * pix_original () const
 
void set_pix_original (Pix *original_pix)
 
Pix * BestPix () const
 
void set_pix_thresholds (Pix *thresholds)
 
int source_resolution () const
 
void set_source_resolution (int ppi)
 
int ImageWidth () const
 
int ImageHeight () const
 
Pix * scaled_color () const
 
int scaled_factor () const
 
void SetScaledColor (int factor, Pix *color)
 
const Textord & textord () const
 
Textord * mutable_textord ()
 
bool right_to_left () const
 
int num_sub_langs () const
 
Tesseract * get_sub_lang (int index) const
 
bool AnyTessLang () const
 
bool AnyLSTMLang () const
 
void SetBlackAndWhitelist ()
 
void PrepareForPageseg ()
 
void PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr)
 
int SegmentPage (const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr)
 
void SetupWordScripts (BLOCK_LIST *blocks)
 
int AutoPageSeg (PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
 
ColumnFinder * SetupPageSegAndDetectOrientation (PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)
 
void PrerecAllWordsPar (const GenericVector< WordData > &words)
 
void TrainLineRecognizer (const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list)
 
void TrainFromBoxes (const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
 
ImageData * GetLineData (const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
 
ImageData * GetRectImage (const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
 
void LSTMRecognizeWord (const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
 
void SearchWords (PointerVector< WERD_RES > *words)
 
bool ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
 
void SetupAllWordsPassN (int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
 
void SetupWordPassN (int pass_n, WordData *word)
 
bool RecogAllWordsPassN (int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
 
bool recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
 
void rejection_passes (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
 
void bigram_correction_pass (PAGE_RES *page_res)
 
void blamer_pass (PAGE_RES *page_res)
 
void script_pos_pass (PAGE_RES *page_res)
 
int RetryWithLanguage (const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
 
bool ReassignDiacritics (int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
 
void AssignDiacriticsToOverlappingBlobs (const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB *> *target_blobs)
 
void AssignDiacriticsToNewBlobs (const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB *> *target_blobs)
 
bool SelectGoodDiacriticOutlines (int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE *> &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
 
float ClassifyBlobPlusOutlines (const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE *> &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
 
float ClassifyBlobAsWord (int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
 
void classify_word_and_language (int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
 
void classify_word_pass1 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
void recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box)
 
void fix_rep_char (PAGE_RES_IT *page_res_it)
 
ACCEPTABLE_WERD_TYPE acceptable_word_string (const UNICHARSET &char_set, const char *s, const char *lengths)
 
void match_word_pass_n (int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
 
void classify_word_pass2 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
 
void ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
 
bool RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row)
 
bool TestNewNormalization (int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
 
BOOL8 recog_interactive (PAGE_RES_IT *pr_it)
 
void set_word_fonts (WERD_RES *word)
 
void font_recognition_pass (PAGE_RES *page_res)
 
void dictionary_correction_pass (PAGE_RES *page_res)
 
BOOL8 check_debug_pt (WERD_RES *word, int location)
 
bool SubAndSuperscriptFix (WERD_RES *word_res)
 
void GetSubAndSuperscriptCandidates (const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
 
WERD_RES * TrySuperscriptSplits (int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
 
bool BelievableSuperscript (bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
 
void output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
 
void write_results (PAGE_RES_IT &page_res_it, char newline_type, BOOL8 force_eol)
 
void set_unlv_suspects (WERD_RES *word)
 
UNICHAR_ID get_rep_char (WERD_RES *word)
 
BOOL8 acceptable_number_string (const char *s, const char *lengths)
 
inT16 count_alphanums (const WERD_CHOICE &word)
 
inT16 count_alphas (const WERD_CHOICE &word)
 
void read_config_file (const char *filename, SetParamConstraint constraint)
 
int init_tesseract (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
 
int init_tesseract (const char *datapath, const char *language, OcrEngineMode oem)
 
int init_tesseract_internal (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
 
void SetupUniversalFontIds ()
 
int init_tesseract_lm (const char *arg0, const char *textbase, const char *language, TessdataManager *mgr)
 
void recognize_page (STRING &image_name)
 
void end_tesseract ()
 
bool init_tesseract_lang_data (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
 
void ParseLanguageString (const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
 
SVMenuNode * build_menu_new ()
 
void pgeditor_main (int width, int height, PAGE_RES *page_res)
 
void process_image_event (const SVEvent &event)
 
BOOL8 process_cmd_win_event (inT32 cmd_event, char *new_value)
 
void debug_word (PAGE_RES *page_res, const TBOX &selection_box)
 
void do_re_display (BOOL8(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
 
BOOL8 word_display (PAGE_RES_IT *pr_it)
 
BOOL8 word_bln_display (PAGE_RES_IT *pr_it)
 
BOOL8 word_blank_and_set_display (PAGE_RES_IT *pr_its)
 
BOOL8 word_set_display (PAGE_RES_IT *pr_it)
 
BOOL8 word_dumper (PAGE_RES_IT *pr_it)
 
void blob_feature_display (PAGE_RES *page_res, const TBOX &selection_box)
 
void make_reject_map (WERD_RES *word, ROW *row, inT16 pass)
 
BOOL8 one_ell_conflict (WERD_RES *word_res, BOOL8 update_map)
 
inT16 first_alphanum_index (const char *word, const char *word_lengths)
 
inT16 first_alphanum_offset (const char *word, const char *word_lengths)
 
inT16 alpha_count (const char *word, const char *word_lengths)
 
BOOL8 word_contains_non_1_digit (const char *word, const char *word_lengths)
 
void dont_allow_1Il (WERD_RES *word)
 
inT16 count_alphanums (WERD_RES *word)
 
void flip_0O (WERD_RES *word)
 
BOOL8 non_0_digit (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
BOOL8 non_O_upper (const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
 
BOOL8 repeated_nonalphanum_wd (WERD_RES *word, ROW *row)
 
void nn_match_word (WERD_RES *word, ROW *row)
 
void nn_recover_rejects (WERD_RES *word, ROW *row)
 
void set_done (WERD_RES *word, inT16 pass)
 
inT16 safe_dict_word (const WERD_RES *werd_res)
 
void flip_hyphens (WERD_RES *word)
 
void reject_I_1_L (WERD_RES *word)
 
void reject_edge_blobs (WERD_RES *word)
 
void reject_mostly_rejects (WERD_RES *word)
 
BOOL8 word_adaptable (WERD_RES *word, uinT16 mode)
 
void recog_word_recursive (WERD_RES *word)
 
void recog_word (WERD_RES *word)
 
void split_and_recog_word (WERD_RES *word)
 
void split_word (WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
 
void join_words (WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
 
void match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block)
 
inT16 fp_eval_word_spacing (WERD_RES_LIST &word_res_list)
 
void dump_words (WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
 
BOOL8 fixspace_thinks_word_done (WERD_RES *word)
 
GARBAGE_LEVEL garbage_word (WERD_RES *word, BOOL8 ok_dict_word)
 
BOOL8 potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word)
 
void tilde_crunch (PAGE_RES_IT &page_res_it)
 
void unrej_good_quality_words (PAGE_RES_IT &page_res_it)
 
void doc_and_block_rejection (PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
 
void quality_based_rejection (PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
 
void convert_bad_unlv_chs (WERD_RES *word_res)
 
void tilde_delete (PAGE_RES_IT &page_res_it)
 
inT16 word_blob_quality (WERD_RES *word, ROW *row)
 
void word_char_quality (WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
 
void unrej_good_chs (WERD_RES *word, ROW *row)
 
inT16 count_outline_errs (char c, inT16 outline_count)
 
inT16 word_outline_errs (WERD_RES *word)
 
BOOL8 terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level)
 
CRUNCH_MODE word_deletable (WERD_RES *word, inT16 &delete_mode)
 
inT16 failure_count (WERD_RES *word)
 
BOOL8 noise_outlines (TWERD *word)
 
void tess_segment_pass_n (int pass_n, WERD_RES *word)
 
PAGE_RES * ApplyBoxes (const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list)
 
void PreenXHeights (BLOCK_LIST *block_list)
 
PAGE_RES * SetupApplyBoxes (const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
 
void MaximallyChopWord (const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
 
bool ResegmentCharBox (PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX &next_box, const char *correct_text)
 
bool ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text)
 
void ReSegmentByClassification (PAGE_RES *page_res)
 
bool ConvertStringToUnichars (const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
 
bool FindSegmentation (const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
 
void SearchForText (const GenericVector< BLOB_CHOICE_LIST *> *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
 
void TidyUp (PAGE_RES *page_res)
 
void ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
 
void CorrectClassifyWords (PAGE_RES *page_res)
 
void ApplyBoxTraining (const STRING &fontname, PAGE_RES *page_res)
 
int CountMisfitTops (WERD_RES *word_res)
 
float ComputeCompatibleXheight (WERD_RES *word_res, float *baseline_shift)
 
FILE * init_recog_training (const STRING &fname)
 
void recog_training_segmented (const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file)
 
void ambigs_classify_and_output (const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
 
eval_word_spacing()

The basic measure is the number of characters in contextually confirmed words (i.e. the word is done). If all words are contextually confirmed the evaluation is deemed perfect.

Some fiddles are done to handle "1"s as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred.

The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character on the other side of the space.

Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 sides of a "1" joined.

The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2.

BOOL8 digit_or_numeric_punct (WERD_RES *word, int char_position)
 
inT16 eval_word_spacing (WERD_RES_LIST &word_res_list)
 
fix_sp_fp_word()

Test the current word to see if it can be split by deleting noise blobs. If so, do the business. Return with the iterator pointing to the same place if the word is unchanged, or the last of the replacement words.

void fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
 
inT16 worst_noise_blob (WERD_RES *word_res, float *worst_noise_score)
 
float blob_noise_score (TBLOB *blob)
 
void break_noisiest_blob_word (WERD_RES_LIST &words)
 
fix_fuzzy_spaces()

Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.

Parameters
monitor: progress monitor
word_count: count of words in doc
[out] page_res
void fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
 
void fix_fuzzy_spaces (ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
 
process_selected_words()

Walk the current block list applying the specified word processor function to each word that overlaps the selection_box.

void process_selected_words (PAGE_RES *page_res, TBOX &selection_box, BOOL8(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
 
tess_add_doc_word

Add the given word to the document dictionary

void tess_add_doc_word (WERD_CHOICE *word_choice)
 
tess_acceptable_word
Returns
true if the word is regarded as "good enough".
Parameters
word_choice: after context
raw_choice: before context
bool tess_acceptable_word (WERD_RES *word)
 
- Public Member Functions inherited from tesseract::Wordrec
 Wordrec ()
 
virtual ~Wordrec ()
 
void SaveAltChoices (const LIST &best_choices, WERD_RES *word)
 
void FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void WordSearch (WERD_RES *word_res)
 
void InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void DoSegSearch (WERD_RES *word_res)
 
SEAM * attempt_blob_chop (TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)
 
SEAM * chop_numbered_blob (TWERD *word, inT32 blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)
 
SEAM * chop_overlapping_blob (const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
 
void add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams)
 
void choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
 
void combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
 
SEAM * pick_good_seam (TBLOB *blob)
 
void try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
void try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
PRIORITY grade_split_length (register SPLIT *split)
 
PRIORITY grade_sharpness (register SPLIT *split)
 
bool near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
 
virtual BLOB_CHOICE_LIST * classify_piece (const GenericVector< SEAM *> &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
 
void merge_fragments (MATRIX *ratings, inT16 num_blobs)
 
void get_fragment_lists (inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
 
void merge_and_put_fragment_lists (inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
 
void fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
 
void program_editup (const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
 
void cc_recog (WERD_RES *word)
 
void program_editdown (inT32 elasped_time)
 
void set_pass1 ()
 
void set_pass2 ()
 
int end_recog ()
 
BLOB_CHOICE_LIST * call_matcher (TBLOB *blob)
 
int dict_word (const WERD_CHOICE &word)
 
BLOB_CHOICE_LIST * classify_blob (TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
 
PRIORITY point_priority (EDGEPT *point)
 
void add_point_to_list (PointHeap *point_heap, EDGEPT *point)
 
bool is_inside_angle (EDGEPT *pt)
 
int angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
 
EDGEPT * pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
 
void prioritize_points (TESSLINE *outline, PointHeap *points)
 
void new_min_point (EDGEPT *local_min, PointHeap *points)
 
void new_max_point (EDGEPT *local_max, PointHeap *points)
 
void vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
 
SEAM * improve_one_blob (const GenericVector< BLOB_CHOICE *> &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
 
SEAM * chop_one_blob (const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE *> &blob_choices, WERD_RES *word_res, int *blob_number)
 
void chop_word_main (WERD_RES *word)
 
void improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
 
int select_blob_to_split (const GenericVector< BLOB_CHOICE *> &blob_choices, float rating_ceiling, bool split_next_to_fragment)
 
int select_blob_to_split_from_fixpt (DANGERR *fixpt)
 
- Public Member Functions inherited from tesseract::Classify
 Classify ()
 
virtual ~Classify ()
 
Dict & getDict ()
 
const ShapeTable * shape_table () const
 
void SetStaticClassifier (ShapeClassifier *static_classifier)
 
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
 
bool LargeSpeckle (const TBLOB &blob)
 
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
 
int GetFontinfoId (ADAPT_CLASS Class, uinT8 ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uinT8 *normalization_factors, const uinT16 *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
 
void ReadNewCutoffs (TFile *fp, CLASS_CUTOFF_ARRAY Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
ADAPT_TEMPLATES ReadAdaptedTemplates (TFile *File)
 
FLOAT32 ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, BOOL8 DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (TFile *fp)
 
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
 
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *fontname, WERD_RES *word)
 
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (TessdataManager *mgr)
 
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
 
void AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
 
void MasterMatcher (INT_TEMPLATES templates, inT16 num_features, const INT_FEATURE_STRUCT *features, const uinT8 *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uinT8 *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uinT8 *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (FLOAT32 Threshold)
 
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
 
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
 
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold, ADAPT_TEMPLATES adaptive_templates)
 
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
 
bool AdaptableWord (WERD_RES *word)
 
void EndAdaptiveClassifier ()
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
void SwitchAdaptiveClassifier ()
 
void StartBackupAdaptiveClassifier ()
 
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uinT8 *pruner_norm_array, uinT8 *char_norm_array)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uinT8 *char_norm_array, uinT8 *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
 
bool AdaptiveClassifierIsFull () const
 
bool AdaptiveClassifierIsEmpty () const
 
bool LooksLikeGarbage (TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uinT8 *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uinT8 *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES ReadIntTemplates (TFile *fp)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
const UnicityTable< FontInfo > & get_fontinfo_table () const
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
void LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
 
bool WriteTRFile (const STRING &filename)
 
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()
 
 ~CCStruct ()
 
- Public Member Functions inherited from tesseract::CUtil
 CUtil ()
 
 ~CUtil ()
 
void read_variables (const char *filename, bool global_only)
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const char *argv0, const char *basename)
 CCUtil::main_setup - set location of tessdata and name of image. More...
 
ParamsVectors * params ()
 

Public Attributes

bool tessedit_resegment_from_boxes = false
 
bool tessedit_resegment_from_line_boxes = false
 
bool tessedit_train_from_boxes = false
 
bool tessedit_make_boxes_from_boxes = false
 
bool tessedit_train_line_recognizer = false
 
bool tessedit_dump_pageseg_images = false
 
int tessedit_pageseg_mode = PSM_SINGLE_BLOCK
 
int tessedit_ocr_engine_mode = tesseract::OEM_DEFAULT
 
char * tessedit_char_blacklist = ""
 
char * tessedit_char_whitelist = ""
 
char * tessedit_char_unblacklist = ""
 
bool tessedit_ambigs_training = false
 
int pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT
 
int ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT
 
char * tessedit_write_params_to_file = ""
 
bool tessedit_adaption_debug = false
 
int bidi_debug = 0
 
int applybox_debug = 1
 
int applybox_page = 0
 
char * applybox_exposure_pattern = ".exp"
 
bool applybox_learn_chars_and_char_frags_mode = false
 
bool applybox_learn_ngrams_mode = false
 
bool tessedit_display_outwords = false
 
bool tessedit_dump_choices = false
 
bool tessedit_timing_debug = false
 
bool tessedit_fix_fuzzy_spaces = true
 
bool tessedit_unrej_any_wd = false
 
bool tessedit_fix_hyphens = true
 
bool tessedit_redo_xheight = true
 
bool tessedit_enable_doc_dict = true
 
bool tessedit_debug_fonts = false
 
bool tessedit_debug_block_rejection = false
 
bool tessedit_enable_bigram_correction = true
 
bool tessedit_enable_dict_correction = false
 
int tessedit_bigram_debug = 0
 
bool enable_noise_removal = true
 
int debug_noise_removal = 0
 
double noise_cert_basechar = -8.0
 
double noise_cert_disjoint = -2.5
 
double noise_cert_punc = -2.5
 
double noise_cert_factor = 0.375
 
int noise_maxperblob = 8
 
int noise_maxperword = 16
 
int debug_x_ht_level = 0
 
bool debug_acceptable_wds = false
 
char * chs_leading_punct = "('`\""
 
char * chs_trailing_punct1 = ").,;:?!"
 
char * chs_trailing_punct2 = ")'`\""
 
double quality_rej_pc = 0.08
 
double quality_blob_pc = 0.0
 
double quality_outline_pc = 1.0
 
double quality_char_pc = 0.95
 
int quality_min_initial_alphas_reqd = 2
 
int tessedit_tess_adaption_mode = 0x27
 
bool tessedit_minimal_rej_pass1 = false
 
bool tessedit_test_adaption = false
 
bool tessedit_matcher_log = false
 
int tessedit_test_adaption_mode = 3
 
bool test_pt = false
 
double test_pt_x = 99999.99
 
double test_pt_y = 99999.99
 
int multilang_debug_level = 0
 
int paragraph_debug_level = 0
 
bool paragraph_text_based = true
 
bool lstm_use_matrix = 1
 
char * outlines_odd = "%| "
 
char * outlines_2 = "ij!?%\":;"
 
bool docqual_excuse_outline_errs = false
 
bool tessedit_good_quality_unrej = true
 
bool tessedit_use_reject_spaces = true
 
double tessedit_reject_doc_percent = 65.00
 
double tessedit_reject_block_percent = 45.00
 
double tessedit_reject_row_percent = 40.00
 
double tessedit_whole_wd_rej_row_percent = 70.00
 
bool tessedit_preserve_blk_rej_perfect_wds = true
 
bool tessedit_preserve_row_rej_perfect_wds = true
 
bool tessedit_dont_blkrej_good_wds = false
 
bool tessedit_dont_rowrej_good_wds = false
 
int tessedit_preserve_min_wd_len = 2
 
bool tessedit_row_rej_good_docs = true
 
double tessedit_good_doc_still_rowrej_wd = 1.1
 
bool tessedit_reject_bad_qual_wds = true
 
bool tessedit_debug_doc_rejection = false
 
bool tessedit_debug_quality_metrics = false
 
bool bland_unrej = false
 
double quality_rowrej_pc = 1.1
 
bool unlv_tilde_crunching = true
 
bool hocr_font_info = false
 
bool crunch_early_merge_tess_fails = true
 
bool crunch_early_convert_bad_unlv_chs = false
 
double crunch_terrible_rating = 80.0
 
bool crunch_terrible_garbage = true
 
double crunch_poor_garbage_cert = -9.0
 
double crunch_poor_garbage_rate = 60
 
double crunch_pot_poor_rate = 40
 
double crunch_pot_poor_cert = -8.0
 
bool crunch_pot_garbage = true
 
double crunch_del_rating = 60
 
double crunch_del_cert = -10.0
 
double crunch_del_min_ht = 0.7
 
double crunch_del_max_ht = 3.0
 
double crunch_del_min_width = 3.0
 
double crunch_del_high_word = 1.5
 
double crunch_del_low_word = 0.5
 
double crunch_small_outlines_size = 0.6
 
int crunch_rating_max = 10
 
int crunch_pot_indicators = 1
 
bool crunch_leave_ok_strings = true
 
bool crunch_accept_ok = true
 
bool crunch_leave_accept_strings = false
 
bool crunch_include_numerals = false
 
int crunch_leave_lc_strings = 4
 
int crunch_leave_uc_strings = 4
 
int crunch_long_repetitions = 3
 
int crunch_debug = 0
 
int fixsp_non_noise_limit = 1
 
double fixsp_small_outlines_size = 0.28
 
bool tessedit_prefer_joined_punct = false
 
int fixsp_done_mode = 1
 
int debug_fix_space_level = 0
 
char * numeric_punctuation = ".,"
 
int x_ht_acceptance_tolerance = 8
 
int x_ht_min_change = 8
 
int superscript_debug = 0
 
double superscript_worse_certainty = 2.0
 
double superscript_bettered_certainty = 0.97
 
double superscript_scaledown_ratio = 0.4
 
double subscript_max_y_top = 0.5
 
double superscript_min_y_bottom = 0.3
 
bool tessedit_write_block_separators = false
 
bool tessedit_write_rep_codes = false
 
bool tessedit_write_unlv = false
 
bool tessedit_create_txt = false
 
bool tessedit_create_hocr = false
 
bool tessedit_create_tsv = false
 
bool tessedit_create_pdf = false
 
bool textonly_pdf = false
 
char * unrecognised_char = "|"
 
int suspect_level = 99
 
int suspect_space_level = 100
 
int suspect_short_words = 2
 
bool suspect_constrain_1Il = false
 
double suspect_rating_per_ch = 999.9
 
double suspect_accept_rating = -999.9
 
bool tessedit_minimal_rejection = false
 
bool tessedit_zero_rejection = false
 
bool tessedit_word_for_word = false
 
bool tessedit_zero_kelvin_rejection = false
 
bool tessedit_consistent_reps = true
 
int tessedit_reject_mode = 0
 
bool tessedit_rejection_debug = false
 
bool tessedit_flip_0O = true
 
double tessedit_lower_flip_hyphen = 1.5
 
double tessedit_upper_flip_hyphen = 1.8
 
bool rej_trust_doc_dawg = false
 
bool rej_1Il_use_dict_word = false
 
bool rej_1Il_trust_permuter_type = true
 
bool rej_use_tess_accepted = true
 
bool rej_use_tess_blanks = true
 
bool rej_use_good_perm = true
 
bool rej_use_sensible_wd = false
 
bool rej_alphas_in_number_perm = false
 
double rej_whole_of_mostly_reject_word_fract = 0.85
 
int tessedit_image_border = 2
 
char * ok_repeated_ch_non_alphanum_wds = "-?*\075"
 
char * conflict_set_I_l_1 = "Il1[]"
 
int min_sane_x_ht_pixels = 8
 
bool tessedit_create_boxfile = false
 
int tessedit_page_number = -1
 
bool tessedit_write_images = false
 
bool interactive_display_mode = false
 
char * file_type = ".tif"
 
bool tessedit_override_permuter = true
 
char * tessedit_load_sublangs = ""
 
bool tessedit_use_primary_params_model = false
 
double min_orientation_margin = 7.0
 
bool textord_tabfind_show_vlines = false
 
bool textord_use_cjk_fp_model = FALSE
 
bool poly_allow_detailed_fx = false
 
bool tessedit_init_config_only = false
 
bool textord_equation_detect = false
 
bool textord_tabfind_vertical_text = true
 
bool textord_tabfind_force_vertical_text = false
 
double textord_tabfind_vertical_text_ratio = 0.5
 
double textord_tabfind_aligned_gap_fraction = 0.75
 
int tessedit_parallelize = 0
 
bool preserve_interword_spaces = false
 
bool include_page_breaks = false
 
char * page_separator = "\f"
 
bool textord_tabfind_vertical_horizontal_mix = true
 
int tessedit_ok_mode = 5
 
bool load_fixed_length_dawgs = true
 
int segment_debug = 0
 
bool permute_debug = 0
 
double bestrate_pruning_factor = 2.0
 
bool permute_script_word = 0
 
bool segment_segcost_rating = 0
 
double segment_reward_script = 0.95
 
bool permute_fixed_length_dawg = 0
 
bool permute_chartype_word = 0
 
double segment_reward_chartype = 0.97
 
double segment_reward_ngram_best_choice = 0.99
 
bool ngram_permuter_activated = false
 
bool permute_only_top = false
 
int language_model_fixed_length_choices_depth = 3
 
bool use_new_state_cost = FALSE
 
double heuristic_segcost_rating_base = 1.25
 
double heuristic_weight_rating = 1
 
double heuristic_weight_width = 1000.0
 
double heuristic_weight_seamcut = 0
 
double heuristic_max_char_wh_ratio = 2.0
 
bool enable_new_segsearch = false
 
double segsearch_max_fixed_pitch_char_wh_ratio = 2.0
 
- Public Attributes inherited from tesseract::Wordrec
bool merge_fragments_in_matrix = TRUE
 
bool wordrec_no_block = FALSE
 
bool wordrec_enable_assoc = TRUE
 
bool force_word_assoc = FALSE
 
double wordrec_worst_state = 1
 
bool fragments_guide_chopper = FALSE
 
int repair_unchopped_blobs = 1
 
double tessedit_certainty_threshold = -2.25
 
int chop_debug = 0
 
bool chop_enable = 1
 
bool chop_vertical_creep = 0
 
int chop_split_length = 10000
 
int chop_same_distance = 2
 
int chop_min_outline_points = 6
 
int chop_seam_pile_size = 150
 
bool chop_new_seam_pile = 1
 
int chop_inside_angle = -50
 
int chop_min_outline_area = 2000
 
double chop_split_dist_knob = 0.5
 
double chop_overlap_knob = 0.9
 
double chop_center_knob = 0.15
 
int chop_centered_maxwidth = 90
 
double chop_sharpness_knob = 0.06
 
double chop_width_change_knob = 5.0
 
double chop_ok_split = 100.0
 
double chop_good_split = 50.0
 
int chop_x_y_weight = 3
 
int segment_adjust_debug = 0
 
bool assume_fixed_pitch_char_segment = FALSE
 
int wordrec_debug_level = 0
 
int wordrec_max_join_chunks = 4
 
bool wordrec_skip_no_truth_words = false
 
bool wordrec_debug_blamer = false
 
bool wordrec_run_blamer = false
 
int segsearch_debug_level = 0
 
int segsearch_max_pain_points = 2000
 
int segsearch_max_futile_classifications = 10
 
double segsearch_max_char_wh_ratio = 2.0
 
bool save_alt_choices = true
 
LanguageModel * language_model_
 
PRIORITY pass2_ok_split
 
WERD_CHOICE * prev_word_best_choice_
 
GenericVector< int > blame_reasons_
 
void(Wordrec::* fill_lattice_ )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
- Public Attributes inherited from tesseract::Classify
bool allow_blob_division = true
 
bool prioritize_division = FALSE
 
int tessedit_single_match = FALSE
 
bool classify_enable_learning = true
 
int classify_debug_level = 0
 
int classify_norm_method = character
 
double classify_char_norm_range = 0.2
 
double classify_min_norm_scale_x = 0.0
 
double classify_max_norm_scale_x = 0.325
 
double classify_min_norm_scale_y = 0.0
 
double classify_max_norm_scale_y = 0.325
 
double classify_max_rating_ratio = 1.5
 
double classify_max_certainty_margin = 5.5
 
bool tess_cn_matching = 0
 
bool tess_bn_matching = 0
 
bool classify_enable_adaptive_matcher = 1
 
bool classify_use_pre_adapted_templates = 0
 
bool classify_save_adapted_templates = 0
 
bool classify_enable_adaptive_debugger = 0
 
bool classify_nonlinear_norm = 0
 
int matcher_debug_level = 0
 
int matcher_debug_flags = 0
 
int classify_learning_debug_level = 0
 
double matcher_good_threshold = 0.125
 
double matcher_reliable_adaptive_result = 0.0
 
double matcher_perfect_threshold = 0.02
 
double matcher_bad_match_pad = 0.15
 
double matcher_rating_margin = 0.1
 
double matcher_avg_noise_size = 12.0
 
int matcher_permanent_classes_min = 1
 
int matcher_min_examples_for_prototyping = 3
 
int matcher_sufficient_examples_for_prototyping = 5
 
double matcher_clustering_max_angle_delta = 0.015
 
double classify_misfit_junk_penalty = 0.0
 
double rating_scale = 1.5
 
double certainty_scale = 20.0
 
double tessedit_class_miss_scale = 0.00390625
 
double classify_adapted_pruning_factor = 2.5
 
double classify_adapted_pruning_threshold = -1.0
 
int classify_adapt_proto_threshold = 230
 
int classify_adapt_feature_threshold = 230
 
bool disable_character_fragments = TRUE
 
double classify_character_fragments_garbage_certainty_threshold = -3.0
 
bool classify_debug_character_fragments = FALSE
 
bool matcher_debug_separate_windows = FALSE
 
char * classify_learn_debug_str = ""
 
int classify_class_pruner_threshold = 229
 
int classify_class_pruner_multiplier = 15
 
int classify_cp_cutoff_strength = 7
 
int classify_integer_matcher_multiplier = 10
 
INT_TEMPLATES PreTrainedTemplates
 
ADAPT_TEMPLATES AdaptedTemplates
 
ADAPT_TEMPLATES BackupAdaptedTemplates
 
BIT_VECTOR AllProtosOn
 
BIT_VECTOR AllConfigsOn
 
BIT_VECTOR AllConfigsOff
 
BIT_VECTOR TempProtoMask
 
bool EnableLearning
 
NORM_PROTOS * NormProtos
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
int il1_adaption_test = 0
 
bool classify_bln_numeric_mode = 0
 
double speckle_large_max_size = 0.30
 
double speckle_rating_penalty = 10.0
 
- Public Attributes inherited from tesseract::CCUtil
STRING datadir
 
STRING imagebasename
 
STRING lang
 
STRING language_data_path_prefix
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
STRING imagefile
 
STRING directory
 
char * m_data_sub_dir = "tessdata/"
 
int ambigs_debug_level = 0
 
bool use_definite_ambigs_for_classifier = 0
 
bool use_ambigs_for_adaption = 0
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Classify
static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
 
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
 
- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 
- Protected Member Functions inherited from tesseract::Wordrec
bool SegSearchDone (int num_futile_classifications)
 
void UpdateSegSearchNodes (float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
 
void ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
 
void InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
 
- Protected Attributes inherited from tesseract::Classify
IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_
 

Detailed Description

Definition at line 164 of file tesseractclass.h.

Constructor & Destructor Documentation

◆ Tesseract()

tesseract::Tesseract::Tesseract ( )

Definition at line 54 of file tesseractclass.cpp.

56  "Take segmentation and labeling from box file",
57  this->params()),
59  "Conversion of word/line box file to char box file",
60  this->params()),
62  "Generate training data from boxed chars", this->params()),
64  "Generate more boxes from boxed chars", this->params()),
66  "Break input into lines and remap boxes if present",
67  this->params()),
69  "Dump intermediate images made during page segmentation",
70  this->params()),
71  // The default for pageseg_mode is the old behaviour, so as not to
72  // upset anything that relies on that.
73  INT_MEMBER(
75  "Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block,"
76  " 5=line, 6=word, 7=char"
77  " (Values from PageSegMode enum in publictypes.h)",
78  this->params()),
80  "Which OCR engine(s) to run (Tesseract, LSTM, both)."
81  " Defaults to loading and running the most accurate"
82  " available.",
83  this->params()),
85  "Blacklist of chars not to recognize", this->params()),
87  "Whitelist of chars to recognize", this->params()),
89  "List of chars to override tessedit_char_blacklist",
90  this->params()),
92  "Perform training for ambiguities", this->params()),
95  "Whether to use the top-line splitting process for Devanagari "
96  "documents while performing page-segmentation.",
97  this->params()),
100  "Whether to use the top-line splitting process for Devanagari "
101  "documents while performing ocr.",
102  this->params()),
104  "Write all parameters to the given file.", this->params()),
106  "Generate and print debug"
107  " information for adaption",
108  this->params()),
109  INT_MEMBER(bidi_debug, 0, "Debug level for BiDi", this->params()),
110  INT_MEMBER(applybox_debug, 1, "Debug level", this->params()),
111  INT_MEMBER(applybox_page, 0, "Page number to apply boxes from",
112  this->params()),
114  "Exposure value follows"
115  " this pattern in the image filename. The name of the image"
116  " files are expected to be in the form"
117  " [lang].[fontname].exp[num].tif",
118  this->params()),
120  "Learn both character fragments (as is done in the"
121  " special low exposure mode) as well as unfragmented"
122  " characters.",
123  this->params()),
125  "Each bounding box"
126  " is assumed to contain ngrams. Only learn the ngrams"
127  " whose outlines overlap horizontally.",
128  this->params()),
129  BOOL_MEMBER(tessedit_display_outwords, false, "Draw output words",
130  this->params()),
131  BOOL_MEMBER(tessedit_dump_choices, false, "Dump char choices",
132  this->params()),
133  BOOL_MEMBER(tessedit_timing_debug, false, "Print timing stats",
134  this->params()),
136  "Try to improve fuzzy spaces", this->params()),
138  "Don't bother with word plausibility", this->params()),
139  BOOL_MEMBER(tessedit_fix_hyphens, true, "Crunch double hyphens?",
140  this->params()),
141  BOOL_MEMBER(tessedit_redo_xheight, true, "Check/Correct x-height",
142  this->params()),
144  "Add words to the document dictionary", this->params()),
145  BOOL_MEMBER(tessedit_debug_fonts, false, "Output font info per char",
146  this->params()),
147  BOOL_MEMBER(tessedit_debug_block_rejection, false, "Block and Row stats",
148  this->params()),
150  "Enable correction based on the word bigram dictionary.",
151  this->params()),
153  "Enable single word correction based on the dictionary.",
154  this->params()),
156  "Amount of debug output for bigram correction.",
157  this->params()),
159  "Remove and conditionally reassign small outlines when they"
160  " confuse layout analysis, determining diacritics vs noise",
161  this->params()),
162  INT_MEMBER(debug_noise_removal, 0, "Debug reassignment of small outlines",
163  this->params()),
164  // Worst (min) certainty, for which a diacritic is allowed to make the
165  // base
166  // character worse and still be included.
168  "Hingepoint for base char certainty", this->params()),
169  // Worst (min) certainty, for which a non-overlapping diacritic is allowed
170  // to make the base character worse and still be included.
172  "Hingepoint for disjoint certainty", this->params()),
173  // Worst (min) certainty, for which a diacritic is allowed to make a new
174  // stand-alone blob.
176  "Threshold for new punc char certainty", this->params()),
177  // Factor of certainty margin for adding diacritics to not count as worse.
179  "Scaling on certainty diff from Hingepoint",
180  this->params()),
181  INT_MEMBER(noise_maxperblob, 8, "Max diacritics to apply to a blob",
182  this->params()),
183  INT_MEMBER(noise_maxperword, 16, "Max diacritics to apply to a word",
184  this->params()),
185  INT_MEMBER(debug_x_ht_level, 0, "Reestimate debug", this->params()),
186  BOOL_MEMBER(debug_acceptable_wds, false, "Dump word pass/fail chk",
187  this->params()),
188  STRING_MEMBER(chs_leading_punct, "('`\"", "Leading punctuation",
189  this->params()),
190  STRING_MEMBER(chs_trailing_punct1, ").,;:?!", "1st Trailing punctuation",
191  this->params()),
192  STRING_MEMBER(chs_trailing_punct2, ")'`\"", "2nd Trailing punctuation",
193  this->params()),
195  "good_quality_doc lte rejection limit", this->params()),
197  "good_quality_doc gte good blobs limit", this->params()),
199  "good_quality_doc lte outline error limit", this->params()),
201  "good_quality_doc gte good char limit", this->params()),
202  INT_MEMBER(quality_min_initial_alphas_reqd, 2, "alphas in a good word",
203  this->params()),
205  "Adaptation decision algorithm for tess", this->params()),
207  "Do minimal rejection on pass 1 output", this->params()),
208  BOOL_MEMBER(tessedit_test_adaption, false, "Test adaption criteria",
209  this->params()),
210  BOOL_MEMBER(tessedit_matcher_log, false, "Log matcher activity",
211  this->params()),
213  "Adaptation decision algorithm for tess", this->params()),
214  BOOL_MEMBER(test_pt, false, "Test for point", this->params()),
215  double_MEMBER(test_pt_x, 99999.99, "xcoord", this->params()),
216  double_MEMBER(test_pt_y, 99999.99, "ycoord", this->params()),
217  INT_MEMBER(multilang_debug_level, 0, "Print multilang debug info.",
218  this->params()),
219  INT_MEMBER(paragraph_debug_level, 0, "Print paragraph debug info.",
220  this->params()),
222  "Run paragraph detection on the post-text-recognition "
223  "(more accurate)",
224  this->params()),
226  "Use ratings matrix/beam search with lstm", this->params()),
227  STRING_MEMBER(outlines_odd, "%| ", "Non standard number of outlines",
228  this->params()),
229  STRING_MEMBER(outlines_2, "ij!?%\":;", "Non standard number of outlines",
230  this->params()),
232  "Allow outline errs in unrejection?", this->params()),
234  "Reduce rejection on good docs", this->params()),
235  BOOL_MEMBER(tessedit_use_reject_spaces, true, "Reject spaces?",
236  this->params()),
238  "%rej allowed before rej whole doc", this->params()),
240  "%rej allowed before rej whole block", this->params()),
242  "%rej allowed before rej whole row", this->params()),
244  "Number of row rejects in whole word rejects"
245  "which prevents whole row rejection",
246  this->params()),
248  "Only rej partially rejected words in block rejection",
249  this->params()),
251  "Only rej partially rejected words in row rejection",
252  this->params()),
254  "Use word segmentation quality metric", this->params()),
256  "Use word segmentation quality metric", this->params()),
258  "Only preserve wds longer than this", this->params()),
260  "Apply row rejection to good docs", this->params()),
262  "rej good doc wd if more than this fraction rejected",
263  this->params()),
265  "Reject all bad quality wds", this->params()),
266  BOOL_MEMBER(tessedit_debug_doc_rejection, false, "Page stats",
267  this->params()),
269  "Output data to debug file", this->params()),
270  BOOL_MEMBER(bland_unrej, false, "unrej potential with no checks",
271  this->params()),
273  "good_quality_doc gte good char limit", this->params()),
275  "Mark v.bad words for tilde crunch", this->params()),
276  BOOL_MEMBER(hocr_font_info, false, "Add font info to hocr output",
277  this->params()),
278  BOOL_MEMBER(crunch_early_merge_tess_fails, true, "Before word crunch?",
279  this->params()),
281  "Take out ~^ early?", this->params()),
282  double_MEMBER(crunch_terrible_rating, 80.0, "crunch rating lt this",
283  this->params()),
284  BOOL_MEMBER(crunch_terrible_garbage, true, "As it says", this->params()),
286  "crunch garbage cert lt this", this->params()),
288  "crunch garbage rating lt this", this->params()),
289  double_MEMBER(crunch_pot_poor_rate, 40, "POTENTIAL crunch rating lt this",
290  this->params()),
291  double_MEMBER(crunch_pot_poor_cert, -8.0, "POTENTIAL crunch cert lt this",
292  this->params()),
293  BOOL_MEMBER(crunch_pot_garbage, true, "POTENTIAL crunch garbage",
294  this->params()),
295  double_MEMBER(crunch_del_rating, 60, "POTENTIAL crunch rating lt this",
296  this->params()),
297  double_MEMBER(crunch_del_cert, -10.0, "POTENTIAL crunch cert lt this",
298  this->params()),
299  double_MEMBER(crunch_del_min_ht, 0.7, "Del if word ht lt xht x this",
300  this->params()),
301  double_MEMBER(crunch_del_max_ht, 3.0, "Del if word ht gt xht x this",
302  this->params()),
304  "Del if word width lt xht x this", this->params()),
306  "Del if word gt xht x this above bl", this->params()),
308  "Del if word gt xht x this below bl", this->params()),
309  double_MEMBER(crunch_small_outlines_size, 0.6, "Small if lt xht x this",
310  this->params()),
311  INT_MEMBER(crunch_rating_max, 10, "For adj length in rating per ch",
312  this->params()),
314  "How many potential indicators needed", this->params()),
315  BOOL_MEMBER(crunch_leave_ok_strings, true, "Don't touch sensible strings",
316  this->params()),
317  BOOL_MEMBER(crunch_accept_ok, true, "Use acceptability in okstring",
318  this->params()),
320  "Don't pot crunch sensible strings", this->params()),
321  BOOL_MEMBER(crunch_include_numerals, false, "Fiddle alpha figures",
322  this->params()),
324  "Don't crunch words with long lower case strings",
325  this->params()),
327  "Don't crunch words with long lower case strings",
328  this->params()),
330  "Crunch words with long repetitions", this->params()),
331  INT_MEMBER(crunch_debug, 0, "As it says", this->params()),
333  "How many non-noise blbs either side?", this->params()),
334  double_MEMBER(fixsp_small_outlines_size, 0.28, "Small if lt xht x this",
335  this->params()),
337  "Reward punctation joins", this->params()),
338  INT_MEMBER(fixsp_done_mode, 1, "What constitues done for spacing",
339  this->params()),
340  INT_MEMBER(debug_fix_space_level, 0, "Contextual fixspace debug",
341  this->params()),
343  "Punct. chs expected WITHIN numbers", this->params()),
345  "Max allowed deviation of blob top outside of font data",
346  this->params()),
348  "Min change in xht before actually trying it", this->params()),
350  "Debug level for sub & superscript fixer", this->params()),
353  "How many times worse "
354  "certainty does a superscript position glyph need to be for "
355  "us to try classifying it as a char with a different "
356  "baseline?",
357  this->params()),
360  "What reduction in "
361  "badness do we think sufficient to choose a superscript "
362  "over what we'd thought. For example, a value of 0.6 means "
363  "we want to reduce badness of certainty by at least 40%",
364  this->params()),
366  "A superscript scaled down more than this is unbelievably "
367  "small. For example, 0.3 means we expect the font size to "
368  "be no smaller than 30% of the text line font size.",
369  this->params()),
371  "Maximum top of a character measured as a multiple of "
372  "x-height above the baseline for us to reconsider whether "
373  "it's a subscript.",
374  this->params()),
376  "Minimum bottom of a character measured as a multiple of "
377  "x-height above the baseline for us to reconsider whether "
378  "it's a superscript.",
379  this->params()),
381  "Write block separators in output", this->params()),
382  BOOL_MEMBER(tessedit_write_rep_codes, false, "Write repetition char code",
383  this->params()),
384  BOOL_MEMBER(tessedit_write_unlv, false, "Write .unlv output file",
385  this->params()),
386  BOOL_MEMBER(tessedit_create_txt, false, "Write .txt output file",
387  this->params()),
388  BOOL_MEMBER(tessedit_create_hocr, false, "Write .html hOCR output file",
389  this->params()),
390  BOOL_MEMBER(tessedit_create_tsv, false, "Write .tsv output file",
391  this->params()),
392  BOOL_MEMBER(tessedit_create_pdf, false, "Write .pdf output file",
393  this->params()),
394  BOOL_MEMBER(textonly_pdf, false,
395  "Create PDF with only one invisible text layer",
396  this->params()),
398  "Output char for unidentified blobs", this->params()),
399  INT_MEMBER(suspect_level, 99, "Suspect marker level", this->params()),
401  "Min suspect level for rejecting spaces", this->params()),
403  "Don't suspect dict wds longer than this", this->params()),
404  BOOL_MEMBER(suspect_constrain_1Il, false, "UNLV keep 1Il chars rejected",
405  this->params()),
407  "Don't touch bad rating limit", this->params()),
408  double_MEMBER(suspect_accept_rating, -999.9, "Accept good rating limit",
409  this->params()),
411  "Only reject tess failures", this->params()),
412  BOOL_MEMBER(tessedit_zero_rejection, false, "Don't reject ANYTHING",
413  this->params()),
415  "Make output have exactly one word per WERD", this->params()),
417  "Don't reject ANYTHING AT ALL", this->params()),
419  "Force all rep chars the same", this->params()),
420  INT_MEMBER(tessedit_reject_mode, 0, "Rejection algorithm",
421  this->params()),
422  BOOL_MEMBER(tessedit_rejection_debug, false, "Adaption debug",
423  this->params()),
424  BOOL_MEMBER(tessedit_flip_0O, true, "Contextual 0O O0 flips",
425  this->params()),
427  "Aspect ratio dot/hyphen test", this->params()),
429  "Aspect ratio dot/hyphen test", this->params()),
431  "Use DOC dawg in 11l conf. detector", this->params()),
432  BOOL_MEMBER(rej_1Il_use_dict_word, false, "Use dictword test",
433  this->params()),
434  BOOL_MEMBER(rej_1Il_trust_permuter_type, true, "Don't double check",
435  this->params()),
436  BOOL_MEMBER(rej_use_tess_accepted, true, "Individual rejection control",
437  this->params()),
438  BOOL_MEMBER(rej_use_tess_blanks, true, "Individual rejection control",
439  this->params()),
440  BOOL_MEMBER(rej_use_good_perm, true, "Individual rejection control",
441  this->params()),
442  BOOL_MEMBER(rej_use_sensible_wd, false, "Extend permuter check",
443  this->params()),
444  BOOL_MEMBER(rej_alphas_in_number_perm, false, "Extend permuter check",
445  this->params()),
447  "if >this fract", this->params()),
448  INT_MEMBER(tessedit_image_border, 2, "Rej blbs near image edge limit",
449  this->params()),
451  "Allow NN to unrej", this->params()),
452  STRING_MEMBER(conflict_set_I_l_1, "Il1[]", "Il1 conflict set",
453  this->params()),
454  INT_MEMBER(min_sane_x_ht_pixels, 8, "Reject any x-ht lt or eq than this",
455  this->params()),
456  BOOL_MEMBER(tessedit_create_boxfile, false, "Output text with boxes",
457  this->params()),
459  "-1 -> All pages"
460  " , else specific page to process",
461  this->params()),
463  "Capture the image from the IPE", this->params()),
464  BOOL_MEMBER(interactive_display_mode, false, "Run interactively?",
465  this->params()),
466  STRING_MEMBER(file_type, ".tif", "Filename extension", this->params()),
467  BOOL_MEMBER(tessedit_override_permuter, true, "According to dict_word",
468  this->params()),
470  "List of languages to load with this one", this->params()),
472  "In multilingual mode use params model of the"
473  " primary language",
474  this->params()),
476  "Min acceptable orientation margin", this->params()),
477  BOOL_MEMBER(textord_tabfind_show_vlines, false, "Debug line finding",
478  this->params()),
479  BOOL_MEMBER(textord_use_cjk_fp_model, FALSE, "Use CJK fixed pitch model",
480  this->params()),
482  "Allow feature extractors to see the original outline",
483  this->params()),
485  "Only initialize with the config file. Useful if the "
486  "instance is not going to be used for OCR but say only "
487  "for layout analysis.",
488  this->params()),
489  BOOL_MEMBER(textord_equation_detect, false, "Turn on equation detector",
490  this->params()),
492  "Enable vertical detection", this->params()),
494  "Force using vertical text page mode", this->params()),
497  "Fraction of textlines deemed vertical to use vertical page "
498  "mode",
499  this->params()),
502  "Fraction of height used as a minimum gap for aligned blobs.",
503  this->params()),
504  INT_MEMBER(tessedit_parallelize, 0, "Run in parallel where possible",
505  this->params()),
507  "Preserve multiple interword spaces", this->params()),
509  "Include page separator string in output text after each "
510  "image/page.",
511  this->params()),
513  "Page separator (default is form feed control character)",
514  this->params()),
515 
516  // The following parameters were deprecated and removed from their
517  // original
518  // locations. The parameters are temporarily kept here to give Tesseract
519  // users a chance to updated their [lang].traineddata and config files
520  // without introducing failures during Tesseract initialization.
521  // TODO(ocr-team): remove these parameters from the code once we are
522  // reasonably sure that Tesseract users have updated their data files.
523  //
524  // BEGIN DEPRECATED PARAMETERS
526  "find horizontal lines such as headers in vertical page mode",
527  this->params()),
528  INT_MEMBER(tessedit_ok_mode, 5, "Acceptance decision algorithm",
529  this->params()),
531  "Load fixed length dawgs"
532  " (e.g. for non-space delimited languages)",
533  this->params()),
534  INT_MEMBER(segment_debug, 0, "Debug the whole segmentation process",
535  this->params()),
536  BOOL_MEMBER(permute_debug, 0, "Debug char permutation process",
537  this->params()),
539  "Multiplying factor of"
540  " current best rate to prune other hypotheses",
541  this->params()),
543  "Turn on word script consistency permuter", this->params()),
545  "incorporate segmentation cost in word rating?",
546  this->params()),
548  "Score multipler for script consistency within a word. "
549  "Being a 'reward' factor, it should be <= 1. "
550  "Smaller value implies bigger reward.",
551  this->params()),
553  "Turn on fixed-length phrasebook search permuter",
554  this->params()),
556  "Turn on character type (property) consistency permuter",
557  this->params()),
559  "Score multipler for char type consistency within a word. ",
560  this->params()),
562  "Score multipler for ngram permuter's best choice"
563  " (only used in the Han script path).",
564  this->params()),
566  "Activate character-level n-gram-based permuter",
567  this->params()),
568  BOOL_MEMBER(permute_only_top, false, "Run only the top choice permuter",
569  this->params()),
571  "Depth of blob choice lists to explore"
572  " when fixed length dawgs are on",
573  this->params()),
575  "use new state cost heuristics for segmentation state"
576  " evaluation",
577  this->params()),
579  "base factor for adding segmentation cost into word rating."
580  "It's a multiplying factor, the larger the value above 1, "
581  "the bigger the effect of segmentation cost.",
582  this->params()),
584  "weight associated with char rating in combined cost of"
585  "state",
586  this->params()),
588  "weight associated with width evidence in combined cost of"
589  " state",
590  this->params()),
592  "weight associated with seam cut in combined cost of state",
593  this->params()),
595  "max char width-to-height ratio allowed in segmentation",
596  this->params()),
598  "Enable new segmentation search path.", this->params()),
600  "Maximum character width-to-height ratio for"
601  " fixed-pitch fonts",
602  this->params()),
603  // END DEPRECATED PARAMETERS
604 
605  backup_config_file_(NULL),
606  pix_binary_(NULL),
607  pix_grey_(NULL),
608  pix_original_(NULL),
609  pix_thresholds_(NULL),
610  source_resolution_(0),
611  textord_(this),
612  right_to_left_(false),
613  scaled_color_(NULL),
614  scaled_factor_(-1),
615  deskew_(1.0f, 0.0f),
616  reskew_(1.0f, 0.0f),
617  most_recently_used_(this),
618  font_table_size_(0),
619  equ_detect_(NULL),
620 #ifndef ANDROID_BUILD
621  lstm_recognizer_(NULL),
622 #endif
623  train_line_page_num_(0) {
624 }
double rej_whole_of_mostly_reject_word_fract
double textord_tabfind_vertical_text_ratio
double heuristic_segcost_rating_base
ParamsVectors * params()
Definition: ccutil.h:62
int language_model_fixed_length_choices_depth
char * ok_repeated_ch_non_alphanum_wds
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:309
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:160
double superscript_worse_certainty
double tessedit_reject_doc_percent
bool tessedit_preserve_row_rej_perfect_wds
bool tessedit_enable_bigram_correction
double segsearch_max_fixed_pitch_char_wh_ratio
#define FALSE
Definition: capi.h:46
bool textord_tabfind_vertical_horizontal_mix
#define INT_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:312
double tessedit_good_doc_still_rowrej_wd
bool tessedit_preserve_blk_rej_perfect_wds
#define STRING_MEMBER(name, val, comment, vec)
Definition: params.h:306
bool crunch_early_convert_bad_unlv_chs
double segment_reward_ngram_best_choice
double tessedit_whole_wd_rej_row_percent
char * tessedit_write_params_to_file
double superscript_bettered_certainty
#define BOOL_INIT_MEMBER(name, val, comment, vec)
Definition: params.h:315
double tessedit_reject_block_percent
double textord_tabfind_aligned_gap_fraction
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:303
double tessedit_reject_row_percent
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:300
bool tessedit_resegment_from_line_boxes
bool applybox_learn_chars_and_char_frags_mode
bool textord_tabfind_force_vertical_text

◆ ~Tesseract()

tesseract::Tesseract::~Tesseract ( )

Definition at line 626 of file tesseractclass.cpp.

626  {
627  Clear();
628  pixDestroy(&pix_original_);
629  end_tesseract();
630  sub_langs_.delete_data_pointers();
631 #ifndef ANDROID_BUILD
632  delete lstm_recognizer_;
633  lstm_recognizer_ = NULL;
634 #endif
635 }

Member Function Documentation

◆ acceptable_number_string()

BOOL8 tesseract::Tesseract::acceptable_number_string ( const char *  s,
const char *  lengths 
)

Definition at line 419 of file output.cpp.

420  {
421  BOOL8 prev_digit = FALSE;
422 
423  if (*lengths == 1 && *s == '(')
424  s++;
425 
426  if (*lengths == 1 &&
427  ((*s == '$') || (*s == '.') || (*s == '+') || (*s == '-')))
428  s++;
429 
430  for (; *s != '\0'; s += *(lengths++)) {
431  if (unicharset.get_isdigit(s, *lengths))
432  prev_digit = TRUE;
433  else if (prev_digit &&
434  (*lengths == 1 && ((*s == '.') || (*s == ',') || (*s == '-'))))
435  prev_digit = FALSE;
436  else if (prev_digit && *lengths == 1 &&
437  (*(s + *lengths) == '\0') && ((*s == '%') || (*s == ')')))
438  return TRUE;
439  else if (prev_digit &&
440  *lengths == 1 && (*s == '%') &&
441  (*(lengths + 1) == 1 && *(s + *lengths) == ')') &&
442  (*(s + *lengths + *(lengths + 1)) == '\0'))
443  return TRUE;
444  else
445  return FALSE;
446  }
447  return TRUE;
448 }
#define TRUE
Definition: capi.h:45
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
unsigned char BOOL8
Definition: host.h:44
#define FALSE
Definition: capi.h:46
UNICHARSET unicharset
Definition: ccutil.h:68

◆ acceptable_word_string()

ACCEPTABLE_WERD_TYPE tesseract::Tesseract::acceptable_word_string ( const UNICHARSET &  char_set,
const char *  s,
const char *  lengths 
)

Definition at line 1690 of file control.cpp.

1691  {
1692  int i = 0;
1693  int offset = 0;
1694  int leading_punct_count;
1695  int upper_count = 0;
1696  int hyphen_pos = -1;
1697  ACCEPTABLE_WERD_TYPE word_type = AC_UNACCEPTABLE;
1698 
1699  if (strlen (lengths) > 20)
1700  return word_type;
1701 
1702  /* Single Leading punctuation char*/
1703 
1704  if (s[offset] != '\0' && STRING(chs_leading_punct).contains(s[offset]))
1705  offset += lengths[i++];
1706  leading_punct_count = i;
1707 
1708  /* Initial cap */
1709  while (s[offset] != '\0' && char_set.get_isupper(s + offset, lengths[i])) {
1710  offset += lengths[i++];
1711  upper_count++;
1712  }
1713  if (upper_count > 1) {
1714  word_type = AC_UPPER_CASE;
1715  } else {
1716  /* Lower case word, possibly with an initial cap */
1717  while (s[offset] != '\0' && char_set.get_islower(s + offset, lengths[i])) {
1718  offset += lengths[i++];
1719  }
1720  if (i - leading_punct_count < quality_min_initial_alphas_reqd)
1721  goto not_a_word;
1722  /*
1723  Allow a single hyphen in a lower case word
1724  - don't trust upper case - I've seen several cases of "H" -> "I-I"
1725  */
1726  if (lengths[i] == 1 && s[offset] == '-') {
1727  hyphen_pos = i;
1728  offset += lengths[i++];
1729  if (s[offset] != '\0') {
1730  while ((s[offset] != '\0') &&
1731  char_set.get_islower(s + offset, lengths[i])) {
1732  offset += lengths[i++];
1733  }
1734  if (i < hyphen_pos + 3)
1735  goto not_a_word;
1736  }
1737  } else {
1738  /* Allow "'s" in NON hyphenated lower case words */
1739  if (lengths[i] == 1 && (s[offset] == '\'') &&
1740  lengths[i + 1] == 1 && (s[offset + lengths[i]] == 's')) {
1741  offset += lengths[i++];
1742  offset += lengths[i++];
1743  }
1744  }
1745  if (upper_count > 0)
1746  word_type = AC_INITIAL_CAP;
1747  else
1748  word_type = AC_LOWER_CASE;
1749  }
1750 
1751  /* Up to two different, constrained trailing punctuation chars */
1752  if (lengths[i] == 1 && s[offset] != '\0' &&
1753  STRING(chs_trailing_punct1).contains(s[offset]))
1754  offset += lengths[i++];
1755  if (lengths[i] == 1 && s[offset] != '\0' && i > 0 &&
1756  s[offset - lengths[i - 1]] != s[offset] &&
1757  STRING(chs_trailing_punct2).contains (s[offset]))
1758  offset += lengths[i++];
1759 
1760  if (s[offset] != '\0')
1761  word_type = AC_UNACCEPTABLE;
1762 
1763  not_a_word:
1764 
1765  if (word_type == AC_UNACCEPTABLE) {
1766  /* Look for abbreviation string */
1767  i = 0;
1768  offset = 0;
1769  if (s[0] != '\0' && char_set.get_isupper(s, lengths[0])) {
1770  word_type = AC_UC_ABBREV;
1771  while (s[offset] != '\0' &&
1772  char_set.get_isupper(s + offset, lengths[i]) &&
1773  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1774  offset += lengths[i++];
1775  offset += lengths[i++];
1776  }
1777  }
1778  else if (s[0] != '\0' && char_set.get_islower(s, lengths[0])) {
1779  word_type = AC_LC_ABBREV;
1780  while (s[offset] != '\0' &&
1781  char_set.get_islower(s + offset, lengths[i]) &&
1782  lengths[i + 1] == 1 && s[offset + lengths[i]] == '.') {
1783  offset += lengths[i++];
1784  offset += lengths[i++];
1785  }
1786  }
1787  if (s[offset] != '\0')
1788  word_type = AC_UNACCEPTABLE;
1789  }
1790 
1791  return word_type;
1792 }
Unacceptable word.
Definition: control.h:36
ACCEPTABLE_WERD_TYPE
Definition: control.h:34
a.b.c.
Definition: control.h:40
voidpf uLong offset
Definition: ioapi.h:42
Definition: strngs.h:45
BOOL8 contains(const char c) const
Definition: strngs.cpp:189
A.B.C.
Definition: control.h:41
ALL but initial lc.
Definition: control.h:39
ALL lower case.
Definition: control.h:37
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:465
ALL upper case.
Definition: control.h:38
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:458

◆ alpha_count()

inT16 tesseract::Tesseract::alpha_count ( const char *  word,
const char *  word_lengths 
)

Definition at line 495 of file reject.cpp.

496  {
497  inT16 i;
498  inT16 offset;
499  inT16 count = 0;
500 
501  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
502  if (unicharset.get_isalpha (word + offset, word_lengths[i]))
503  count++;
504  }
505  return count;
506 }
voidpf uLong offset
Definition: ioapi.h:42
int16_t inT16
Definition: host.h:36
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
UNICHARSET unicharset
Definition: ccutil.h:68
int count(LIST var_list)
Definition: oldlist.cpp:103

◆ ambigs_classify_and_output()

void tesseract::Tesseract::ambigs_classify_and_output ( const char *  label,
PAGE_RES_IT *  pr_it,
FILE *  output_file 
)

Definition at line 202 of file recogtraining.cpp.

204  {
205  // Classify word.
206  fflush(stdout);
207  WordData word_data(*pr_it);
208  SetupWordPassN(1, &word_data);
209  classify_word_and_language(1, pr_it, &word_data);
210  WERD_RES* werd_res = word_data.word;
211  WERD_CHOICE *best_choice = werd_res->best_choice;
212  ASSERT_HOST(best_choice != NULL);
213 
214  // Compute the number of unichars in the label.
215  GenericVector<UNICHAR_ID> encoding;
216  if (!unicharset.encode_string(label, true, &encoding, NULL, NULL)) {
217  tprintf("Not outputting illegal unichar %s\n", label);
218  return;
219  }
220 
221  // Dump all paths through the ratings matrix (which is normally small).
222  int dim = werd_res->ratings->dimension();
223  const BLOB_CHOICE** blob_choices = new const BLOB_CHOICE*[dim];
224  PrintMatrixPaths(0, dim, *werd_res->ratings, 0, blob_choices,
225  unicharset, label, output_file);
226  delete [] blob_choices;
227 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1285
#define tprintf(...)
Definition: tprintf.h:31
#define ASSERT_HOST(x)
Definition: errcode.h:84
MATRIX * ratings
Definition: pageres.h:215
UNICHARSET unicharset
Definition: ccutil.h:68
int dimension() const
Definition: matrix.h:521
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:234
WERD * word
Definition: pageres.h:175
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:174

◆ AnyLSTMLang()

bool tesseract::Tesseract::AnyLSTMLang ( ) const
inline

Definition at line 268 of file tesseractclass.h.

268  {
269  if (tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY) return true;
270  for (int i = 0; i < sub_langs_.size(); ++i) {
271  if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_TESSERACT_ONLY)
272  return true;
273  }
274  return false;
275  }

◆ AnyTessLang()

bool tesseract::Tesseract::AnyTessLang ( ) const
inline

Definition at line 260 of file tesseractclass.h.

260  {
261  if (tessedit_ocr_engine_mode != OEM_LSTM_ONLY) return true;
262  for (int i = 0; i < sub_langs_.size(); ++i) {
263  if (sub_langs_[i]->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) return true;
264  }
265  return false;
266  }

◆ ApplyBoxes()

PAGE_RES * tesseract::Tesseract::ApplyBoxes ( const STRING &  fname,
bool  find_segmentation,
BLOCK_LIST *  block_list 
)

Definition at line 117 of file applybox.cpp.

119  {
120  GenericVector<TBOX> boxes;
121  GenericVector<STRING> texts, full_texts;
122  if (!ReadAllBoxes(applybox_page, true, fname, &boxes, &texts, &full_texts,
123  NULL)) {
124  return NULL; // Can't do it.
125  }
126 
127  int box_count = boxes.size();
128  int box_failures = 0;
129  // Add an empty everything to the end.
130  boxes.push_back(TBOX());
131  texts.push_back(STRING());
132  full_texts.push_back(STRING());
133 
134  // In word mode, we use the boxes to make a word for each box, but
135  // in blob mode we use the existing words and maximally chop them first.
136  PAGE_RES* page_res = find_segmentation ?
137  NULL : SetupApplyBoxes(boxes, block_list);
138  clear_any_old_text(block_list);
139 
140  for (int i = 0; i < boxes.size() - 1; i++) {
141  bool foundit = false;
142  if (page_res != NULL) {
143  if (i == 0) {
144  foundit = ResegmentCharBox(page_res, NULL, boxes[i], boxes[i + 1],
145  full_texts[i].string());
146  } else {
147  foundit = ResegmentCharBox(page_res, &boxes[i-1], boxes[i],
148  boxes[i + 1], full_texts[i].string());
149  }
150  } else {
151  foundit = ResegmentWordBox(block_list, boxes[i], boxes[i + 1],
152  texts[i].string());
153  }
154  if (!foundit) {
155  box_failures++;
156  ReportFailedBox(i, boxes[i], texts[i].string(),
157  "FAILURE! Couldn't find a matching blob");
158  }
159  }
160 
161  if (page_res == NULL) {
162  // In word/line mode, we now maximally chop all the words and resegment
163  // them with the classifier.
164  page_res = SetupApplyBoxes(boxes, block_list);
165  ReSegmentByClassification(page_res);
166  }
167  if (applybox_debug > 0) {
168  tprintf("APPLY_BOXES:\n");
169  tprintf(" Boxes read from boxfile: %6d\n", box_count);
170  if (box_failures > 0)
171  tprintf(" Boxes failed resegmentation: %6d\n", box_failures);
172  }
173  TidyUp(page_res);
174  return page_res;
175 }
void ReSegmentByClassification(PAGE_RES *page_res)
Definition: applybox.cpp:509
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
int size() const
Definition: genericvector.h:72
void TidyUp(PAGE_RES *page_res)
Definition: applybox.cpp:706
void ReportFailedBox(int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg)
Definition: applybox.cpp:764
Definition: strngs.h:45
bool ResegmentWordBox(BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text)
Definition: applybox.cpp:438
Definition: rect.h:30
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:50
PAGE_RES * SetupApplyBoxes(const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list)
Definition: applybox.cpp:217
bool ResegmentCharBox(PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX &next_box, const char *correct_text)
Definition: applybox.cpp:340

◆ ApplyBoxTraining()

void tesseract::Tesseract::ApplyBoxTraining ( const STRING &  fontname,
PAGE_RES *  page_res 
)

Calls LearnWord to extract features for labelled blobs within each word. Features are stored in an internal buffer.

Definition at line 796 of file applybox.cpp.

796  {
797  PAGE_RES_IT pr_it(page_res);
798  int word_count = 0;
799  for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
800  word_res = pr_it.forward()) {
801  LearnWord(fontname.string(), word_res);
802  ++word_count;
803  }
804  tprintf("Generated training data for %d words\n", word_count);
805 }
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
WERD * word
Definition: pageres.h:175
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:244

◆ AssignDiacriticsToNewBlobs()

void tesseract::Tesseract::AssignDiacriticsToNewBlobs ( const GenericVector< C_OUTLINE *> &  outlines,
int  pass,
WERD *  real_word,
PAGE_RES_IT *  pr_it,
GenericVector< bool > *  word_wanted,
GenericVector< C_BLOB *> *  target_blobs 
)

Definition at line 1046 of file control.cpp.

1049  {
1050  GenericVector<bool> blob_wanted;
1051  word_wanted->init_to_size(outlines.size(), false);
1052  target_blobs->init_to_size(outlines.size(), NULL);
1053  // Check for outlines that need to be turned into stand-alone blobs.
1054  for (int i = 0; i < outlines.size(); ++i) {
1055  if (outlines[i] == NULL) continue;
1056  // Get a set of adjacent outlines that don't overlap any existing blob.
1057  blob_wanted.init_to_size(outlines.size(), false);
1058  int num_blob_outlines = 0;
1059  TBOX total_ol_box(outlines[i]->bounding_box());
1060  while (i < outlines.size() && outlines[i] != NULL) {
1061  blob_wanted[i] = true;
1062  total_ol_box += outlines[i]->bounding_box();
1063  ++i;
1064  ++num_blob_outlines;
1065  }
1066  // Find the insertion point.
1067  C_BLOB_IT blob_it(real_word->cblob_list());
1068  while (!blob_it.at_last() &&
1069  blob_it.data_relative(1)->bounding_box().left() <=
1070  total_ol_box.left()) {
1071  blob_it.forward();
1072  }
1073  // Choose which combination of them we actually want and where to put
1074  // them.
1075  if (debug_noise_removal)
1076  tprintf("Num blobless outlines = %d\n", num_blob_outlines);
1077  C_BLOB* left_blob = blob_it.data();
1078  TBOX left_box = left_blob->bounding_box();
1079  C_BLOB* right_blob = blob_it.at_last() ? NULL : blob_it.data_relative(1);
1080  if ((left_box.x_overlap(total_ol_box) || right_blob == NULL ||
1081  !right_blob->bounding_box().x_overlap(total_ol_box)) &&
1082  SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it, left_blob,
1083  outlines, num_blob_outlines,
1084  &blob_wanted)) {
1085  if (debug_noise_removal) tprintf("Added to left blob\n");
1086  for (int j = 0; j < blob_wanted.size(); ++j) {
1087  if (blob_wanted[j]) {
1088  (*word_wanted)[j] = true;
1089  (*target_blobs)[j] = left_blob;
1090  }
1091  }
1092  } else if (right_blob != NULL &&
1093  (!left_box.x_overlap(total_ol_box) ||
1094  right_blob->bounding_box().x_overlap(total_ol_box)) &&
1095  SelectGoodDiacriticOutlines(pass, noise_cert_disjoint, pr_it,
1096  right_blob, outlines,
1097  num_blob_outlines, &blob_wanted)) {
1098  if (debug_noise_removal) tprintf("Added to right blob\n");
1099  for (int j = 0; j < blob_wanted.size(); ++j) {
1100  if (blob_wanted[j]) {
1101  (*word_wanted)[j] = true;
1102  (*target_blobs)[j] = right_blob;
1103  }
1104  }
1105  } else if (SelectGoodDiacriticOutlines(pass, noise_cert_punc, pr_it, NULL,
1106  outlines, num_blob_outlines,
1107  &blob_wanted)) {
1108  if (debug_noise_removal) tprintf("Fitted between blobs\n");
1109  for (int j = 0; j < blob_wanted.size(); ++j) {
1110  if (blob_wanted[j]) {
1111  (*word_wanted)[j] = true;
1112  (*target_blobs)[j] = NULL;
1113  }
1114  }
1115  }
1116  }
1117 }
void init_to_size(int size, T t)
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE *> &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1122
bool x_overlap(const TBOX &box) const
Definition: rect.h:391
#define tprintf(...)
Definition: tprintf.h:31
int size() const
Definition: genericvector.h:72
Definition: rect.h:30
TBOX bounding_box() const
Definition: stepblob.cpp:250
C_BLOB_LIST * cblob_list()
Definition: werd.h:100

◆ AssignDiacriticsToOverlappingBlobs()

void tesseract::Tesseract::AssignDiacriticsToOverlappingBlobs ( const GenericVector< C_OUTLINE *> &  outlines,
int  pass,
WERD *  real_word,
PAGE_RES_IT *  pr_it,
GenericVector< bool > *  word_wanted,
GenericVector< bool > *  overlapped_any_blob,
GenericVector< C_BLOB *> *  target_blobs 
)

Definition at line 993 of file control.cpp.

997  {
998  GenericVector<bool> blob_wanted;
999  word_wanted->init_to_size(outlines.size(), false);
1000  overlapped_any_blob->init_to_size(outlines.size(), false);
1001  target_blobs->init_to_size(outlines.size(), NULL);
1002  // For each real blob, find the outlines that seriously overlap it.
1003  // A single blob could be several merged characters, so there can be quite
1004  // a few outlines overlapping, and the full engine needs to be used to chop
1005  // and join to get a sensible result.
1006  C_BLOB_IT blob_it(real_word->cblob_list());
1007  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list(); blob_it.forward()) {
1008  C_BLOB* blob = blob_it.data();
1009  TBOX blob_box = blob->bounding_box();
1010  blob_wanted.init_to_size(outlines.size(), false);
1011  int num_blob_outlines = 0;
1012  for (int i = 0; i < outlines.size(); ++i) {
1013  if (blob_box.major_x_overlap(outlines[i]->bounding_box()) &&
1014  !(*word_wanted)[i]) {
1015  blob_wanted[i] = true;
1016  (*overlapped_any_blob)[i] = true;
1017  ++num_blob_outlines;
1018  }
1019  }
1020  if (debug_noise_removal) {
1021  tprintf("%d noise outlines overlap blob at:", num_blob_outlines);
1022  blob_box.print();
1023  }
1024  // If any outlines overlap the blob, and not too many, classify the blob
1025  // (using the full engine, languages and all), and choose the maximal
1026  // combination of outlines that doesn't hurt the end-result classification
1027  // by too much. Mark them as wanted.
1028  if (0 < num_blob_outlines && num_blob_outlines < noise_maxperblob) {
1029  if (SelectGoodDiacriticOutlines(pass, noise_cert_basechar, pr_it, blob,
1030  outlines, num_blob_outlines,
1031  &blob_wanted)) {
1032  for (int i = 0; i < blob_wanted.size(); ++i) {
1033  if (blob_wanted[i]) {
1034  // Claim the outline and record where it is going.
1035  (*word_wanted)[i] = true;
1036  (*target_blobs)[i] = blob;
1037  }
1038  }
1039  }
1040  }
1041  }
1042 }
void init_to_size(int size, T t)
bool SelectGoodDiacriticOutlines(int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE *> &outlines, int num_outlines, GenericVector< bool > *ok_outlines)
Definition: control.cpp:1122
#define tprintf(...)
Definition: tprintf.h:31
bool major_x_overlap(const TBOX &box) const
Definition: rect.h:402
int size() const
Definition: genericvector.h:72
Definition: rect.h:30
TBOX bounding_box() const
Definition: stepblob.cpp:250
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
void print() const
Definition: rect.h:270

◆ AutoPageSeg()

int tesseract::Tesseract::AutoPageSeg ( PageSegMode  pageseg_mode,
BLOCK_LIST *  blocks,
TO_BLOCK_LIST *  to_blocks,
BLOBNBOX_LIST *  diacritic_blobs,
Tesseract *  osd_tess,
OSResults *  osr 
)

Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.

Resolution (in ppi) is derived from the input image.

The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.

If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.

If diacritic_blobs is non-null, then diacritics/noise blobs that would confuse layout analysis by causing textline overlap are placed there, with the expectation that they will be reassigned to words later and noise/diacriticness determined via classification.

If osd (orientation and script detection) is true then that is performed as well. If only_osd is true, then only orientation and script detection is performed. If osd is desired, (osd or only_osd) then osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr (orientation and script result).

Definition at line 204 of file pagesegmain.cpp.

207  {
208  Pix* photomask_pix = NULL;
209  Pix* musicmask_pix = NULL;
210  // The blocks made by the ColumnFinder. Moved to blocks before return.
211  BLOCK_LIST found_blocks;
212  TO_BLOCK_LIST temp_blocks;
213 
214  ColumnFinder* finder = SetupPageSegAndDetectOrientation(
215  pageseg_mode, blocks, osd_tess, osr, &temp_blocks, &photomask_pix,
216  &musicmask_pix);
217  int result = 0;
218  if (finder != NULL) {
219  TO_BLOCK_IT to_block_it(&temp_blocks);
220  TO_BLOCK* to_block = to_block_it.data();
221  if (musicmask_pix != NULL) {
222  // TODO(rays) pass the musicmask_pix into FindBlocks and mark music
223  // blocks separately. For now combine with photomask_pix.
224  pixOr(photomask_pix, photomask_pix, musicmask_pix);
225  }
226  if (equ_detect_) {
227  finder->SetEquationDetect(equ_detect_);
228  }
229  result = finder->FindBlocks(pageseg_mode, scaled_color_, scaled_factor_,
230  to_block, photomask_pix, pix_thresholds_,
231  pix_grey_, &pixa_debug_, &found_blocks,
232  diacritic_blobs, to_blocks);
233  if (result >= 0)
234  finder->GetDeskewVectors(&deskew_, &reskew_);
235  delete finder;
236  }
237  pixDestroy(&photomask_pix);
238  pixDestroy(&musicmask_pix);
239  if (result < 0) return result;
240 
241  blocks->clear();
242  BLOCK_IT block_it(blocks);
243  // Move the found blocks to the input/output blocks.
244  block_it.add_list_after(&found_blocks);
245  return result;
246 }
ColumnFinder * SetupPageSegAndDetectOrientation(PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix)

◆ BelievableSuperscript()

bool tesseract::Tesseract::BelievableSuperscript ( bool  debug,
const WERD_RES &  word,
float  certainty_threshold,
int *  left_ok,
int *  right_ok 
) const

Return whether this is believable superscript or subscript text.

We insist that:

  • there are no punctuation marks.
  • there are no italics.
  • no normal-sized character is smaller than superscript_scaledown_ratio of what it ought to be, and
  • each character is at least as certain as certainty_threshold.
Parameters
[in] debug: If true, spew debug output.
[in] word: The word whose best_choice we're evaluating.
[in] certainty_threshold: If any of the characters have less certainty than this, reject.
[out] left_ok: How many left-side characters were ok?
[out] right_ok: How many right-side characters were ok?
Returns
Whether the complete best choice is believable as a superscript.

Definition at line 520 of file superscript.cpp.

524  {
525  int initial_ok_run_count = 0;
526  int ok_run_count = 0;
527  float worst_certainty = 0.0f;
528  const WERD_CHOICE &wc = *word.best_choice;
529 
530  const UnicityTable<FontInfo>& fontinfo_table = get_fontinfo_table();
531  for (int i = 0; i < wc.length(); i++) {
532  TBLOB *blob = word.rebuild_word->blobs[i];
533  UNICHAR_ID unichar_id = wc.unichar_id(i);
534  float char_certainty = wc.certainty(i);
535  bool bad_certainty = char_certainty < certainty_threshold;
536  bool is_punc = wc.unicharset()->get_ispunctuation(unichar_id);
537  bool is_italic = word.fontinfo && word.fontinfo->is_italic();
538  BLOB_CHOICE *choice = word.GetBlobChoice(i);
539  if (choice && fontinfo_table.size() > 0) {
540  // Get better information from the specific choice, if available.
541  int font_id1 = choice->fontinfo_id();
542  bool font1_is_italic = font_id1 >= 0
543  ? fontinfo_table.get(font_id1).is_italic() : false;
544  int font_id2 = choice->fontinfo_id2();
545  is_italic = font1_is_italic &&
546  (font_id2 < 0 || fontinfo_table.get(font_id2).is_italic());
547  }
548 
549  float height_fraction = 1.0f;
550  float char_height = blob->bounding_box().height();
551  float normal_height = char_height;
552  if (wc.unicharset()->top_bottom_useful()) {
553  int min_bot, max_bot, min_top, max_top;
554  wc.unicharset()->get_top_bottom(unichar_id,
555  &min_bot, &max_bot,
556  &min_top, &max_top);
557  float hi_height = max_top - max_bot;
558  float lo_height = min_top - min_bot;
559  normal_height = (hi_height + lo_height) / 2;
560  if (normal_height >= kBlnXHeight) {
561  // Only ding characters that we have decent information for because
562  // they're supposed to be normal sized, not tiny specks or dashes.
563  height_fraction = char_height / normal_height;
564  }
565  }
566  bool bad_height = height_fraction < superscript_scaledown_ratio;
567 
568  if (debug) {
569  if (is_italic) {
570  tprintf(" Rejecting: superscript is italic.\n");
571  }
572  if (is_punc) {
573  tprintf(" Rejecting: punctuation present.\n");
574  }
575  const char *char_str = wc.unicharset()->id_to_unichar(unichar_id);
576  if (bad_certainty) {
577  tprintf(" Rejecting: don't believe character %s with certainty %.2f "
578  "which is less than threshold %.2f\n", char_str,
579  char_certainty, certainty_threshold);
580  }
581  if (bad_height) {
582  tprintf(" Rejecting: character %s seems too small @ %.2f versus "
583  "expected %.2f\n", char_str, char_height, normal_height);
584  }
585  }
586  if (bad_certainty || bad_height || is_punc || is_italic) {
587  if (ok_run_count == i) {
588  initial_ok_run_count = ok_run_count;
589  }
590  ok_run_count = 0;
591  } else {
592  ok_run_count++;
593  }
594  if (char_certainty < worst_certainty) {
595  worst_certainty = char_certainty;
596  }
597  }
598  bool all_ok = ok_run_count == wc.length();
599  if (all_ok && debug) {
600  tprintf(" Accept: worst revised certainty is %.2f\n", worst_certainty);
601  }
602  if (!all_ok) {
603  if (left_ok) *left_ok = initial_ok_run_count;
604  if (right_ok) *right_ok = ok_run_count;
605  }
606  return all_ok;
607 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
int UNICHAR_ID
Definition: unichar.h:33
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:479
WERD_CHOICE * best_choice
Definition: pageres.h:219
inT16 fontinfo_id2() const
Definition: ratngs.h:88
int length() const
Definition: ratngs.h:301
int size() const
Return the size used.
const int kBlnXHeight
Definition: normalis.h:28
TWERD * rebuild_word
Definition: pageres.h:244
#define tprintf(...)
Definition: tprintf.h:31
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:344
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
inT16 fontinfo_id() const
Definition: ratngs.h:85
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:528
const FontInfo * fontinfo
Definition: pageres.h:288
float certainty() const
Definition: ratngs.h:328
const UNICHARSET * unicharset() const
Definition: ratngs.h:298
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
Definition: blobs.h:261
inT16 height() const
Definition: rect.h:104
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:742
bool is_italic() const
Definition: fontinfo.h:111
const T & get(int id) const
Return the object from an id.
TBOX bounding_box() const
Definition: blobs.cpp:482
bool top_bottom_useful() const
Definition: unicharset.h:497

◆ BestPix()

Pix* tesseract::Tesseract::BestPix ( ) const
inline

Definition at line 216 of file tesseractclass.h.

216 { return pix_original_; }

◆ bigram_correction_pass()

void tesseract::Tesseract::bigram_correction_pass ( PAGE_RES *  page_res)

Definition at line 450 of file control.cpp.

450  {
451  PAGE_RES_IT word_it(page_res);
452 
453  WERD_RES *w_prev = NULL;
454  WERD_RES *w = word_it.word();
455  while (1) {
456  w_prev = w;
457  while (word_it.forward() != NULL &&
458  (!word_it.word() || word_it.word()->part_of_combo)) {
459  // advance word_it, skipping over parts of combos
460  }
461  if (!word_it.word()) break;
462  w = word_it.word();
463  if (!w || !w_prev || w->uch_set != w_prev->uch_set) {
464  continue;
465  }
466  if (w_prev->word->flag(W_REP_CHAR) || w->word->flag(W_REP_CHAR)) {
467  if (tessedit_bigram_debug) {
468  tprintf("Skipping because one of the words is W_REP_CHAR\n");
469  }
470  continue;
471  }
472  // Two words sharing the same language model, excellent!
473  GenericVector<WERD_CHOICE *> overrides_word1;
474  GenericVector<WERD_CHOICE *> overrides_word2;
475 
476  STRING orig_w1_str = w_prev->best_choice->unichar_string();
477  STRING orig_w2_str = w->best_choice->unichar_string();
478  WERD_CHOICE prev_best(w->uch_set);
479  {
480  int w1start, w1end;
481  w_prev->best_choice->GetNonSuperscriptSpan(&w1start, &w1end);
482  prev_best = w_prev->best_choice->shallow_copy(w1start, w1end);
483  }
484  WERD_CHOICE this_best(w->uch_set);
485  {
486  int w2start, w2end;
487  w->best_choice->GetNonSuperscriptSpan(&w2start, &w2end);
488  this_best = w->best_choice->shallow_copy(w2start, w2end);
489  }
490 
491  if (w->tesseract->getDict().valid_bigram(prev_best, this_best)) {
492  if (tessedit_bigram_debug) {
493  tprintf("Top choice \"%s %s\" verified by bigram model.\n",
494  orig_w1_str.string(), orig_w2_str.string());
495  }
496  continue;
497  }
498  if (tessedit_bigram_debug > 2) {
499  tprintf("Examining alt choices for \"%s %s\".\n",
500  orig_w1_str.string(), orig_w2_str.string());
501  }
502  if (tessedit_bigram_debug > 1) {
503  if (!w_prev->best_choices.singleton()) {
504  w_prev->PrintBestChoices();
505  }
506  if (!w->best_choices.singleton()) {
507  w->PrintBestChoices();
508  }
509  }
510  float best_rating = 0.0;
511  int best_idx = 0;
512  WERD_CHOICE_IT prev_it(&w_prev->best_choices);
513  for (prev_it.mark_cycle_pt(); !prev_it.cycled_list(); prev_it.forward()) {
514  WERD_CHOICE *p1 = prev_it.data();
515  WERD_CHOICE strip1(w->uch_set);
516  {
517  int p1start, p1end;
518  p1->GetNonSuperscriptSpan(&p1start, &p1end);
519  strip1 = p1->shallow_copy(p1start, p1end);
520  }
521  WERD_CHOICE_IT w_it(&w->best_choices);
522  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
523  WERD_CHOICE *p2 = w_it.data();
524  WERD_CHOICE strip2(w->uch_set);
525  {
526  int p2start, p2end;
527  p2->GetNonSuperscriptSpan(&p2start, &p2end);
528  strip2 = p2->shallow_copy(p2start, p2end);
529  }
530  if (w->tesseract->getDict().valid_bigram(strip1, strip2)) {
531  overrides_word1.push_back(p1);
532  overrides_word2.push_back(p2);
533  if (overrides_word1.size() == 1 ||
534  p1->rating() + p2->rating() < best_rating) {
535  best_rating = p1->rating() + p2->rating();
536  best_idx = overrides_word1.size() - 1;
537  }
538  }
539  }
540  }
541  if (!overrides_word1.empty()) {
542  // Excellent, we have some bigram matches.
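543  if (EqualIgnoringCaseAndTerminalPunct(*w_prev->best_choice,
544  *overrides_word1[best_idx]) &&
545  EqualIgnoringCaseAndTerminalPunct(*w->best_choice,
546  *overrides_word2[best_idx])) {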
547  if (tessedit_bigram_debug > 1) {
548  tprintf("Top choice \"%s %s\" verified (sans case) by bigram "
549  "model.\n", orig_w1_str.string(), orig_w2_str.string());
550  }
551  continue;
552  }
553  STRING new_w1_str = overrides_word1[best_idx]->unichar_string();
554  STRING new_w2_str = overrides_word2[best_idx]->unichar_string();
555  if (new_w1_str != orig_w1_str) {
556  w_prev->ReplaceBestChoice(overrides_word1[best_idx]);
557  }
558  if (new_w2_str != orig_w2_str) {
559  w->ReplaceBestChoice(overrides_word2[best_idx]);
560  }
561  if (tessedit_bigram_debug > 0) {
562  STRING choices_description;
563  int num_bigram_choices
564  = overrides_word1.size() * overrides_word2.size();
565  if (num_bigram_choices == 1) {
566  choices_description = "This was the unique bigram choice.";
567  } else {
568  if (tessedit_bigram_debug > 1) {
569  STRING bigrams_list;
570  const int kMaxChoicesToPrint = 20;
571  for (int i = 0; i < overrides_word1.size() &&
572  i < kMaxChoicesToPrint; i++) {
573  if (i > 0) { bigrams_list += ", "; }
574  WERD_CHOICE *p1 = overrides_word1[i];
575  WERD_CHOICE *p2 = overrides_word2[i];
576  bigrams_list += p1->unichar_string() + " " + p2->unichar_string();
577  if (i == kMaxChoicesToPrint) {
578  bigrams_list += " ...";
579  }
580  }
581  choices_description = "There were many choices: {";
582  choices_description += bigrams_list;
583  choices_description += "}";
584  } else {
585  choices_description.add_str_int("There were ", num_bigram_choices);
586  choices_description += " compatible bigrams.";
587  }
588  }
589  tprintf("Replaced \"%s %s\" with \"%s %s\" with bigram model. %s\n",
590  orig_w1_str.string(), orig_w2_str.string(),
591  new_w1_str.string(), new_w2_str.string(),
592  choices_description.string());
593  }
594  }
595  }
596 }
void add_str_int(const char *str, int number)
Definition: strngs.cpp:381
void ReplaceBestChoice(WERD_CHOICE *choice)
Definition: pageres.cpp:787
WERD_CHOICE * best_choice
Definition: pageres.h:219
Dict & getDict()
Definition: classify.h:65
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
bool empty() const
Definition: genericvector.h:90
int size() const
Definition: genericvector.h:72
tesseract::Tesseract * tesseract
Definition: pageres.h:266
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
bool valid_bigram(const WERD_CHOICE &word1, const WERD_CHOICE &word2) const
Definition: dict.cpp:783
Definition: strngs.h:45
WERD_CHOICE_LIST best_choices
Definition: pageres.h:227
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:375
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:392
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:791
const STRING & unichar_string() const
Definition: ratngs.h:539
void PrintBestChoices() const
Definition: pageres.cpp:709
WERD * word
Definition: pageres.h:175
const UNICHARSET * uch_set
Definition: pageres.h:192
float rating() const
Definition: ratngs.h:325

◆ blamer_pass()

void tesseract::Tesseract::blamer_pass ( PAGE_RES page_res)

Definition at line 694 of file control.cpp.

694  {
695  if (!wordrec_run_blamer) return;
696  PAGE_RES_IT page_res_it(page_res);
697  for (page_res_it.restart_page(); page_res_it.word() != NULL;
698  page_res_it.forward()) {
699  WERD_RES *word = page_res_it.word();
702  }
703  tprintf("Blame reasons:\n");
704  for (int bl = 0; bl < IRR_NUM_REASONS; ++bl) {
706  static_cast<IncorrectResultReason>(bl)),
707  page_res->blame_reasons[bl]);
708  }
709  if (page_res->misadaption_log.length() > 0) {
710  tprintf("Misadaption log:\n");
711  for (int i = 0; i < page_res->misadaption_log.length(); ++i) {
712  tprintf("%s\n", page_res->misadaption_log[i].string());
713  }
714  }
715 }
BlamerBundle * blamer_bundle
Definition: pageres.h:230
#define tprintf(...)
Definition: tprintf.h:31
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:106
static void LastChanceBlame(bool debug, WERD_RES *word)
Definition: blamer.cpp:547
int length() const
Definition: genericvector.h:85
WERD * word
Definition: pageres.h:175
bool wordrec_run_blamer
Definition: wordrec.h:168
bool wordrec_debug_blamer
Definition: wordrec.h:167
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:56
GenericVector< STRING > misadaption_log
Definition: pageres.h:73
GenericVector< int > blame_reasons
Definition: pageres.h:68

◆ blob_feature_display()

void tesseract::Tesseract::blob_feature_display ( PAGE_RES page_res,
const TBOX selection_box 
)

Definition at line 959 of file pgedit.cpp.

960  {
961  PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
962  if (it != NULL) {
963  WERD_RES* word_res = it->word();
964  word_res->x_height = it->row()->row->x_height();
965  word_res->SetupForRecognition(unicharset, this, BestPix(),
970  it->row()->row, it->block()->block);
971  TWERD* bln_word = word_res->chopped_word;
972  TBLOB* bln_blob = bln_word->blobs[0];
973  INT_FX_RESULT_STRUCT fx_info;
976  Classify::ExtractFeatures(*bln_blob, classify_nonlinear_norm, &bl_features,
977  &cn_features, &fx_info, NULL);
978  // Display baseline features.
979  ScrollView* bl_win = CreateFeatureSpaceWindow("BL Features", 512, 0);
981  for (int f = 0; f < bl_features.size(); ++f)
982  RenderIntFeature(bl_win, &bl_features[f], ScrollView::GREEN);
983  bl_win->Update();
984  // Display cn features.
985  ScrollView* cn_win = CreateFeatureSpaceWindow("CN Features", 512, 0);
987  for (int f = 0; f < cn_features.size(); ++f)
988  RenderIntFeature(cn_win, &cn_features[f], ScrollView::GREEN);
989  cn_win->Update();
990 
991  it->DeleteCurrentWord();
992  delete it;
993  }
994 }
void ClearFeatureSpaceWindow(NORM_METHOD norm_method, ScrollView *window)
Definition: intproto.cpp:1033
ScrollView * CreateFeatureSpaceWindow(const char *name, int xpos, int ypos)
Definition: intproto.cpp:1858
bool classify_nonlinear_norm
Definition: classify.h:415
bool classify_bln_numeric_mode
Definition: classify.h:499
BLOCK * block
Definition: pageres.h:99
ROW * row
Definition: pageres.h:127
float x_height() const
Definition: ocrrow.h:61
int size() const
Definition: genericvector.h:72
ROW_RES * row() const
Definition: pageres.h:739
Definition: blobs.h:395
void RenderIntFeature(ScrollView *window, const INT_FEATURE_STRUCT *Feature, ScrollView::Color color)
Definition: intproto.cpp:1693
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:31
static void Update()
Definition: scrollview.cpp:715
Pix * BestPix() const
UNICHARSET unicharset
Definition: ccutil.h:68
void DeleteCurrentWord()
Definition: pageres.cpp:1451
static void ExtractFeatures(const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
Definition: intfx.cpp:445
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
Definition: blobs.h:261
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:294
WERD_RES * word() const
Definition: pageres.h:736
TWERD * chopped_word
Definition: pageres.h:201
float x_height
Definition: pageres.h:295
BLOCK_RES * block() const
Definition: pageres.h:742

◆ blob_noise_score()

float tesseract::Tesseract::blob_noise_score ( TBLOB blob)

Definition at line 760 of file fixspace.cpp.

// blob_noise_score: scores a blob by the largest dimension (height or width)
// of any of its outlines, then adjusts that value for outline count and for
// the vertical position of the whole blob. Returns the adjusted value; the
// inT16 result is converted to float on return.
760  {
761  TBOX box; // BB of outline
762  inT16 outline_count = 0;
763  inT16 max_dimension;
764  inT16 largest_outline_dimension = 0;
765 
    // Walk the blob's linked list of outlines, counting them and tracking the
    // largest single-outline dimension seen.
766  for (TESSLINE* ol = blob->outlines; ol != NULL; ol= ol->next) {
767  outline_count++;
768  box = ol->bounding_box();
769  if (box.height() > box.width()) {
770  max_dimension = box.height();
771  } else {
772  max_dimension = box.width();
773  }
774 
775  if (largest_outline_dimension < max_dimension)
776  largest_outline_dimension = max_dimension;
777  }
778 
    // More than 5 outlines in this blob: double the score.
779  if (outline_count > 5) {
780  // penalise LOTS of blobs
781  largest_outline_dimension *= 2;
782  }
783 
    // From here on `box` is the bounding box of the whole blob, not of one
    // outline. If the blob's bottom is above 4 * kBlnBaselineOffset, or its
    // top is below kBlnBaselineOffset / 2, the score is halved.
784  box = blob->bounding_box();
785  if (box.bottom() > kBlnBaselineOffset * 4 ||
786  box.top() < kBlnBaselineOffset / 2) {
787  // Lax blob is if high or low
788  largest_outline_dimension /= 2;
789  }
790 
791  return largest_outline_dimension;
792 }
TESSLINE * next
Definition: blobs.h:258
TESSLINE * outlines
Definition: blobs.h:377
const int kBlnBaselineOffset
Definition: normalis.h:29
int16_t inT16
Definition: host.h:36
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
inT16 height() const
Definition: rect.h:104
inT16 width() const
Definition: rect.h:111
inT16 bottom() const
Definition: rect.h:61
TBOX bounding_box() const
Definition: blobs.cpp:482

◆ break_noisiest_blob_word()

void tesseract::Tesseract::break_noisiest_blob_word ( WERD_RES_LIST &  words)

break_noisiest_blob_word(): Find the word containing the blob that looks most like noise, then break that word in two at that blob, deleting the noise blob.

Definition at line 615 of file fixspace.cpp.

// break_noisiest_blob_word: scans every word in `words` with
// worst_noise_blob(), picks the word whose reported noise score is lowest,
// and splits it at the reported blob: blobs before the noise blob move to a
// new word inserted before the original, the noise blob itself is deleted,
// and the remaining blobs stay in the original word. If no word reports a
// blob index > -1, `words` is cleared and the function returns.
615  {
616  WERD_RES_IT word_it(&words);
617  WERD_RES_IT worst_word_it;
    // Starting threshold: only scores below 9999 can be selected.
618  float worst_noise_score = 9999;
619  int worst_blob_index = -1; // Noisiest blob of noisiest wd
620  int blob_index; // of wds noisiest blob
621  float noise_score; // of wds noisiest blob
622  WERD_RES *word_res;
623  C_BLOB_IT blob_it;
624  C_BLOB_IT rej_cblob_it;
625  C_BLOB_LIST new_blob_list;
626  C_BLOB_IT new_blob_it;
627  C_BLOB_IT new_rej_cblob_it;
628  WERD *new_word;
629  inT16 start_of_noise_blob;
630  inT16 i;
631 
    // Keep the word/blob pair with the smallest noise score seen so far.
632  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
633  blob_index = worst_noise_blob(word_it.data(), &noise_score);
634  if (blob_index > -1 && worst_noise_score > noise_score) {
635  worst_noise_score = noise_score;
636  worst_blob_index = blob_index;
637  worst_word_it = word_it;
638  }
639  }
640  if (worst_blob_index < 0) {
641  words.clear(); // signal termination
642  return;
643  }
644 
645  /* Now split the worst_word_it */
646 
647  word_res = worst_word_it.data();
648 
649  /* Move blobs before noise blob to a new bloblist */
650 
651  new_blob_it.set_to_list(&new_blob_list);
652  blob_it.set_to_list(word_res->word->cblob_list());
653  for (i = 0; i < worst_blob_index; i++, blob_it.forward()) {
654  new_blob_it.add_after_then_move(blob_it.extract());
655  }
    // Remember the noise blob's left edge; it is used below to decide which
    // rejected blobs belong to the new (left-hand) word.
656  start_of_noise_blob = blob_it.data()->bounding_box().left();
657  delete blob_it.extract(); // throw out noise blob
658 
    // Build the left-hand word from the moved blobs, passing the original
    // word to the WERD constructor. The new word is marked as not ending the
    // line; the original is marked as not beginning the line and gets exactly
    // one leading blank.
659  new_word = new WERD(&new_blob_list, word_res->word);
660  new_word->set_flag(W_EOL, FALSE);
661  word_res->word->set_flag(W_BOL, FALSE);
662  word_res->word->set_blanks(1); // After break
663 
    // Move rejected blobs whose left edge is before the noise blob into the
    // new word; the loop stops at the first one that is not, or when the
    // list is empty.
664  new_rej_cblob_it.set_to_list(new_word->rej_cblob_list());
665  rej_cblob_it.set_to_list(word_res->word->rej_cblob_list());
666  for (;
667  (!rej_cblob_it.empty() &&
668  (rej_cblob_it.data()->bounding_box().left() < start_of_noise_blob));
669  rej_cblob_it.forward()) {
670  new_rej_cblob_it.add_after_then_move(rej_cblob_it.extract());
671  }
672 
    // Wrap the new word in a WERD_RES flagged as a combination and insert it
    // before the word that was split.
673  WERD_RES* new_word_res = new WERD_RES(new_word);
674  new_word_res->combination = TRUE;
675  worst_word_it.add_before_then_move(new_word_res);
676 
    // The original word has lost blobs, so its previous results are cleared.
677  word_res->ClearResults();
678 }
#define TRUE
Definition: capi.h:45
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:680
Definition: werd.h:36
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
void ClearResults()
Definition: pageres.cpp:1142
int16_t inT16
Definition: host.h:36
BOOL8 combination
Definition: pageres.h:318
#define FALSE
Definition: capi.h:46
Definition: werd.h:35
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
WERD * word
Definition: pageres.h:175
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:95
void set_blanks(uinT8 new_blanks)
Definition: werd.h:107
Definition: werd.h:60

◆ build_menu_new()

SVMenuNode * tesseract::Tesseract::build_menu_new ( )

Definition at line 257 of file pgedit.cpp.

// build_menu_new: allocates a root SVMenuNode and populates it with three
// top-level sub-menus ("MODES", "DISPLAY", "OTHER"), each entry bound to a
// command event id. Returns the root node.
// NOTE(review): the root is allocated with `new` and returned as a raw
// pointer; presumably the caller takes ownership - confirm at the call site.
257  {
258  SVMenuNode* parent_menu;
259  SVMenuNode* root_menu_item = new SVMenuNode();
260 
261  SVMenuNode* modes_menu_item = root_menu_item->AddChild("MODES");
262 
263  modes_menu_item->AddChild("Change Display", CHANGE_DISP_CMD_EVENT);
264  modes_menu_item->AddChild("Dump Word", DUMP_WERD_CMD_EVENT);
265  modes_menu_item->AddChild("Show Point", SHOW_POINT_CMD_EVENT);
266  modes_menu_item->AddChild("Show BL Norm Word", SHOW_BLN_WERD_CMD_EVENT);
267  modes_menu_item->AddChild("Config Words", DEBUG_WERD_CMD_EVENT);
268  modes_menu_item->AddChild("Recog Words", RECOG_WERDS);
269  modes_menu_item->AddChild("Recog Blobs", RECOG_PSEUDO);
270  modes_menu_item->AddChild("Show Blob Features", SHOW_BLOB_FEATURES);
271 
272  parent_menu = root_menu_item->AddChild("DISPLAY");
273 
    // Entries given a third TRUE/FALSE argument are presumably toggle items
    // with that initial state - TODO confirm against SVMenuNode::AddChild.
274  parent_menu->AddChild("Blamer", BLAMER_CMD_EVENT, FALSE);
275  parent_menu->AddChild("Bounding Boxes", BOUNDING_BOX_CMD_EVENT, FALSE);
276  parent_menu->AddChild("Correct Text", CORRECT_TEXT_CMD_EVENT, FALSE);
277  parent_menu->AddChild("Polygonal Approx", POLYGONAL_CMD_EVENT, FALSE);
278  parent_menu->AddChild("Baseline Normalized", BL_NORM_CMD_EVENT, FALSE);
279  parent_menu->AddChild("Edge Steps", BITMAP_CMD_EVENT, TRUE);
280  parent_menu->AddChild("Subscripts", SHOW_SUBSCRIPT_CMD_EVENT);
281  parent_menu->AddChild("Superscripts", SHOW_SUPERSCRIPT_CMD_EVENT);
282  parent_menu->AddChild("Italics", SHOW_ITALIC_CMD_EVENT);
283  parent_menu->AddChild("Bold", SHOW_BOLD_CMD_EVENT);
284  parent_menu->AddChild("Underline", SHOW_UNDERLINE_CMD_EVENT);
285  parent_menu->AddChild("FixedPitch", SHOW_FIXEDPITCH_CMD_EVENT);
286  parent_menu->AddChild("Serifs", SHOW_SERIF_CMD_EVENT);
287  parent_menu->AddChild("SmallCaps", SHOW_SMALLCAPS_CMD_EVENT);
288  parent_menu->AddChild("DropCaps", SHOW_DROPCAPS_CMD_EVENT);
289 
290 
    // parent_menu is reused: from here on it refers to the "OTHER" menu.
291  parent_menu = root_menu_item->AddChild("OTHER");
292 
293  parent_menu->AddChild("Quit", QUIT_CMD_EVENT);
294  parent_menu->AddChild("Show Image", IMAGE_CMD_EVENT, FALSE);
295  parent_menu->AddChild("ShowBlock Outlines", BLOCKS_CMD_EVENT, FALSE);
296  parent_menu->AddChild("Show Baselines", BASELINES_CMD_EVENT, FALSE);
297  parent_menu->AddChild("Uniform Display", UNIFORM_DISP_CMD_EVENT);
298  parent_menu->AddChild("Refresh Display", REFRESH_CMD_EVENT);
299 
300  return root_menu_item;
301 }
#define TRUE
Definition: capi.h:45
#define FALSE
Definition: capi.h:46
SVMenuNode * AddChild(const char *txt)
Definition: svmnode.cpp:59

◆ check_debug_pt()

BOOL8 tesseract::Tesseract::check_debug_pt ( WERD_RES word,
int  location 
)

Definition at line 1794 of file control.cpp.

// check_debug_pt: debugging hook keyed on a test point.
// Returns FALSE immediately when test_pt is not set. Otherwise it first
// switches tessedit_rejection_debug and debug_x_ht_level off, then checks
// whether the word's bounding box contains (test_pt_x, test_pt_y):
//  - not contained: returns FALSE (the debug settings stay off);
//  - contained and location < 0: returns TRUE without printing anything;
//  - contained and location >= 0: switches the debug settings on, prints a
//    "TESTWD::" banner with a stage label chosen by `location`, dumps the
//    best choice and reject map, and returns TRUE.
1794  {
1795  BOOL8 show_map_detail = FALSE;
1796  inT16 i;
1797 
1798  if (!test_pt)
1799  return FALSE;
1800 
1801  tessedit_rejection_debug.set_value (FALSE);
1802  debug_x_ht_level.set_value(0);
1803 
1804  if (word->word->bounding_box ().contains (FCOORD (test_pt_x, test_pt_y))) {
1805  if (location < 0)
1806  return TRUE; // For breakpoint use
1807  tessedit_rejection_debug.set_value (TRUE);
1808  debug_x_ht_level.set_value(2);
1809  tprintf ("\n\nTESTWD::");
    // Stage label per location code. A code not listed here prints no label.
    // Codes 50 and 120 additionally request the per-character map dump below.
1810  switch (location) {
1811  case 0:
1812  tprintf ("classify_word_pass1 start\n");
1813  word->word->print();
1814  break;
1815  case 10:
1816  tprintf ("make_reject_map: initial map");
1817  break;
1818  case 20:
1819  tprintf ("make_reject_map: after NN");
1820  break;
1821  case 30:
1822  tprintf ("classify_word_pass2 - START");
1823  break;
1824  case 40:
1825  tprintf ("classify_word_pass2 - Pre Xht");
1826  break;
1827  case 50:
1828  tprintf ("classify_word_pass2 - END");
1829  show_map_detail = TRUE;
1830  break;
1831  case 60:
1832  tprintf ("fixspace");
1833  break;
1834  case 70:
1835  tprintf ("MM pass START");
1836  break;
1837  case 80:
1838  tprintf ("MM pass END");
1839  break;
1840  case 90:
1841  tprintf ("After Poor quality rejection");
1842  break;
1843  case 100:
1844  tprintf ("unrej_good_quality_words - START");
1845  break;
1846  case 110:
1847  tprintf ("unrej_good_quality_words - END");
1848  break;
1849  case 120:
1850  tprintf ("Write results pass");
1851  show_map_detail = TRUE;
1852  break;
1853  }
1854  if (word->best_choice != NULL) {
1855  tprintf(" \"%s\" ", word->best_choice->unichar_string().string());
1856  word->reject_map.print(debug_fp);
1857  tprintf("\n");
1858  if (show_map_detail) {
1859  tprintf("\"%s\"\n", word->best_choice->unichar_string().string());
    // NOTE(review): i steps through the characters of unichar_string() and
    // is also used to index reject_map - confirm the two stay in step when a
    // unichar is encoded as more than one char.
1860  for (i = 0; word->best_choice->unichar_string()[i] != '\0'; i++) {
1861  tprintf("**** \"%c\" ****\n", word->best_choice->unichar_string()[i]);
1862  word->reject_map[i].full_print(debug_fp);
1863  }
1864  }
1865  } else {
1866  tprintf("null best choice\n");
1867  }
1868  tprintf ("Tess Accepted: %s\n", word->tess_accepted ? "TRUE" : "FALSE");
1869  tprintf ("Done flag: %s\n\n", word->done ? "TRUE" : "FALSE");
1870  return TRUE;
1871  } else {
1872  return FALSE;
1873  }
1874 }
#define TRUE
Definition: capi.h:45
Definition: points.h:189
void print(FILE *fp)
Definition: rejctmap.cpp:391
WERD_CHOICE * best_choice
Definition: pageres.h:219
void print()
Definition: werd.cpp:266
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
FILE * debug_fp
Definition: tessvars.cpp:24
TBOX bounding_box() const
Definition: werd.cpp:160
int16_t inT16
Definition: host.h:36
unsigned char BOOL8
Definition: host.h:44
#define FALSE
Definition: capi.h:46
BOOL8 tess_accepted
Definition: pageres.h:280
bool contains(const FCOORD pt) const
Definition: rect.h:323
const STRING & unichar_string() const
Definition: ratngs.h:539
void full_print(FILE *fp)
Definition: rejctmap.cpp:403
WERD * word
Definition: pageres.h:175
REJMAP reject_map
Definition: pageres.h:271
BOOL8 done
Definition: pageres.h:282

◆ classify_word_and_language()

void tesseract::Tesseract::classify_word_and_language ( int  pass_n,
PAGE_RES_IT pr_it,
WordData word_data 
)

Definition at line 1285 of file control.cpp.

1286  {
1287  WordRecognizer recognizer = pass_n == 1 ? &Tesseract::classify_word_pass1
1289  // Best result so far.
1290  PointerVector<WERD_RES> best_words;
1291  // Points to the best result. May be word or in lang_words.
1292  WERD_RES* word = word_data->word;
1293  clock_t start_t = clock();
1294  bool debug = classify_debug_level > 0 || multilang_debug_level > 0;
1295  if (debug) {
1296  tprintf("%s word with lang %s at:",
1297  word->done ? "Already done" : "Processing",
1298  most_recently_used_->lang.string());
1299  word->word->bounding_box().print();
1300  }
1301  if (word->done) {
1302  // If done on pass1, leave it as-is.
1303  if (!word->tess_failed)
1304  most_recently_used_ = word->tesseract;
1305  return;
1306  }
1307  int sub = sub_langs_.size();
1308  if (most_recently_used_ != this) {
1309  // Get the index of the most_recently_used_.
1310  for (sub = 0; sub < sub_langs_.size() &&
1311  most_recently_used_ != sub_langs_[sub]; ++sub) {}
1312  }
1313  most_recently_used_->RetryWithLanguage(
1314  *word_data, recognizer, debug, &word_data->lang_words[sub], &best_words);
1315  Tesseract* best_lang_tess = most_recently_used_;
1316  if (!WordsAcceptable(best_words)) {
1317  // Try all the other languages to see if they are any better.
1318  if (most_recently_used_ != this &&
1319  this->RetryWithLanguage(*word_data, recognizer, debug,
1320  &word_data->lang_words[sub_langs_.size()],
1321  &best_words) > 0) {
1322  best_lang_tess = this;
1323  }
1324  for (int i = 0; !WordsAcceptable(best_words) && i < sub_langs_.size();
1325  ++i) {
1326  if (most_recently_used_ != sub_langs_[i] &&
1327  sub_langs_[i]->RetryWithLanguage(*word_data, recognizer, debug,
1328  &word_data->lang_words[i],
1329  &best_words) > 0) {
1330  best_lang_tess = sub_langs_[i];
1331  }
1332  }
1333  }
1334  most_recently_used_ = best_lang_tess;
1335  if (!best_words.empty()) {
1336  if (best_words.size() == 1 && !best_words[0]->combination) {
1337  // Move the best single result to the main word.
1338  word_data->word->ConsumeWordResults(best_words[0]);
1339  } else {
1340  // Words came from LSTM, and must be moved to the PAGE_RES properly.
1341  word_data->word = best_words.back();
1342  pr_it->ReplaceCurrentWord(&best_words);
1343  }
1344  ASSERT_HOST(word_data->word->box_word != NULL);
1345  } else {
1346  tprintf("no best words!!\n");
1347  }
1348  clock_t ocr_t = clock();
1349  if (tessedit_timing_debug) {
1350  tprintf("%s (ocr took %.2f sec)\n",
1351  word->best_choice->unichar_string().string(),
1352  static_cast<double>(ocr_t-start_t)/CLOCKS_PER_SEC);
1353  }
1354 }
void classify_word_pass1(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1362
BOOL8 tess_failed
Definition: pageres.h:272
WERD_CHOICE * best_choice
Definition: pageres.h:219
void ReplaceCurrentWord(tesseract::PointerVector< WERD_RES > *words)
Definition: pageres.cpp:1323
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
TBOX bounding_box() const
Definition: werd.cpp:160
tesseract::Tesseract * tesseract
Definition: pageres.h:266
#define ASSERT_HOST(x)
Definition: errcode.h:84
STRING lang
Definition: ccutil.h:66
void classify_word_pass2(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
Definition: control.cpp:1519
int RetryWithLanguage(const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words)
Definition: control.cpp:888
const STRING & unichar_string() const
Definition: ratngs.h:539
WERD * word
Definition: pageres.h:175
void print() const
Definition: rect.h:270
void(Tesseract::* WordRecognizer)(const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words)
BOOL8 done
Definition: pageres.h:282

◆ classify_word_pass1()

void tesseract::Tesseract::classify_word_pass1 ( const WordData word_data,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  out_words 
)

classify_word_pass1

Baseline normalize the word and pass it to Tess.

Definition at line 1362 of file control.cpp.

1364  {
1365  ROW* row = word_data.row;
1366  BLOCK* block = word_data.block;
1367  prev_word_best_choice_ = word_data.prev_word != NULL
1368  ? word_data.prev_word->word->best_choice : NULL;
1369 #ifndef ANDROID_BUILD
1372  if (!(*in_word)->odd_size || tessedit_ocr_engine_mode == OEM_LSTM_ONLY) {
1373  LSTMRecognizeWord(*block, row, *in_word, out_words);
1374  if (!out_words->empty())
1375  return; // Successful lstm recognition.
1376  }
1378  // No fallback allowed, so use a fake.
1379  (*in_word)->SetupFake(lstm_recognizer_->GetUnicharset());
1380  return;
1381  }
1382  // Fall back to tesseract for failed words or odd words.
1383  (*in_word)->SetupForRecognition(unicharset, this, BestPix(),
1384  OEM_TESSERACT_ONLY, NULL,
1387  poly_allow_detailed_fx, row, block);
1388  }
1389 #endif
1390  WERD_RES* word = *in_word;
1391  match_word_pass_n(1, word, row, block);
1392  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1393  word->tess_would_adapt = AdaptableWord(word);
1394  bool adapt_ok = word_adaptable(word, tessedit_tess_adaption_mode);
1395 
1396  if (adapt_ok) {
1397  // Send word to adaptive classifier for training.
1398  word->BestChoiceToCorrectText();
1399  LearnWord(NULL, word);
1400  // Mark misadaptions if running blamer.
1401  if (word->blamer_bundle != NULL) {
1404  }
1405  }
1406 
1407  if (tessedit_enable_doc_dict && !word->IsAmbiguous())
1409  }
1410 }
bool classify_bln_numeric_mode
Definition: classify.h:499
BOOL8 tess_failed
Definition: pageres.h:272
WERD_CHOICE * best_choice
Definition: pageres.h:219
BlamerBundle * blamer_bundle
Definition: pageres.h:230
const UNICHARSET & GetUnicharset() const
bool AdaptableWord(WERD_RES *word)
Definition: adaptmatch.cpp:836
void LSTMRecognizeWord(const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words)
Definition: linerec.cpp:224
void BestChoiceToCorrectText()
Definition: pageres.cpp:918
bool empty() const
Definition: genericvector.h:90
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
Pix * BestPix() const
BOOL8 tess_would_adapt
Definition: pageres.h:281
UNICHARSET unicharset
Definition: ccutil.h:68
void tess_add_doc_word(WERD_CHOICE *word_choice)
Definition: tessbox.cpp:79
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1576
WERD * word
Definition: pageres.h:175
bool IsAmbiguous()
Definition: pageres.cpp:444
bool wordrec_debug_blamer
Definition: wordrec.h:167
void LearnWord(const char *fontname, WERD_RES *word)
Definition: adaptmatch.cpp:244
void SetMisAdaptionDebug(const WERD_CHOICE *best_choice, bool debug)
Definition: blamer.cpp:574
Definition: ocrrow.h:32
Definition: ocrblock.h:30
BOOL8 word_adaptable(WERD_RES *word, uinT16 mode)
Definition: adaptions.cpp:45
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:415

◆ classify_word_pass2()

void tesseract::Tesseract::classify_word_pass2 ( const WordData word_data,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  out_words 
)

classify_word_pass2

Controls what is done with the word in pass 2.

Definition at line 1519 of file control.cpp.

1521  {
1522  // Return if we do not want to run Tesseract.
1524  return;
1525  }
1526  ROW* row = word_data.row;
1527  BLOCK* block = word_data.block;
1528  WERD_RES* word = *in_word;
1529  prev_word_best_choice_ = word_data.prev_word != NULL
1530  ? word_data.prev_word->word->best_choice : NULL;
1531 
1533  check_debug_pt(word, 30);
1534  if (!word->done) {
1535  word->caps_height = 0.0;
1536  if (word->x_height == 0.0f)
1537  word->x_height = row->x_height();
1538  match_word_pass_n(2, word, row, block);
1539  check_debug_pt(word, 40);
1540  }
1541 
1542  SubAndSuperscriptFix(word);
1543 
1544  if (!word->tess_failed && !word->word->flag(W_REP_CHAR)) {
1546  block->classify_rotation().y() == 0.0f) {
1547  // Use the tops and bottoms since they are available.
1548  TrainedXheightFix(word, block, row);
1549  }
1550 
1552  }
1553 #ifndef GRAPHICS_DISABLED
1555  if (fx_win == NULL)
1556  create_fx_win();
1557  clear_fx_win();
1558  word->rebuild_word->plot(fx_win);
1559  TBOX wbox = word->rebuild_word->bounding_box();
1560  fx_win->ZoomToRectangle(wbox.left(), wbox.top(),
1561  wbox.right(), wbox.bottom());
1563  }
1564 #endif
1566  check_debug_pt(word, 50);
1567 }
void plot(ScrollView *window)
Definition: blobs.cpp:916
BOOL8 tess_failed
Definition: pageres.h:272
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1794
TWERD * rebuild_word
Definition: pageres.h:244
bool TrainedXheightFix(WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1434
#define SUBLOC_NORM
Definition: errcode.h:59
float x_height() const
Definition: ocrrow.h:61
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
inT16 left() const
Definition: rect.h:68
float caps_height
Definition: pageres.h:296
void set_global_subloc_code(int loc_code)
Definition: globaloc.cpp:85
static void Update()
Definition: scrollview.cpp:715
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:765
EXTERN ScrollView * fx_win
Definition: drawfx.cpp:51
UNICHARSET unicharset
Definition: ccutil.h:68
void clear_fx_win()
Definition: drawfx.cpp:73
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1576
float y() const
Definition: points.h:212
WERD * word
Definition: pageres.h:175
inT16 right() const
Definition: rect.h:75
inT16 bottom() const
Definition: rect.h:61
bool SubAndSuperscriptFix(WERD_RES *word_res)
bool top_bottom_useful() const
Definition: unicharset.h:497
Definition: ocrrow.h:32
FCOORD classify_rotation() const
Definition: ocrblock.h:144
void create_fx_win()
Definition: drawfx.cpp:60
float x_height
Definition: pageres.h:295
Definition: ocrblock.h:30
bool script_has_xheight() const
Definition: unicharset.h:863
TBOX bounding_box() const
Definition: blobs.cpp:879
BOOL8 done
Definition: pageres.h:282
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:415

◆ ClassifyBlobAsWord()

float tesseract::Tesseract::ClassifyBlobAsWord ( int  pass_n,
PAGE_RES_IT pr_it,
C_BLOB blob,
STRING best_str,
float *  c2 
)

Definition at line 1249 of file control.cpp.

// ClassifyBlobAsWord: classifies a single blob as if it were a word.
// A deep copy of `blob` is wrapped in a temporary WERD that inherits the
// W_BOL/W_EOL flags of the current word of pr_it, inserted into the page
// results as a clone of that word, set up for pass 1 and run through
// classify_word_and_language(pass_n, ...).
// Outputs, all taken from the resulting raw_choice:
//   *best_str - its unichar string;
//   *c2       - certainty * certainty / rating, or 0 when rating <= 0;
//   return    - its certainty.
// The temporary word is deleted and pr_it's word iterator is reset before
// returning.
1250  {
1251  WERD* real_word = pr_it->word()->word;
1252  WERD* word = real_word->ConstructFromSingleBlob(
1253  real_word->flag(W_BOL), real_word->flag(W_EOL), C_BLOB::deep_copy(blob));
1254  WERD_RES* word_res = pr_it->InsertSimpleCloneWord(*pr_it->word(), word);
1255  // Get a new iterator that points to the new word.
1256  PAGE_RES_IT it(pr_it->page_res);
1257  while (it.word() != word_res && it.word() != NULL) it.forward();
    // The inserted word must be reachable from a fresh page iterator.
1258  ASSERT_HOST(it.word() == word_res);
1259  WordData wd(it);
1260  // Force full initialization.
1261  SetupWordPassN(1, &wd);
1262  classify_word_and_language(pass_n, &it, &wd);
1263  if (debug_noise_removal) {
1264  tprintf("word xheight=%g, row=%g, range=[%g,%g]\n", word_res->x_height,
1265  wd.row->x_height(), wd.word->raw_choice->min_x_height(),
1266  wd.word->raw_choice->max_x_height());
1267  }
    // Results are read through wd.word rather than word_res.
1268  float cert = wd.word->raw_choice->certainty();
1269  float rat = wd.word->raw_choice->rating();
1270  *c2 = rat > 0.0f ? cert * cert / rat : 0.0f;
1271  *best_str = wd.word->raw_choice->unichar_string();
1272  it.DeleteCurrentWord();
1273  pr_it->ResetWordIterator();
1274  return cert;
1275 }
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1285
Definition: werd.h:36
#define tprintf(...)
Definition: tprintf.h:31
#define ASSERT_HOST(x)
Definition: errcode.h:84
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
static C_BLOB * deep_copy(const C_BLOB *src)
Definition: stepblob.h:113
PAGE_RES * page_res
Definition: pageres.h:661
Definition: werd.h:35
void ResetWordIterator()
Definition: pageres.cpp:1534
WERD * word
Definition: pageres.h:175
WERD_RES * word() const
Definition: pageres.h:736
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:174
WERD_RES * InsertSimpleCloneWord(const WERD_RES &clone_res, WERD *new_word)
Definition: pageres.cpp:1270
Definition: werd.h:60
WERD * ConstructFromSingleBlob(bool bol, bool eol, C_BLOB *blob)
Definition: werd.cpp:137
float x_height
Definition: pageres.h:295

◆ ClassifyBlobPlusOutlines()

float tesseract::Tesseract::ClassifyBlobPlusOutlines ( const GenericVector< bool > &  ok_outlines,
const GenericVector< C_OUTLINE *> &  outlines,
int  pass_n,
PAGE_RES_IT pr_it,
C_BLOB blob,
STRING best_str 
)

Definition at line 1207 of file control.cpp.

// ClassifyBlobPlusOutlines: classifies `blob` with a selection of extra
// outlines temporarily attached.
// For every i with ok_outlines[i] set, outlines[i] is added: to the front of
// the existing blob's outline list, or - when `blob` is NULL - to a blob
// created here from the first selected outline. The combined blob is passed
// to ClassifyBlobAsWord(), and afterwards the added outlines are extracted
// again (they are not deleted here), so a caller-supplied blob ends up with
// its original outline list.
// Returns the certainty from ClassifyBlobAsWord() when `blob` was supplied,
// or -c2 when the blob was created here.
// NOTE(review): if `blob` is NULL and no entry of ok_outlines is set, `blob`
// stays NULL and ol_it is never attached to a list before it is used -
// confirm callers never pass that combination.
1210  {
1211  C_OUTLINE_IT ol_it;
    // first_to_keep marks the head of the caller's original outline list; it
    // stays NULL when no blob was supplied.
1212  C_OUTLINE* first_to_keep = NULL;
1213  if (blob != NULL) {
1214  // Add the required outlines to the blob.
1215  ol_it.set_to_list(blob->out_list());
1216  first_to_keep = ol_it.data();
1217  }
1218  for (int i = 0; i < ok_outlines.size(); ++i) {
1219  if (ok_outlines[i]) {
1220  // This outline is to be added.
1221  if (blob == NULL) {
1222  blob = new C_BLOB(outlines[i]);
1223  ol_it.set_to_list(blob->out_list());
1224  } else {
1225  ol_it.add_before_stay_put(outlines[i]);
1226  }
1227  }
1228  }
1229  float c2;
1230  float cert = ClassifyBlobAsWord(pass_n, pr_it, blob, best_str, &c2);
1231  ol_it.move_to_first();
1232  if (first_to_keep == NULL) {
1233  // We created blob. Empty its outlines and delete it.
1234  for (; !ol_it.empty(); ol_it.forward()) ol_it.extract();
1235  delete blob;
1236  cert = -c2;
1237  } else {
1238  // Remove the outlines that we put in.
1239  for (; ol_it.data() != first_to_keep; ol_it.forward()) {
1240  ol_it.extract();
1241  }
1242  }
1243  return cert;
1244 }
C_OUTLINE_LIST * out_list()
Definition: stepblob.h:64
int size() const
Definition: genericvector.h:72
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1249

◆ Clear()

void tesseract::Tesseract::Clear ( )

Definition at line 637 of file tesseractclass.cpp.

// Clear: releases per-image state held by this instance and, recursively, by
// every sub-language instance in sub_langs_.
// Before releasing anything it asks pixa_debug_ to write its contents to
// "<imagebasename>_debug.pdf". It then destroys the binary, grey, thresholds
// and scaled-colour images, resets deskew_ and reskew_ to FCOORD(1, 0),
// clears splitter_ and sets scaled_factor_ to -1.
637  {
638  STRING debug_name = imagebasename + "_debug.pdf";
639  pixa_debug_.WritePDF(debug_name.string());
640  pixDestroy(&pix_binary_);
641  pixDestroy(&pix_grey_);
642  pixDestroy(&pix_thresholds_);
643  pixDestroy(&scaled_color_);
644  deskew_ = FCOORD(1.0f, 0.0f);
645  reskew_ = FCOORD(1.0f, 0.0f);
646  splitter_.Clear();
647  scaled_factor_ = -1;
648  for (int i = 0; i < sub_langs_.size(); ++i)
649  sub_langs_[i]->Clear();
650 }
Definition: points.h:189
const char * string() const
Definition: strngs.cpp:198
Definition: strngs.h:45
void WritePDF(const char *filename)
Definition: debugpixa.h:36
STRING imagebasename
Definition: ccutil.h:65

◆ ComputeCompatibleXheight()

float tesseract::Tesseract::ComputeCompatibleXheight ( WERD_RES word_res,
float *  baseline_shift 
)

Definition at line 101 of file fixxht.cpp.

102  {
103  STATS top_stats(0, MAX_UINT8);
104  STATS shift_stats(-MAX_UINT8, MAX_UINT8);
105  int bottom_shift = 0;
106  int num_blobs = word_res->rebuild_word->NumBlobs();
107  do {
108  top_stats.clear();
109  shift_stats.clear();
110  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
111  TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
112  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
113  if (unicharset.get_isalpha(class_id) ||
114  unicharset.get_isdigit(class_id)) {
115  int top = blob->bounding_box().top() + bottom_shift;
116  // Clip the top to the limit of normalized feature space.
117  if (top >= INT_FEAT_RANGE)
118  top = INT_FEAT_RANGE - 1;
119  int bottom = blob->bounding_box().bottom() + bottom_shift;
120  int min_bottom, max_bottom, min_top, max_top;
121  unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
122  &min_top, &max_top);
123  // Chars with a wild top range would mess up the result so ignore them.
124  if (max_top - min_top > kMaxCharTopRange)
125  continue;
126  int misfit_dist = MAX((min_top - x_ht_acceptance_tolerance) - top,
127  top - (max_top + x_ht_acceptance_tolerance));
128  int height = top - kBlnBaselineOffset;
129  if (debug_x_ht_level >= 2) {
130  tprintf("Class %s: height=%d, bottom=%d,%d top=%d,%d, actual=%d,%d: ",
131  unicharset.id_to_unichar(class_id),
132  height, min_bottom, max_bottom, min_top, max_top,
133  bottom, top);
134  }
135  // Use only chars that fit in the expected bottom range, and where
136  // the range of tops is sensibly near the xheight.
137  if (min_bottom <= bottom + x_ht_acceptance_tolerance &&
138  bottom - x_ht_acceptance_tolerance <= max_bottom &&
139  min_top > kBlnBaselineOffset &&
140  max_top - kBlnBaselineOffset >= kBlnXHeight &&
141  misfit_dist > 0) {
142  // Compute the x-height position using proportionality between the
143  // actual height and expected height.
144  int min_xht = DivRounded(height * kBlnXHeight,
145  max_top - kBlnBaselineOffset);
146  int max_xht = DivRounded(height * kBlnXHeight,
147  min_top - kBlnBaselineOffset);
148  if (debug_x_ht_level >= 2) {
149  tprintf(" xht range min=%d, max=%d\n", min_xht, max_xht);
150  }
151  // The range of expected heights gets a vote equal to the distance
152  // of the actual top from the expected top.
153  for (int y = min_xht; y <= max_xht; ++y)
154  top_stats.add(y, misfit_dist);
155  } else if ((min_bottom > bottom + x_ht_acceptance_tolerance ||
156  bottom - x_ht_acceptance_tolerance > max_bottom) &&
157  bottom_shift == 0) {
158  // Get the range of required bottom shift.
159  int min_shift = min_bottom - bottom;
160  int max_shift = max_bottom - bottom;
161  if (debug_x_ht_level >= 2) {
162  tprintf(" bottom shift min=%d, max=%d\n", min_shift, max_shift);
163  }
164  // The range of expected shifts gets a vote equal to the min distance
165  // of the actual bottom from the expected bottom, spread over the
166  // range of its acceptance.
167  int misfit_weight = abs(min_shift);
168  if (max_shift > min_shift)
169  misfit_weight /= max_shift - min_shift;
170  for (int y = min_shift; y <= max_shift; ++y)
171  shift_stats.add(y, misfit_weight);
172  } else {
173  if (bottom_shift == 0) {
174  // Things with bottoms that are already ok need to say so, on the
175  // 1st iteration only.
176  shift_stats.add(0, kBlnBaselineOffset);
177  }
178  if (debug_x_ht_level >= 2) {
179  tprintf(" already OK\n");
180  }
181  }
182  }
183  }
184  if (shift_stats.get_total() > top_stats.get_total()) {
185  bottom_shift = IntCastRounded(shift_stats.median());
186  if (debug_x_ht_level >= 2) {
187  tprintf("Applying bottom shift=%d\n", bottom_shift);
188  }
189  }
190  } while (bottom_shift != 0 &&
191  top_stats.get_total() < shift_stats.get_total());
192  // Baseline shift is opposite sign to the bottom shift.
193  *baseline_shift = -bottom_shift / word_res->denorm.y_scale();
194  if (debug_x_ht_level >= 2) {
195  tprintf("baseline shift=%g\n", *baseline_shift);
196  }
197  if (top_stats.get_total() == 0)
198  return bottom_shift != 0 ? word_res->x_height : 0.0f;
199  // The new xheight is just the median vote, which is then scaled out
200  // of BLN space back to pixel space to get the x-height in pixel space.
201  float new_xht = top_stats.median();
202  if (debug_x_ht_level >= 2) {
203  tprintf("Median xht=%f\n", new_xht);
204  tprintf("Mode20:A: New x-height = %f (norm), %f (orig)\n",
205  new_xht, new_xht / word_res->denorm.y_scale());
206  }
207  // The xheight must change by at least x_ht_min_change to be used.
208  if (fabs(new_xht - kBlnXHeight) >= x_ht_min_change)
209  return new_xht / word_res->denorm.y_scale();
210  else
211  return bottom_shift != 0 ? word_res->x_height : 0.0f;
212 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
int UNICHAR_ID
Definition: unichar.h:33
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define MAX_UINT8
Definition: host.h:63
const int kBlnXHeight
Definition: normalis.h:28
TWERD * rebuild_word
Definition: pageres.h:244
#define tprintf(...)
Definition: tprintf.h:31
const int kBlnBaselineOffset
Definition: normalis.h:29
float y_scale() const
Definition: normalis.h:272
int IntCastRounded(double x)
Definition: helpers.h:179
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:528
const int kMaxCharTopRange
Definition: fixxht.cpp:66
UNICHARSET unicharset
Definition: ccutil.h:68
int NumBlobs() const
Definition: blobs.h:425
inT16 top() const
Definition: rect.h:54
#define MAX(x, y)
Definition: ndminx.h:24
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
Definition: blobs.h:261
DENORM denorm
Definition: pageres.h:190
#define INT_FEAT_RANGE
Definition: float2int.h:27
Definition: statistc.h:33
inT16 bottom() const
Definition: rect.h:61
TBOX bounding_box() const
Definition: blobs.cpp:482
int DivRounded(int a, int b)
Definition: helpers.h:173
float x_height
Definition: pageres.h:295

◆ convert_bad_unlv_chs()

void tesseract::Tesseract::convert_bad_unlv_chs ( WERD_RES *  word_res)

Definition at line 664 of file docqual.cpp.

664  {
665  int i;
666  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
667  UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
668  UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
669  UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
670  for (i = 0; i < word_res->reject_map.length(); ++i) {
671  if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
672  word_res->best_choice->set_unichar_id(unichar_dash, i);
673  if (word_res->reject_map[i].accepted ())
674  word_res->reject_map[i].setrej_unlv_rej ();
675  }
676  if (word_res->best_choice->unichar_id(i) == unichar_pow) {
677  word_res->best_choice->set_unichar_id(unichar_space, i);
678  if (word_res->reject_map[i].accepted ())
679  word_res->reject_map[i].setrej_unlv_rej ();
680  }
681  }
682 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
int UNICHAR_ID
Definition: unichar.h:33
WERD_CHOICE * best_choice
Definition: pageres.h:219
inT32 length() const
Definition: rejctmap.h:235
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:357
const UNICHARSET * uch_set
Definition: pageres.h:192
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
REJMAP reject_map
Definition: pageres.h:271

◆ ConvertStringToUnichars()

bool tesseract::Tesseract::ConvertStringToUnichars ( const char *  utf8,
GenericVector< UNICHAR_ID > *  class_ids 
)

Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.

Returns
false if an invalid UNICHAR_ID is encountered.

Definition at line 535 of file applybox.cpp.

536  {
537  for (int step = 0; *utf8 != '\0'; utf8 += step) {
538  const char* next_space = strchr(utf8, ' ');
539  if (next_space == NULL)
540  next_space = utf8 + strlen(utf8);
541  step = next_space - utf8;
542  UNICHAR_ID class_id = unicharset.unichar_to_id(utf8, step);
543  if (class_id == INVALID_UNICHAR_ID) {
544  return false;
545  }
546  while (utf8[step] == ' ')
547  ++step;
548  class_ids->push_back(class_id);
549  }
550  return true;
551 }
int UNICHAR_ID
Definition: unichar.h:33
int push_back(T object)
UNICHARSET unicharset
Definition: ccutil.h:68
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ CorrectClassifyWords()

void tesseract::Tesseract::CorrectClassifyWords ( PAGE_RES *  page_res)

Creates a fake best_choice entry in each WERD_RES with the correct text.

Definition at line 772 of file applybox.cpp.

772  {
773  PAGE_RES_IT pr_it(page_res);
774  for (WERD_RES *word_res = pr_it.word(); word_res != NULL;
775  word_res = pr_it.forward()) {
776  WERD_CHOICE* choice = new WERD_CHOICE(word_res->uch_set,
777  word_res->correct_text.size());
778  for (int i = 0; i < word_res->correct_text.size(); ++i) {
779  // The part before the first space is the real ground truth, and the
780  // rest is the bounding box location and page number.
781  GenericVector<STRING> tokens;
782  word_res->correct_text[i].split(' ', &tokens);
783  UNICHAR_ID char_id = unicharset.unichar_to_id(tokens[0].string());
784  choice->append_unichar_id_space_allocated(char_id,
785  word_res->best_state[i],
786  0.0f, 0.0f);
787  }
788  word_res->ClearWordChoices();
789  word_res->LogNewRawChoice(choice);
790  word_res->LogNewCookedChoice(1, false, choice);
791  }
792 }
int UNICHAR_ID
Definition: unichar.h:33
UNICHARSET unicharset
Definition: ccutil.h:68
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:450
WERD * word
Definition: pageres.h:175
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ count_alphanums() [1/2]

inT16 tesseract::Tesseract::count_alphanums ( const WERD_CHOICE &  word)

Definition at line 408 of file output.cpp.

408  {
409  int count = 0;
410  for (int i = 0; i < word.length(); ++i) {
411  if (word.unicharset()->get_isalpha(word.unichar_id(i)) ||
412  word.unicharset()->get_isdigit(word.unichar_id(i)))
413  count++;
414  }
415  return count;
416 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
int length() const
Definition: ratngs.h:301
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
const UNICHARSET * unicharset() const
Definition: ratngs.h:298
int count(LIST var_list)
Definition: oldlist.cpp:103

◆ count_alphanums() [2/2]

inT16 tesseract::Tesseract::count_alphanums ( WERD_RES *  word)

Definition at line 558 of file reject.cpp.

558  {
559  int count = 0;
560  const WERD_CHOICE *best_choice = word_res->best_choice;
561  for (int i = 0; i < word_res->reject_map.length(); ++i) {
562  if ((word_res->reject_map[i].accepted()) &&
563  (word_res->uch_set->get_isalpha(best_choice->unichar_id(i)) ||
564  word_res->uch_set->get_isdigit(best_choice->unichar_id(i)))) {
565  count++;
566  }
567  }
568  return count;
569 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
int count(LIST var_list)
Definition: oldlist.cpp:103

◆ count_alphas()

inT16 tesseract::Tesseract::count_alphas ( const WERD_CHOICE &  word)

Definition at line 398 of file output.cpp.

398  {
399  int count = 0;
400  for (int i = 0; i < word.length(); ++i) {
401  if (word.unicharset()->get_isalpha(word.unichar_id(i)))
402  count++;
403  }
404  return count;
405 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
int length() const
Definition: ratngs.h:301
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
const UNICHARSET * unicharset() const
Definition: ratngs.h:298
int count(LIST var_list)
Definition: oldlist.cpp:103

◆ count_outline_errs()

inT16 tesseract::Tesseract::count_outline_errs ( char  c,
inT16  outline_count 
)

Definition at line 131 of file docqual.cpp.

131  {
132  int expected_outline_count;
133 
134  if (STRING (outlines_odd).contains (c))
135  return 0; // Don't use this char
136  else if (STRING (outlines_2).contains (c))
137  expected_outline_count = 2;
138  else
139  expected_outline_count = 1;
140  return abs (outline_count - expected_outline_count);
141 }
Definition: strngs.h:45

◆ CountMisfitTops()

int tesseract::Tesseract::CountMisfitTops ( WERD_RES *  word_res)

Definition at line 69 of file fixxht.cpp.

69  {
70  int bad_blobs = 0;
71  int num_blobs = word_res->rebuild_word->NumBlobs();
72  for (int blob_id = 0; blob_id < num_blobs; ++blob_id) {
73  TBLOB* blob = word_res->rebuild_word->blobs[blob_id];
74  UNICHAR_ID class_id = word_res->best_choice->unichar_id(blob_id);
75  if (unicharset.get_isalpha(class_id) || unicharset.get_isdigit(class_id)) {
76  int top = blob->bounding_box().top();
77  if (top >= INT_FEAT_RANGE)
78  top = INT_FEAT_RANGE - 1;
79  int min_bottom, max_bottom, min_top, max_top;
80  unicharset.get_top_bottom(class_id, &min_bottom, &max_bottom,
81  &min_top, &max_top);
82  if (max_top - min_top > kMaxCharTopRange)
83  continue;
84  bool bad = top < min_top - x_ht_acceptance_tolerance ||
85  top > max_top + x_ht_acceptance_tolerance;
86  if (bad)
87  ++bad_blobs;
88  if (debug_x_ht_level >= 1) {
89  tprintf("Class %s is %s with top %d vs limits of %d->%d, +/-%d\n",
90  unicharset.id_to_unichar(class_id),
91  bad ? "Misfit" : "OK", top, min_top, max_top,
92  static_cast<int>(x_ht_acceptance_tolerance));
93  }
94  }
95  }
96  return bad_blobs;
97 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
int UNICHAR_ID
Definition: unichar.h:33
WERD_CHOICE * best_choice
Definition: pageres.h:219
TWERD * rebuild_word
Definition: pageres.h:244
#define tprintf(...)
Definition: tprintf.h:31
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:528
const int kMaxCharTopRange
Definition: fixxht.cpp:66
UNICHARSET unicharset
Definition: ccutil.h:68
int NumBlobs() const
Definition: blobs.h:425
inT16 top() const
Definition: rect.h:54
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
Definition: blobs.h:261
#define INT_FEAT_RANGE
Definition: float2int.h:27
TBOX bounding_box() const
Definition: blobs.cpp:482

◆ debug_word()

void tesseract::Tesseract::debug_word ( PAGE_RES *  page_res,
const TBOX &  selection_box 
)

debug_word

Process the whole image, but load word_config_ for the selected word(s).

Definition at line 640 of file pgedit.cpp.

640  {
641  ResetAdaptiveClassifier();
642  recog_all_words(page_res, NULL, &selection_box, word_config_.string(), 0);
643 }
const char * string() const
Definition: strngs.cpp:198
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:300

◆ dictionary_correction_pass()

void tesseract::Tesseract::dictionary_correction_pass ( PAGE_RES *  page_res)

Definition at line 2042 of file control.cpp.

2042  {
2043  PAGE_RES_IT word_it(page_res);
2044  for (WERD_RES* word = word_it.word(); word != NULL;
2045  word = word_it.forward()) {
2046  if (word->best_choices.singleton())
2047  continue; // There are no alternates.
2048 
2049  WERD_CHOICE* best = word->best_choice;
2050  if (word->tesseract->getDict().valid_word(*best) != 0)
2051  continue; // The best choice is in the dictionary.
2052 
2053  WERD_CHOICE_IT choice_it(&word->best_choices);
2054  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
2055  choice_it.forward()) {
2056  WERD_CHOICE* alternate = choice_it.data();
2057  if (word->tesseract->getDict().valid_word(*alternate)) {
2058  // The alternate choice is in the dictionary.
2059  if (tessedit_bigram_debug) {
2060  tprintf("Dictionary correction replaces best choice '%s' with '%s'\n",
2061  best->unichar_string().string(),
2062  alternate->unichar_string().string());
2063  }
2064  // Replace the 'best' choice with a better choice.
2065  word->ReplaceBestChoice(alternate);
2066  break;
2067  }
2068  }
2069  }
2070 }
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
const STRING & unichar_string() const
Definition: ratngs.h:539
WERD * word
Definition: pageres.h:175

◆ digit_or_numeric_punct()

BOOL8 tesseract::Tesseract::digit_or_numeric_punct ( WERD_RES *  word,
int  char_position 
)

Definition at line 343 of file fixspace.cpp.

343  {
344  int i;
345  int offset;
346 
347  for (i = 0, offset = 0; i < char_position;
348  offset += word->best_choice->unichar_lengths()[i++]);
349  return (
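350  word->uch_set->get_isdigit(
351  word->best_choice->unichar_string().string() + offset,
352  word->best_choice->unichar_lengths()[i]) ||
353  (word->best_choice->permuter() == NUMBER_PERM &&
354  STRING(numeric_punctuation).contains(
355  word->best_choice->unichar_string().string()[offset])));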
356 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
const STRING & unichar_lengths() const
Definition: ratngs.h:546
const char * string() const
Definition: strngs.cpp:198
voidpf uLong offset
Definition: ioapi.h:42
uinT8 permuter() const
Definition: ratngs.h:344
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
Definition: strngs.h:45
BOOL8 contains(const char c) const
Definition: strngs.cpp:189
const STRING & unichar_string() const
Definition: ratngs.h:539
const UNICHARSET * uch_set
Definition: pageres.h:192

◆ do_re_display()

void tesseract::Tesseract::do_re_display ( BOOL8(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it)  word_painter)

do_re_display()

Redisplay page

Definition at line 308 of file pgedit.cpp.

309  {
310  int block_count = 1;
311 
312  image_win->Clear();
313  if (display_image != 0) {
314  image_win->Image(pix_binary_, 0, 0);
315  }
316 
317  image_win->Brush(ScrollView::NONE);
318  PAGE_RES_IT pr_it(current_page_res);
319  for (WERD_RES* word = pr_it.word(); word != NULL; word = pr_it.forward()) {
320  (this->*word_painter)(&pr_it);
321  if (display_baselines && pr_it.row() != pr_it.prev_row())
322  pr_it.row()->row->plot_baseline(image_win, ScrollView::GREEN);
323  if (display_blocks && pr_it.block() != pr_it.prev_block())
324  pr_it.block()->block->plot(image_win, block_count++, ScrollView::RED);
325  }
326  image_win->Update();
327 }
void Brush(Color color)
Definition: scrollview.cpp:732
BOOL8 display_baselines
Definition: pgedit.cpp:126
ScrollView * image_win
Definition: pgedit.cpp:107
void Clear()
Definition: scrollview.cpp:595
BOOL8 display_image
Definition: pgedit.cpp:124
static void Update()
Definition: scrollview.cpp:715
WERD * word
Definition: pageres.h:175
void Image(struct Pix *image, int x_pos, int y_pos)
Definition: scrollview.cpp:773
BOOL8 display_blocks
Definition: pgedit.cpp:125
PAGE_RES * current_page_res
Definition: pgedit.cpp:128

◆ doc_and_block_rejection()

void tesseract::Tesseract::doc_and_block_rejection ( PAGE_RES_IT &  page_res_it,
BOOL8  good_quality_doc 
)

Definition at line 237 of file docqual.cpp.

239  {
240  inT16 block_no = 0;
241  inT16 row_no = 0;
242  BLOCK_RES *current_block;
243  ROW_RES *current_row;
244 
245  BOOL8 rej_word;
246  BOOL8 prev_word_rejected;
247  inT16 char_quality = 0;
248  inT16 accepted_char_quality;
249 
250  if (page_res_it.page_res->rej_count * 100.0 /
252  reject_whole_page(page_res_it);
254  tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
255  page_res_it.page_res->char_count,
256  page_res_it.page_res->rej_count);
257  }
258  } else {
260  tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
261  page_res_it.page_res->char_count,
262  page_res_it.page_res->rej_count);
263  }
264 
265  /* Walk blocks testing for block rejection */
266 
267  page_res_it.restart_page();
268  WERD_RES* word;
269  while ((word = page_res_it.word()) != NULL) {
270  current_block = page_res_it.block();
271  block_no = current_block->block->index();
272  if (current_block->char_count > 0 &&
273  (current_block->rej_count * 100.0 / current_block->char_count) >
276  tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
277  block_no, current_block->char_count,
278  current_block->rej_count);
279  }
280  prev_word_rejected = FALSE;
281  while ((word = page_res_it.word()) != NULL &&
282  (page_res_it.block() == current_block)) {
284  rej_word = word->reject_map.reject_count() > 0 ||
286  if (rej_word && tessedit_dont_blkrej_good_wds &&
289  *word->uch_set,
290  word->best_choice->unichar_string().string(),
291  word->best_choice->unichar_lengths().string()) !=
292  AC_UNACCEPTABLE) {
293  word_char_quality(word, page_res_it.row()->row,
294  &char_quality,
295  &accepted_char_quality);
296  rej_word = char_quality != word->reject_map.length();
297  }
298  } else {
299  rej_word = TRUE;
300  }
301  if (rej_word) {
302  /*
303  Reject spacing if both current and prev words are rejected.
304  NOTE - this is NOT restricted to FUZZY spaces. - When tried this
305  generated more space errors.
306  */
308  prev_word_rejected &&
309  page_res_it.prev_row() == page_res_it.row() &&
310  word->word->space() == 1)
311  word->reject_spaces = TRUE;
313  }
314  prev_word_rejected = rej_word;
315  page_res_it.forward();
316  }
317  } else {
319  tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
320  block_no, page_res_it.block()->char_count,
321  page_res_it.block()->rej_count);
322  }
323 
324  /* Walk rows in block testing for row rejection */
325  row_no = 0;
326  while (page_res_it.word() != NULL &&
327  page_res_it.block() == current_block) {
328  current_row = page_res_it.row();
329  row_no++;
330  /* Reject whole row if:
331  fraction of chars on row which are rejected exceed a limit AND
332  fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
333  limit
334  */
335  if (current_row->char_count > 0 &&
336  (current_row->rej_count * 100.0 / current_row->char_count) >
338  (current_row->whole_word_rej_count * 100.0 /
339  current_row->rej_count) <
342  tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
343  row_no, current_row->char_count,
344  current_row->rej_count);
345  }
346  prev_word_rejected = FALSE;
347  while ((word = page_res_it.word()) != NULL &&
348  page_res_it.row () == current_row) {
349  /* Preserve words on good docs unless they are mostly rejected*/
350  if (!tessedit_row_rej_good_docs && good_quality_doc) {
351  rej_word = word->reject_map.reject_count() /
352  static_cast<float>(word->reject_map.length()) >
355  /* Preserve perfect words anyway */
356  rej_word = word->reject_map.reject_count() > 0 ||
358  if (rej_word && tessedit_dont_rowrej_good_wds &&
361  word->best_choice->unichar_string().string(),
362  word->best_choice->unichar_lengths().string()) !=
363  AC_UNACCEPTABLE) {
364  word_char_quality(word, page_res_it.row()->row,
365  &char_quality,
366  &accepted_char_quality);
367  rej_word = char_quality != word->reject_map.length();
368  }
369  } else {
370  rej_word = TRUE;
371  }
372  if (rej_word) {
373  /*
374  Reject spacing if both current and prev words are rejected.
375  NOTE - this is NOT restricted to FUZZY spaces. - When tried
376  this generated more space errors.
377  */
379  prev_word_rejected &&
380  page_res_it.prev_row() == page_res_it.row() &&
381  word->word->space () == 1)
382  word->reject_spaces = TRUE;
384  }
385  prev_word_rejected = rej_word;
386  page_res_it.forward();
387  }
388  } else {
390  tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
391  row_no, current_row->char_count, current_row->rej_count);
392  }
393  while (page_res_it.word() != NULL &&
394  page_res_it.row() == current_row)
395  page_res_it.forward();
396  }
397  }
398  }
399  }
400  }
401 }
inT32 rej_count
Definition: pageres.h:129
inT32 char_count
Definition: pageres.h:60
Unacceptable word.
Definition: control.h:36
#define TRUE
Definition: capi.h:45
void rej_word_block_rej()
Definition: rejctmap.cpp:503
int index() const
Definition: pdblock.h:67
WERD_CHOICE * best_choice
Definition: pageres.h:219
const STRING & unichar_lengths() const
Definition: ratngs.h:546
ROW_RES * prev_row() const
Definition: pageres.h:730
inT32 rej_count
Definition: pageres.h:61
#define tprintf(...)
Definition: tprintf.h:31
BLOCK * block
Definition: pageres.h:99
const char * string() const
Definition: strngs.cpp:198
ROW * row
Definition: pageres.h:127
double tessedit_reject_doc_percent
int16_t inT16
Definition: host.h:36
ROW_RES * row() const
Definition: pageres.h:739
BOOL8 reject_spaces
Definition: pageres.h:320
bool tessedit_preserve_row_rej_perfect_wds
inT32 length() const
Definition: rejctmap.h:235
WERD_RES * restart_page()
Definition: pageres.h:683
inT32 whole_word_rej_count
Definition: pageres.h:130
PAGE_RES * page_res
Definition: pageres.h:661
WERD_RES * forward()
Definition: pageres.h:716
inT16 reject_count()
Definition: rejctmap.h:241
inT32 rej_count
Definition: pageres.h:101
unsigned char BOOL8
Definition: host.h:44
#define FALSE
Definition: capi.h:46
inT32 char_count
Definition: pageres.h:128
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:97
double tessedit_good_doc_still_rowrej_wd
bool tessedit_preserve_blk_rej_perfect_wds
const STRING & unichar_string() const
Definition: ratngs.h:539
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1690
WERD * word
Definition: pageres.h:175
double tessedit_whole_wd_rej_row_percent
WERD_RES * word() const
Definition: pageres.h:736
const UNICHARSET * uch_set
Definition: pageres.h:192
double tessedit_reject_block_percent
void rej_word_row_rej()
Definition: rejctmap.cpp:512
inT32 char_count
Definition: pageres.h:100
REJMAP reject_map
Definition: pageres.h:271
uinT8 space()
Definition: werd.h:104
double tessedit_reject_row_percent
BLOCK_RES * block() const
Definition: pageres.h:742
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:411

◆ dont_allow_1Il()

void tesseract::Tesseract::dont_allow_1Il ( WERD_RES *  word)

Definition at line 526 of file reject.cpp.

526  {
527  int i = 0;
528  int offset;
529  int word_len = word->reject_map.length();
530  const char *s = word->best_choice->unichar_string().string();
531  const char *lengths = word->best_choice->unichar_lengths().string();
532  BOOL8 accepted_1Il = FALSE;
533 
534  for (i = 0, offset = 0; i < word_len;
535  offset += word->best_choice->unichar_lengths()[i++]) {
536  if (word->reject_map[i].accepted()) {
537  if (STRING(conflict_set_I_l_1).contains(s[offset])) {
538  accepted_1Il = TRUE;
539  } else {
540  if (word->uch_set->get_isalpha(s + offset, lengths[i]) ||
541  word->uch_set->get_isdigit(s + offset, lengths[i]))
542  return; // >=1 non 1Il ch accepted
543  }
544  }
545  }
546  if (!accepted_1Il)
547  return; //Nothing to worry about
548 
549  for (i = 0, offset = 0; i < word_len;
550  offset += word->best_choice->unichar_lengths()[i++]) {
551  if (STRING(conflict_set_I_l_1).contains(s[offset]) &&
552  word->reject_map[i].accepted())
553  word->reject_map[i].setrej_postNN_1Il();
554  }
555 }
#define TRUE
Definition: capi.h:45
WERD_CHOICE * best_choice
Definition: pageres.h:219
const STRING & unichar_lengths() const
Definition: ratngs.h:546
const char * string() const
Definition: strngs.cpp:198
voidpf uLong offset
Definition: ioapi.h:42
inT32 length() const
Definition: rejctmap.h:235
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
unsigned char BOOL8
Definition: host.h:44
Definition: strngs.h:45
#define FALSE
Definition: capi.h:46
BOOL8 contains(const char c) const
Definition: strngs.cpp:189
const STRING & unichar_string() const
Definition: ratngs.h:539
const UNICHARSET * uch_set
Definition: pageres.h:192
REJMAP reject_map
Definition: pageres.h:271

◆ dump_words()

void tesseract::Tesseract::dump_words ( WERD_RES_LIST &  perm,
inT16  score,
inT16  mode,
BOOL8  improved 
)

Definition at line 449 of file fixspace.cpp.

450  {
451  WERD_RES_IT word_res_it(&perm);
452 
453  if (debug_fix_space_level > 0) {
454  if (mode == 1) {
455  stats_.dump_words_str = "";
456  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
457  word_res_it.forward()) {
458  if (!word_res_it.data()->part_of_combo) {
459  stats_.dump_words_str +=
460  word_res_it.data()->best_choice->unichar_string();
461  stats_.dump_words_str += ' ';
462  }
463  }
464  }
465 
466  if (debug_fix_space_level > 1) {
467  switch (mode) {
468  case 1:
469  tprintf("EXTRACTED (%d): \"", score);
470  break;
471  case 2:
472  tprintf("TESTED (%d): \"", score);
473  break;
474  case 3:
475  tprintf("RETURNED (%d): \"", score);
476  break;
477  }
478 
479  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
480  word_res_it.forward()) {
481  if (!word_res_it.data()->part_of_combo) {
482  tprintf("%s/%1d ",
483  word_res_it.data()->best_choice->unichar_string().string(),
484  (int)word_res_it.data()->best_choice->permuter());
485  }
486  }
487  tprintf("\"\n");
488  } else if (improved) {
489  tprintf("FIX SPACING \"%s\" => \"", stats_.dump_words_str.string());
490  for (word_res_it.mark_cycle_pt(); !word_res_it.cycled_list();
491  word_res_it.forward()) {
492  if (!word_res_it.data()->part_of_combo) {
493  tprintf("%s/%1d ",
494  word_res_it.data()->best_choice->unichar_string().string(),
495  (int)word_res_it.data()->best_choice->permuter());
496  }
497  }
498  tprintf("\"\n");
499  }
500  }
501 }
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
const char int mode
Definition: ioapi.h:38
typedef int(ZCALLBACK *close_file_func) OF((voidpf opaque

◆ end_tesseract()

void tesseract::Tesseract::end_tesseract ( )

Definition at line 468 of file tessedit.cpp.

468  {
469  end_recog();
470 }

◆ eval_word_spacing()

inT16 tesseract::Tesseract::eval_word_spacing ( WERD_RES_LIST &  word_res_list)

Definition at line 239 of file fixspace.cpp.

239  {
240  WERD_RES_IT word_res_it(&word_res_list);
241  inT16 total_score = 0;
242  inT16 word_count = 0;
243  inT16 done_word_count = 0;
244  inT16 word_len;
245  inT16 i;
246  inT16 offset;
247  WERD_RES *word; // current word
248  inT16 prev_word_score = 0;
249  BOOL8 prev_word_done = FALSE;
250  BOOL8 prev_char_1 = FALSE; // prev ch a "1/I/l"?
251  BOOL8 prev_char_digit = FALSE; // prev ch 2..9 or 0
252  BOOL8 current_char_1 = FALSE;
253  BOOL8 current_word_ok_so_far;
254  STRING punct_chars = "!\"`',.:;";
255  BOOL8 prev_char_punct = FALSE;
256  BOOL8 current_char_punct = FALSE;
257  BOOL8 word_done = FALSE;
258 
259  do {
260  word = word_res_it.data();
261  word_done = fixspace_thinks_word_done(word);
262  word_count++;
263  if (word->tess_failed) {
264  total_score += prev_word_score;
265  if (prev_word_done)
266  done_word_count++;
267  prev_word_score = 0;
268  prev_char_1 = FALSE;
269  prev_char_digit = FALSE;
270  prev_word_done = FALSE;
271  } else {
272  /*
273  Can we add the prev word score and potentially count this word?
274  Yes IF it didn't end in a 1 when the first char of this word is a digit
275  AND it didn't end in a digit when the first char of this word is a 1
276  */
277  word_len = word->reject_map.length();
278  current_word_ok_so_far = FALSE;
279  if (!((prev_char_1 && digit_or_numeric_punct(word, 0)) ||
280  (prev_char_digit && (
281  (word_done &&
282  word->best_choice->unichar_lengths().string()[0] == 1 &&
283  word->best_choice->unichar_string()[0] == '1') ||
284  (!word_done && STRING(conflict_set_I_l_1).contains(
285  word->best_choice->unichar_string()[0])))))) {
286  total_score += prev_word_score;
287  if (prev_word_done)
288  done_word_count++;
289  current_word_ok_so_far = word_done;
290  }
291 
292  if (current_word_ok_so_far) {
293  prev_word_done = TRUE;
294  prev_word_score = word_len;
295  } else {
296  prev_word_done = FALSE;
297  prev_word_score = 0;
298  }
299 
300  /* Add 1 to total score for every joined 1 regardless of context and
301  rejtn */
302  for (i = 0, prev_char_1 = FALSE; i < word_len; i++) {
303  current_char_1 = word->best_choice->unichar_string()[i] == '1';
304  if (prev_char_1 || (current_char_1 && (i > 0)))
305  total_score++;
306  prev_char_1 = current_char_1;
307  }
308 
309  /* Add 1 to total score for every joined punctuation regardless of context
310  and rejtn */
311  if (tessedit_prefer_joined_punct) {
312  for (i = 0, offset = 0, prev_char_punct = FALSE; i < word_len;
313  offset += word->best_choice->unichar_lengths()[i++]) {
314  current_char_punct =
315  punct_chars.contains(word->best_choice->unichar_string()[offset]);
316  if (prev_char_punct || (current_char_punct && i > 0))
317  total_score++;
318  prev_char_punct = current_char_punct;
319  }
320  }
321  prev_char_digit = digit_or_numeric_punct(word, word_len - 1);
322  for (i = 0, offset = 0; i < word_len - 1;
323  offset += word->best_choice->unichar_lengths()[i++]);
324  prev_char_1 =
325  ((word_done && (word->best_choice->unichar_string()[offset] == '1'))
326  || (!word_done && STRING(conflict_set_I_l_1).contains(
327  word->best_choice->unichar_string()[offset])));
328  }
329  /* Find next word */
330  do {
331  word_res_it.forward();
332  } while (word_res_it.data()->part_of_combo);
333  } while (!word_res_it.at_first());
334  total_score += prev_word_score;
335  if (prev_word_done)
336  done_word_count++;
337  if (done_word_count == word_count)
338  return PERFECT_WERDS;
339  else
340  return total_score;
341 }
#define TRUE
Definition: capi.h:45
BOOL8 tess_failed
Definition: pageres.h:272
WERD_CHOICE * best_choice
Definition: pageres.h:219
const STRING & unichar_lengths() const
Definition: ratngs.h:546
const char * string() const
Definition: strngs.cpp:198
voidpf uLong offset
Definition: ioapi.h:42
#define PERFECT_WERDS
Definition: fixspace.cpp:33
int16_t inT16
Definition: host.h:36
inT32 length() const
Definition: rejctmap.h:235
unsigned char BOOL8
Definition: host.h:44
Definition: strngs.h:45
#define FALSE
Definition: capi.h:46
BOOL8 contains(const char c) const
Definition: strngs.cpp:189
BOOL8 fixspace_thinks_word_done(WERD_RES *word)
Definition: fixspace.cpp:503
const STRING & unichar_string() const
Definition: ratngs.h:539
REJMAP reject_map
Definition: pageres.h:271
BOOL8 digit_or_numeric_punct(WERD_RES *word, int char_position)
Definition: fixspace.cpp:343

◆ failure_count()

inT16 tesseract::Tesseract::failure_count ( WERD_RES * word)

Definition at line 970 of file docqual.cpp.

970  {
971  const char *str = word->best_choice->unichar_string().string();
972  int tess_rejs = 0;
973 
974  for (; *str != '\0'; str++) {
975  if (*str == ' ')
976  tess_rejs++;
977  }
978  return tess_rejs;
979 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
const char * string() const
Definition: strngs.cpp:198
const STRING & unichar_string() const
Definition: ratngs.h:539

◆ FindSegmentation()

bool tesseract::Tesseract::FindSegmentation ( const GenericVector< UNICHAR_ID > &  target_text,
WERD_RES * word_res
)

Resegments the word to achieve the target_text from the classifier. Returns false if the re-segmentation fails. Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and applies a full search on the classifier results to find the best classified segmentation. As a compromise to obtain better recall, 1-1 ambiguity substitutions ARE used.

Definition at line 559 of file applybox.cpp.

560  {
561  // Classify all required combinations of blobs and save results in choices.
562  int word_length = word_res->box_word->length();
563  GenericVector<BLOB_CHOICE_LIST*>* choices =
564  new GenericVector<BLOB_CHOICE_LIST*>[word_length];
565  for (int i = 0; i < word_length; ++i) {
566  for (int j = 1; j <= kMaxGroupSize && i + j <= word_length; ++j) {
567  BLOB_CHOICE_LIST* match_result = classify_piece(
568  word_res->seam_array, i, i + j - 1, "Applybox",
569  word_res->chopped_word, word_res->blamer_bundle);
570  if (applybox_debug > 2) {
571  tprintf("%d+%d:", i, j);
572  print_ratings_list("Segment:", match_result, unicharset);
573  }
574  choices[i].push_back(match_result);
575  }
576  }
577  // Search the segmentation graph for the target text. Must be an exact
578  // match. Using wildcards makes it difficult to find the correct
579  // segmentation even when it is there.
580  word_res->best_state.clear();
581  GenericVector<int> search_segmentation;
582  float best_rating = 0.0f;
583  SearchForText(choices, 0, word_length, target_text, 0, 0.0f,
584  &search_segmentation, &best_rating, &word_res->best_state);
585  for (int i = 0; i < word_length; ++i)
586  choices[i].delete_data_pointers();
587  delete [] choices;
588  if (word_res->best_state.empty()) {
589  // Build the original segmentation and if it is the same length as the
590  // truth, assume it will do.
591  int blob_count = 1;
592  for (int s = 0; s < word_res->seam_array.size(); ++s) {
593  SEAM* seam = word_res->seam_array[s];
594  if (!seam->HasAnySplits()) {
595  word_res->best_state.push_back(blob_count);
596  blob_count = 1;
597  } else {
598  ++blob_count;
599  }
600  }
601  word_res->best_state.push_back(blob_count);
602  if (word_res->best_state.size() != target_text.size()) {
603  word_res->best_state.clear(); // No good. Original segmentation bad size.
604  return false;
605  }
606  }
607  word_res->correct_text.clear();
608  for (int i = 0; i < target_text.size(); ++i) {
609  word_res->correct_text.push_back(
610  STRING(unicharset.id_to_unichar(target_text[i])));
611  }
612  return true;
613 }
GenericVector< int > best_state
Definition: pageres.h:255
void SearchForText(const GenericVector< BLOB_CHOICE_LIST *> *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
Definition: applybox.cpp:629
BlamerBundle * blamer_bundle
Definition: pageres.h:230
GenericVector< STRING > correct_text
Definition: pageres.h:259
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
bool empty() const
Definition: genericvector.h:90
const int kMaxGroupSize
Definition: applybox.cpp:40
int size() const
Definition: genericvector.h:72
tesseract::BoxWord * box_word
Definition: pageres.h:250
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
Definition: seam.h:44
Definition: strngs.h:45
UNICHARSET unicharset
Definition: ccutil.h:68
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM *> &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:56
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:819
TWERD * chopped_word
Definition: pageres.h:201
int length() const
Definition: boxword.h:85
GenericVector< SEAM * > seam_array
Definition: pageres.h:203
bool HasAnySplits() const
Definition: seam.h:67

◆ first_alphanum_index()

inT16 tesseract::Tesseract::first_alphanum_index ( const char *  word,
const char *  word_lengths 
)

Definition at line 469 of file reject.cpp.

470  {
471  inT16 i;
472  inT16 offset;
473 
474  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
475  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
476  unicharset.get_isdigit(word + offset, word_lengths[i]))
477  return i;
478  }
479  return -1;
480 }
voidpf uLong offset
Definition: ioapi.h:42
int16_t inT16
Definition: host.h:36
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
UNICHARSET unicharset
Definition: ccutil.h:68

◆ first_alphanum_offset()

inT16 tesseract::Tesseract::first_alphanum_offset ( const char *  word,
const char *  word_lengths 
)

Definition at line 482 of file reject.cpp.

483  {
484  inT16 i;
485  inT16 offset;
486 
487  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
488  if (unicharset.get_isalpha(word + offset, word_lengths[i]) ||
489  unicharset.get_isdigit(word + offset, word_lengths[i]))
490  return offset;
491  }
492  return -1;
493 }
voidpf uLong offset
Definition: ioapi.h:42
int16_t inT16
Definition: host.h:36
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
UNICHARSET unicharset
Definition: ccutil.h:68

◆ fix_fuzzy_space_list()

void tesseract::Tesseract::fix_fuzzy_space_list ( WERD_RES_LIST &  best_perm,
ROW * row,
BLOCK * block
)

Definition at line 145 of file fixspace.cpp.

147  {
148  inT16 best_score;
149  WERD_RES_LIST current_perm;
150  inT16 current_score;
151  BOOL8 improved = FALSE;
152 
153  best_score = eval_word_spacing(best_perm); // default score
154  dump_words(best_perm, best_score, 1, improved);
155 
156  if (best_score != PERFECT_WERDS)
157  initialise_search(best_perm, current_perm);
158 
159  while ((best_score != PERFECT_WERDS) && !current_perm.empty()) {
160  match_current_words(current_perm, row, block);
161  current_score = eval_word_spacing(current_perm);
162  dump_words(current_perm, current_score, 2, improved);
163  if (current_score > best_score) {
164  best_perm.clear();
165  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
166  best_score = current_score;
167  improved = TRUE;
168  }
169  if (current_score < PERFECT_WERDS)
170  transform_to_next_perm(current_perm);
171  }
172  dump_words(best_perm, best_score, 3, improved);
173 }
void transform_to_next_perm(WERD_RES_LIST &words)
Definition: fixspace.cpp:372
#define TRUE
Definition: capi.h:45
#define PERFECT_WERDS
Definition: fixspace.cpp:33
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:196
int16_t inT16
Definition: host.h:36
unsigned char BOOL8
Definition: host.h:44
#define FALSE
Definition: capi.h:46
inT16 eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:239
void initialise_search(WERD_RES_LIST &src_list, WERD_RES_LIST &new_list)
Definition: fixspace.cpp:177
void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
Definition: fixspace.cpp:449
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:633

◆ fix_fuzzy_spaces()

void tesseract::Tesseract::fix_fuzzy_spaces ( ETEXT_DESC * monitor,
inT32  word_count,
PAGE_RES * page_res
)

Definition at line 48 of file fixspace.cpp.

50  {
51  BLOCK_RES_IT block_res_it;
52  ROW_RES_IT row_res_it;
53  WERD_RES_IT word_res_it_from;
54  WERD_RES_IT word_res_it_to;
55  WERD_RES *word_res;
56  WERD_RES_LIST fuzzy_space_words;
57  inT16 new_length;
58  BOOL8 prevent_null_wd_fixsp; // DON'T process blobless wds
59  inT32 word_index; // current word
60 
61  block_res_it.set_to_list(&page_res->block_res_list);
62  word_index = 0;
63  for (block_res_it.mark_cycle_pt(); !block_res_it.cycled_list();
64  block_res_it.forward()) {
65  row_res_it.set_to_list(&block_res_it.data()->row_res_list);
66  for (row_res_it.mark_cycle_pt(); !row_res_it.cycled_list();
67  row_res_it.forward()) {
68  word_res_it_from.set_to_list(&row_res_it.data()->word_res_list);
69  while (!word_res_it_from.at_last()) {
70  word_res = word_res_it_from.data();
71  while (!word_res_it_from.at_last() &&
72  !(word_res->combination ||
73  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_NON) ||
74  word_res_it_from.data_relative(1)->word->flag(W_FUZZY_SP))) {
75  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
76  block_res_it.data()->block);
77  word_res = word_res_it_from.forward();
78  word_index++;
79  if (monitor != NULL) {
80  monitor->ocr_alive = TRUE;
81  monitor->progress = 90 + 5 * word_index / word_count;
82  if (monitor->deadline_exceeded() ||
83  (monitor->cancel != NULL &&
84  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
85  return;
86  }
87  }
88 
89  if (!word_res_it_from.at_last()) {
90  word_res_it_to = word_res_it_from;
91  prevent_null_wd_fixsp =
92  word_res->word->cblob_list()->empty();
93  if (check_debug_pt(word_res, 60))
94  debug_fix_space_level.set_value(10);
95  word_res_it_to.forward();
96  word_index++;
97  if (monitor != NULL) {
98  monitor->ocr_alive = TRUE;
99  monitor->progress = 90 + 5 * word_index / word_count;
100  if (monitor->deadline_exceeded() ||
101  (monitor->cancel != NULL &&
102  (*monitor->cancel)(monitor->cancel_this, stats_.dict_words)))
103  return;
104  }
105  while (!word_res_it_to.at_last () &&
106  (word_res_it_to.data_relative(1)->word->flag(W_FUZZY_NON) ||
107  word_res_it_to.data_relative(1)->word->flag(W_FUZZY_SP))) {
108  if (check_debug_pt(word_res, 60))
109  debug_fix_space_level.set_value(10);
110  if (word_res->word->cblob_list()->empty())
111  prevent_null_wd_fixsp = TRUE;
112  word_res = word_res_it_to.forward();
113  }
114  if (check_debug_pt(word_res, 60))
115  debug_fix_space_level.set_value(10);
116  if (word_res->word->cblob_list()->empty())
117  prevent_null_wd_fixsp = TRUE;
118  if (prevent_null_wd_fixsp) {
119  word_res_it_from = word_res_it_to;
120  } else {
121  fuzzy_space_words.assign_to_sublist(&word_res_it_from,
122  &word_res_it_to);
123  fix_fuzzy_space_list(fuzzy_space_words,
124  row_res_it.data()->row,
125  block_res_it.data()->block);
126  new_length = fuzzy_space_words.length();
127  word_res_it_from.add_list_before(&fuzzy_space_words);
128  for (;
129  !word_res_it_from.at_last() && new_length > 0;
130  new_length--) {
131  word_res_it_from.forward();
132  }
133  }
134  if (test_pt)
135  debug_fix_space_level.set_value(0);
136  }
137  fix_sp_fp_word(word_res_it_from, row_res_it.data()->row,
138  block_res_it.data()->block);
139  // Last word in row
140  }
141  }
142  }
143 }
bool deadline_exceeded() const
Definition: ocrclass.h:158
#define TRUE
Definition: capi.h:45
int32_t inT32
Definition: host.h:38
volatile inT8 ocr_alive
true if not last
Definition: ocrclass.h:123
BLOCK_RES_LIST block_res_list
Definition: pageres.h:62
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1794
void * cancel_this
called whenever progress increases
Definition: ocrclass.h:127
int16_t inT16
Definition: host.h:36
BOOL8 combination
Definition: pageres.h:318
unsigned char BOOL8
Definition: host.h:44
CANCEL_FUNC cancel
for errcode use
Definition: ocrclass.h:125
void fix_sp_fp_word(WERD_RES_IT &word_res_it, ROW *row, BLOCK *block)
Definition: fixspace.cpp:535
void fix_fuzzy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:145
inT16 progress
chars in this buffer(0)
Definition: ocrclass.h:118

◆ fix_noisy_space_list()

void tesseract::Tesseract::fix_noisy_space_list ( WERD_RES_LIST &  best_perm,
ROW * row,
BLOCK * block
)

Definition at line 569 of file fixspace.cpp.

570  {
571  inT16 best_score;
572  WERD_RES_IT best_perm_it(&best_perm);
573  WERD_RES_LIST current_perm;
574  WERD_RES_IT current_perm_it(&current_perm);
575  WERD_RES *old_word_res;
576  inT16 current_score;
577  BOOL8 improved = FALSE;
578 
579  best_score = fp_eval_word_spacing(best_perm); // default score
580 
581  dump_words(best_perm, best_score, 1, improved);
582 
583  old_word_res = best_perm_it.data();
584  // Even deep_copy doesn't copy the underlying WERD unless its combination
585  // flag is true!.
586  old_word_res->combination = TRUE; // Kludge to force deep copy
587  current_perm_it.add_to_end(WERD_RES::deep_copy(old_word_res));
588  old_word_res->combination = FALSE; // Undo kludge
589 
590  break_noisiest_blob_word(current_perm);
591 
592  while (best_score != PERFECT_WERDS && !current_perm.empty()) {
593  match_current_words(current_perm, row, block);
594  current_score = fp_eval_word_spacing(current_perm);
595  dump_words(current_perm, current_score, 2, improved);
596  if (current_score > best_score) {
597  best_perm.clear();
598  best_perm.deep_copy(&current_perm, &WERD_RES::deep_copy);
599  best_score = current_score;
600  improved = TRUE;
601  }
602  if (current_score < PERFECT_WERDS) {
603  break_noisiest_blob_word(current_perm);
604  }
605  }
606  dump_words(best_perm, best_score, 3, improved);
607 }
#define TRUE
Definition: capi.h:45
#define PERFECT_WERDS
Definition: fixspace.cpp:33
void match_current_words(WERD_RES_LIST &words, ROW *row, BLOCK *block)
Definition: fixspace.cpp:196
int16_t inT16
Definition: host.h:36
BOOL8 combination
Definition: pageres.h:318
unsigned char BOOL8
Definition: host.h:44
#define FALSE
Definition: capi.h:46
void break_noisiest_blob_word(WERD_RES_LIST &words)
Definition: fixspace.cpp:615
inT16 fp_eval_word_spacing(WERD_RES_LIST &word_res_list)
Definition: fixspace.cpp:830
void dump_words(WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved)
Definition: fixspace.cpp:449
static WERD_RES * deep_copy(const WERD_RES *src)
Definition: pageres.h:633

◆ fix_rep_char()

void tesseract::Tesseract::fix_rep_char ( PAGE_RES_IT * page_res_it)

fix_rep_char() The word is a repeated character (a leader). Find the repeated character, create the appropriate single-word or multi-word sequence according to the size of the spaces between blobs, and correct the classifications where some of the characters disagree with the majority.

Definition at line 1651 of file control.cpp.

1651  {
1652  WERD_RES *word_res = page_res_it->word();
1653  const WERD_CHOICE &word = *(word_res->best_choice);
1654 
1655  // Find the frequency of each unique character in the word.
1656  SortHelper<UNICHAR_ID> rep_ch(word.length());
1657  for (int i = 0; i < word.length(); ++i) {
1658  rep_ch.Add(word.unichar_id(i), 1);
1659  }
1660 
1661  // Find the most frequent result.
1662  UNICHAR_ID maxch_id = INVALID_UNICHAR_ID; // most common char
1663  int max_count = rep_ch.MaxCount(&maxch_id);
1664  // Find the best exemplar of a classifier result for maxch_id.
1665  BLOB_CHOICE* best_choice = FindBestMatchingChoice(maxch_id, word_res);
1666  if (best_choice == NULL) {
1667  tprintf("Failed to find a choice for %s, occurring %d times\n",
1668  word_res->uch_set->debug_str(maxch_id).string(), max_count);
1669  return;
1670  }
1671  word_res->done = TRUE;
1672 
1673  // Measure the mean space.
1674  int gap_count = 0;
1675  WERD* werd = word_res->word;
1676  C_BLOB_IT blob_it(werd->cblob_list());
1677  C_BLOB* prev_blob = blob_it.data();
1678  for (blob_it.forward(); !blob_it.at_first(); blob_it.forward()) {
1679  C_BLOB* blob = blob_it.data();
1680  int gap = blob->bounding_box().left();
1681  gap -= prev_blob->bounding_box().right();
1682  ++gap_count;
1683  prev_blob = blob;
1684  }
1685  // Just correct existing classification.
1686  CorrectRepcharChoices(best_choice, word_res);
1687  word_res->reject_map.initialise(word.length());
1688 }
#define TRUE
Definition: capi.h:45
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
int UNICHAR_ID
Definition: unichar.h:33
WERD_CHOICE * best_choice
Definition: pageres.h:219
int length() const
Definition: ratngs.h:301
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
inT16 left() const
Definition: rect.h:68
void Add(T value, int count)
Definition: sorthelper.h:65
TBOX bounding_box() const
Definition: stepblob.cpp:250
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
WERD * word
Definition: pageres.h:175
WERD_RES * word() const
Definition: pageres.h:736
const UNICHARSET * uch_set
Definition: pageres.h:192
void initialise(inT16 length)
Definition: rejctmap.cpp:318
Definition: werd.h:60
REJMAP reject_map
Definition: pageres.h:271
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:318
BOOL8 done
Definition: pageres.h:282

◆ fix_sp_fp_word()

void tesseract::Tesseract::fix_sp_fp_word ( WERD_RES_IT &  word_res_it,
ROW * row,
BLOCK * block
)

Definition at line 535 of file fixspace.cpp.

536  {
537  WERD_RES *word_res;
538  WERD_RES_LIST sub_word_list;
539  WERD_RES_IT sub_word_list_it(&sub_word_list);
540  inT16 blob_index;
541  inT16 new_length;
542  float junk;
543 
544  word_res = word_res_it.data();
545  if (word_res->word->flag(W_REP_CHAR) ||
546  word_res->combination ||
547  word_res->part_of_combo ||
548  !word_res->word->flag(W_DONT_CHOP))
549  return;
550 
551  blob_index = worst_noise_blob(word_res, &junk);
552  if (blob_index < 0)
553  return;
554 
555  if (debug_fix_space_level > 1) {
556  tprintf("FP fixspace working on \"%s\"\n",
557  word_res->best_choice->unichar_string().string());
558  }
559  word_res->word->rej_cblob_list()->sort(c_blob_comparator);
560  sub_word_list_it.add_after_stay_put(word_res_it.extract());
561  fix_noisy_space_list(sub_word_list, row, block);
562  new_length = sub_word_list.length();
563  word_res_it.add_list_before(&sub_word_list);
564  for (; !word_res_it.at_last() && new_length > 1; new_length--) {
565  word_res_it.forward();
566  }
567 }
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:680
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
int16_t inT16
Definition: host.h:36
int c_blob_comparator(const void *blob1p, const void *blob2p)
Definition: genblob.cpp:30
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
BOOL8 combination
Definition: pageres.h:318
void fix_noisy_space_list(WERD_RES_LIST &best_perm, ROW *row, BLOCK *block)
Definition: fixspace.cpp:569
BOOL8 part_of_combo
Definition: pageres.h:319
const STRING & unichar_string() const
Definition: ratngs.h:539
WERD * word
Definition: pageres.h:175
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:95

◆ fixspace_thinks_word_done()

BOOL8 tesseract::Tesseract::fixspace_thinks_word_done ( WERD_RES * word)

Definition at line 503 of file fixspace.cpp.

503  {
504  if (word->done)
505  return TRUE;
506 
507  /*
508  Use all the standard pass 2 conditions for mode 5 in set_done() in
509  reject.c BUT DON'T REJECT IF THE WERD IS AMBIGUOUS - FOR SPACING WE DON'T
510  CARE WHETHER WE HAVE of/at on/an etc.
511  */
512  if (fixsp_done_mode > 0 &&
513  (word->tess_accepted ||
514  (fixsp_done_mode == 2 && word->reject_map.reject_count() == 0) ||
515  fixsp_done_mode == 3) &&
516  (strchr(word->best_choice->unichar_string().string(), ' ') == NULL) &&
517  ((word->best_choice->permuter() == SYSTEM_DAWG_PERM) ||
518  (word->best_choice->permuter() == FREQ_DAWG_PERM) ||
519  (word->best_choice->permuter() == USER_DAWG_PERM) ||
520  (word->best_choice->permuter() == NUMBER_PERM))) {
521  return TRUE;
522  } else {
523  return FALSE;
524  }
525 }
#define TRUE
Definition: capi.h:45
WERD_CHOICE * best_choice
Definition: pageres.h:219
const char * string() const
Definition: strngs.cpp:198
uinT8 permuter() const
Definition: ratngs.h:344
inT16 reject_count()
Definition: rejctmap.h:241
#define FALSE
Definition: capi.h:46
BOOL8 tess_accepted
Definition: pageres.h:280
const STRING & unichar_string() const
Definition: ratngs.h:539
REJMAP reject_map
Definition: pageres.h:271
BOOL8 done
Definition: pageres.h:282

◆ flip_0O()

void tesseract::Tesseract::flip_0O ( WERD_RES * word)

Definition at line 673 of file reject.cpp.

673  {
674  WERD_CHOICE *best_choice = word_res->best_choice;
675  int i;
676  TBOX out_box;
677 
678  if (!tessedit_flip_0O)
679  return;
680 
681  int num_blobs = word_res->rebuild_word->NumBlobs();
682  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
683  TBLOB* blob = word_res->rebuild_word->blobs[i];
684  if (word_res->uch_set->get_isupper(best_choice->unichar_id(i)) ||
685  word_res->uch_set->get_isdigit(best_choice->unichar_id(i))) {
686  out_box = blob->bounding_box();
687  if ((out_box.top() < kBlnBaselineOffset + kBlnXHeight) ||
688  (out_box.bottom() > kBlnBaselineOffset + kBlnXHeight / 4))
689  return; //Beware words with sub/superscripts
690  }
691  }
692  UNICHAR_ID unichar_0 = word_res->uch_set->unichar_to_id("0");
693  UNICHAR_ID unichar_O = word_res->uch_set->unichar_to_id("O");
694  if (unichar_0 == INVALID_UNICHAR_ID ||
695  !word_res->uch_set->get_enabled(unichar_0) ||
696  unichar_O == INVALID_UNICHAR_ID ||
697  !word_res->uch_set->get_enabled(unichar_O)) {
698  return; // 0 or O are not present/enabled in unicharset
699  }
700  for (i = 1; i < best_choice->length(); ++i) {
701  if (best_choice->unichar_id(i) == unichar_0 ||
702  best_choice->unichar_id(i) == unichar_O) {
703  /* A0A */
704  if ((i+1) < best_choice->length() &&
705  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
706  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+1))) {
707  best_choice->set_unichar_id(unichar_O, i);
708  }
709  /* A00A */
710  if (non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
711  (i+1) < best_choice->length() &&
712  (best_choice->unichar_id(i+1) == unichar_0 ||
713  best_choice->unichar_id(i+1) == unichar_O) &&
714  (i+2) < best_choice->length() &&
715  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i+2))) {
716  best_choice->set_unichar_id(unichar_O, i);
717  i++;
718  }
719  /* AA0<non digit or end of word> */
720  if ((i > 1) &&
721  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-2)) &&
722  non_O_upper(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
723  (((i+1) < best_choice->length() &&
724  !word_res->uch_set->get_isdigit(best_choice->unichar_id(i+1)) &&
725  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "l") &&
726  !word_res->uch_set->eq(best_choice->unichar_id(i+1), "I")) ||
727  (i == best_choice->length() - 1))) {
728  best_choice->set_unichar_id(unichar_O, i);
729  }
730  /* 9O9 */
731  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
732  (i+1) < best_choice->length() &&
733  non_0_digit(*word_res->uch_set, best_choice->unichar_id(i+1))) {
734  best_choice->set_unichar_id(unichar_0, i);
735  }
736  /* 9OOO */
737  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
738  (i+2) < best_choice->length() &&
739  (best_choice->unichar_id(i+1) == unichar_0 ||
740  best_choice->unichar_id(i+1) == unichar_O) &&
741  (best_choice->unichar_id(i+2) == unichar_0 ||
742  best_choice->unichar_id(i+2) == unichar_O)) {
743  best_choice->set_unichar_id(unichar_0, i);
744  best_choice->set_unichar_id(unichar_0, i+1);
745  best_choice->set_unichar_id(unichar_0, i+2);
746  i += 2;
747  }
748  /* 9OO<non upper> */
749  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
750  (i+2) < best_choice->length() &&
751  (best_choice->unichar_id(i+1) == unichar_0 ||
752  best_choice->unichar_id(i+1) == unichar_O) &&
753  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+2))) {
754  best_choice->set_unichar_id(unichar_0, i);
755  best_choice->set_unichar_id(unichar_0, i+1);
756  i++;
757  }
758  /* 9O<non upper> */
759  if (non_0_digit(*word_res->uch_set, best_choice->unichar_id(i-1)) &&
760  (i+1) < best_choice->length() &&
761  !word_res->uch_set->get_isupper(best_choice->unichar_id(i+1))) {
762  best_choice->set_unichar_id(unichar_0, i);
763  }
764  /* 9[.,]OOO.. */
765  if ((i > 1) &&
766  (word_res->uch_set->eq(best_choice->unichar_id(i-1), ".") ||
767  word_res->uch_set->eq(best_choice->unichar_id(i-1), ",")) &&
768  (word_res->uch_set->get_isdigit(best_choice->unichar_id(i-2)) ||
769  best_choice->unichar_id(i-2) == unichar_O)) {
770  if (best_choice->unichar_id(i-2) == unichar_O) {
771  best_choice->set_unichar_id(unichar_0, i-2);
772  }
773  while (i < best_choice->length() &&
774  (best_choice->unichar_id(i) == unichar_O ||
775  best_choice->unichar_id(i) == unichar_0)) {
776  best_choice->set_unichar_id(unichar_0, i);
777  i++;
778  }
779  i--;
780  }
781  }
782  }
783 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
int UNICHAR_ID
Definition: unichar.h:33
int length() const
Definition: ratngs.h:301
const int kBlnXHeight
Definition: normalis.h:28
const int kBlnBaselineOffset
Definition: normalis.h:29
BOOL8 non_O_upper(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:785
BOOL8 non_0_digit(const UNICHARSET &ch_set, UNICHAR_ID unichar_id)
Definition: reject.cpp:789
inT16 top() const
Definition: rect.h:54
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:357
Definition: rect.h:30
Definition: blobs.h:261
inT16 bottom() const
Definition: rect.h:61
TBOX bounding_box() const
Definition: blobs.cpp:482

◆ flip_hyphens()

void tesseract::Tesseract::flip_hyphens ( WERD_RES * word)

Definition at line 616 of file reject.cpp.

616  {
617  WERD_CHOICE *best_choice = word_res->best_choice;
618  int i;
619  int prev_right = -9999;
620  int next_left;
621  TBOX out_box;
622  float aspect_ratio;
623 
625  return;
626 
627  int num_blobs = word_res->rebuild_word->NumBlobs();
628  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
629  for (i = 0; i < best_choice->length() && i < num_blobs; ++i) {
630  TBLOB* blob = word_res->rebuild_word->blobs[i];
631  out_box = blob->bounding_box();
632  if (i + 1 == num_blobs)
633  next_left = 9999;
634  else
635  next_left = word_res->rebuild_word->blobs[i + 1]->bounding_box().left();
636  // Don't touch small or touching blobs - it is too dangerous.
637  if ((out_box.width() > 8 * word_res->denorm.x_scale()) &&
638  (out_box.left() > prev_right) && (out_box.right() < next_left)) {
639  aspect_ratio = out_box.width() / (float) out_box.height();
640  if (word_res->uch_set->eq(best_choice->unichar_id(i), ".")) {
641  if (aspect_ratio >= tessedit_upper_flip_hyphen &&
642  word_res->uch_set->contains_unichar_id(unichar_dash) &&
643  word_res->uch_set->get_enabled(unichar_dash)) {
644  /* Certain HYPHEN */
645  best_choice->set_unichar_id(unichar_dash, i);
646  if (word_res->reject_map[i].rejected())
647  word_res->reject_map[i].setrej_hyphen_accept();
648  }
649  if ((aspect_ratio > tessedit_lower_flip_hyphen) &&
650  word_res->reject_map[i].accepted())
651  //Suspected HYPHEN
652  word_res->reject_map[i].setrej_hyphen ();
653  }
654  else if (best_choice->unichar_id(i) == unichar_dash) {
655  if ((aspect_ratio >= tessedit_upper_flip_hyphen) &&
656  (word_res->reject_map[i].rejected()))
657  word_res->reject_map[i].setrej_hyphen_accept();
658  //Certain HYPHEN
659 
660  if ((aspect_ratio <= tessedit_lower_flip_hyphen) &&
661  (word_res->reject_map[i].accepted()))
662  //Suspected HYPHEN
663  word_res->reject_map[i].setrej_hyphen();
664  }
665  }
666  prev_right = out_box.right();
667  }
668 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
int UNICHAR_ID
Definition: unichar.h:33
int length() const
Definition: ratngs.h:301
inT16 left() const
Definition: rect.h:68
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:357
Definition: rect.h:30
Definition: blobs.h:261
inT16 height() const
Definition: rect.h:104
inT16 right() const
Definition: rect.h:75
inT16 width() const
Definition: rect.h:111
TBOX bounding_box() const
Definition: blobs.cpp:482

◆ font_recognition_pass()

void tesseract::Tesseract::font_recognition_pass ( PAGE_RES * page_res)

font_recognition_pass

Smooth the fonts for the document.

Definition at line 1985 of file control.cpp.

1985  {
1986  PAGE_RES_IT page_res_it(page_res);
1987  WERD_RES *word; // current word
1988  STATS doc_fonts(0, font_table_size_); // font counters
1989 
1990  // Gather font id statistics.
1991  for (page_res_it.restart_page(); page_res_it.word() != NULL;
1992  page_res_it.forward()) {
1993  word = page_res_it.word();
1994  if (word->fontinfo != NULL) {
1995  doc_fonts.add(word->fontinfo->universal_id, word->fontinfo_id_count);
1996  }
1997  if (word->fontinfo2 != NULL) {
1998  doc_fonts.add(word->fontinfo2->universal_id, word->fontinfo_id2_count);
1999  }
2000  }
2001  inT16 doc_font; // modal font
2002  int8_t doc_font_count; // modal font
2003  find_modal_font(&doc_fonts, &doc_font, &doc_font_count);
2004  if (doc_font_count == 0)
2005  return;
2006  // Get the modal font pointer.
2007  const FontInfo* modal_font = NULL;
2008  for (page_res_it.restart_page(); page_res_it.word() != NULL;
2009  page_res_it.forward()) {
2010  word = page_res_it.word();
2011  if (word->fontinfo != NULL && word->fontinfo->universal_id == doc_font) {
2012  modal_font = word->fontinfo;
2013  break;
2014  }
2015  if (word->fontinfo2 != NULL && word->fontinfo2->universal_id == doc_font) {
2016  modal_font = word->fontinfo2;
2017  break;
2018  }
2019  }
2020  ASSERT_HOST(modal_font != NULL);
2021 
2022  // Assign modal font to weak words.
2023  for (page_res_it.restart_page(); page_res_it.word() != NULL;
2024  page_res_it.forward()) {
2025  word = page_res_it.word();
2026  int length = word->best_choice->length();
2027 
2028  int count = word->fontinfo_id_count;
2029  if (!(count == length || (length > 3 && count >= length * 3 / 4))) {
2030  word->fontinfo = modal_font;
2031  // Counts only get 1 as it came from the doc.
2032  word->fontinfo_id_count = 1;
2033  word->italic = modal_font->is_italic() ? 1 : -1;
2034  word->bold = modal_font->is_bold() ? 1 : -1;
2035  }
2036  }
2037 }
const FontInfo * fontinfo2
Definition: pageres.h:289
WERD_CHOICE * best_choice
Definition: pageres.h:219
int length() const
Definition: ratngs.h:301
inT8 bold
Definition: pageres.h:286
bool is_bold() const
Definition: fontinfo.h:112
inT8 italic
Definition: pageres.h:285
int16_t inT16
Definition: host.h:36
#define ASSERT_HOST(x)
Definition: errcode.h:84
const FontInfo * fontinfo
Definition: pageres.h:288
inT8 fontinfo_id2_count
Definition: pageres.h:291
inT8 fontinfo_id_count
Definition: pageres.h:290
WERD * word
Definition: pageres.h:175
Definition: statistc.h:33
bool is_italic() const
Definition: fontinfo.h:111
int count(LIST var_list)
Definition: oldlist.cpp:103

◆ fp_eval_word_spacing()

inT16 tesseract::Tesseract::fp_eval_word_spacing ( WERD_RES_LIST &  word_res_list)

Definition at line 830 of file fixspace.cpp.

830  {
831  WERD_RES_IT word_it(&word_res_list);
832  WERD_RES *word;
833  inT16 score = 0;
834  inT16 i;
835  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
836 
837  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
838  word = word_it.data();
839  if (word->rebuild_word == NULL)
840  continue; // Can't handle cube words.
841  if (word->done ||
842  word->tess_accepted ||
843  word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
844  word->best_choice->permuter() == FREQ_DAWG_PERM ||
845  word->best_choice->permuter() == USER_DAWG_PERM ||
846  safe_dict_word(word) > 0) {
847  int num_blobs = word->rebuild_word->NumBlobs();
848  UNICHAR_ID space = word->uch_set->unichar_to_id(" ");
849  for (i = 0; i < word->best_choice->length() && i < num_blobs; ++i) {
850  TBLOB* blob = word->rebuild_word->blobs[i];
851  if (word->best_choice->unichar_id(i) == space ||
852  blob_noise_score(blob) < small_limit) {
853  score -= 1; // penalise possibly erroneous non-space
854  } else if (word->reject_map[i].accepted()) {
855  score++;
856  }
857  }
858  }
859  }
860  if (score < 0)
861  score = 0;
862  return score;
863 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
int UNICHAR_ID
Definition: unichar.h:33
WERD_CHOICE * best_choice
Definition: pageres.h:219
int length() const
Definition: ratngs.h:301
const int kBlnXHeight
Definition: normalis.h:28
TWERD * rebuild_word
Definition: pageres.h:244
int16_t inT16
Definition: host.h:36
uinT8 permuter() const
Definition: ratngs.h:344
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:607
BOOL8 tess_accepted
Definition: pageres.h:280
int NumBlobs() const
Definition: blobs.h:425
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:760
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
Definition: blobs.h:261
const UNICHARSET * uch_set
Definition: pageres.h:192
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
REJMAP reject_map
Definition: pageres.h:271
BOOL8 done
Definition: pageres.h:282

◆ garbage_word()

GARBAGE_LEVEL tesseract::Tesseract::garbage_word ( WERD_RES * word,
BOOL8  ok_dict_word 
)

Definition at line 684 of file docqual.cpp.

684  {
685  enum STATES
686  {
687  JUNK,
688  FIRST_UPPER,
689  FIRST_LOWER,
690  FIRST_NUM,
691  SUBSEQUENT_UPPER,
692  SUBSEQUENT_LOWER,
693  SUBSEQUENT_NUM
694  };
695  const char *str = word->best_choice->unichar_string().string();
696  const char *lengths = word->best_choice->unichar_lengths().string();
697  STATES state = JUNK;
698  int len = 0;
699  int isolated_digits = 0;
700  int isolated_alphas = 0;
701  int bad_char_count = 0;
702  int tess_rejs = 0;
703  int dodgy_chars = 0;
704  int ok_chars;
705  UNICHAR_ID last_char = -1;
706  int alpha_repetition_count = 0;
707  int longest_alpha_repetition_count = 0;
708  int longest_lower_run_len = 0;
709  int lower_string_count = 0;
710  int longest_upper_run_len = 0;
711  int upper_string_count = 0;
712  int total_alpha_count = 0;
713  int total_digit_count = 0;
714 
715  for (; *str != '\0'; str += *(lengths++)) {
716  len++;
717  if (word->uch_set->get_isupper (str, *lengths)) {
718  total_alpha_count++;
719  switch (state) {
720  case SUBSEQUENT_UPPER:
721  case FIRST_UPPER:
722  state = SUBSEQUENT_UPPER;
723  upper_string_count++;
724  if (longest_upper_run_len < upper_string_count)
725  longest_upper_run_len = upper_string_count;
726  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
727  alpha_repetition_count++;
728  if (longest_alpha_repetition_count < alpha_repetition_count) {
729  longest_alpha_repetition_count = alpha_repetition_count;
730  }
731  }
732  else {
733  last_char = word->uch_set->unichar_to_id(str, *lengths);
734  alpha_repetition_count = 1;
735  }
736  break;
737  case FIRST_NUM:
738  isolated_digits++;
739  default:
740  state = FIRST_UPPER;
741  last_char = word->uch_set->unichar_to_id(str, *lengths);
742  alpha_repetition_count = 1;
743  upper_string_count = 1;
744  break;
745  }
746  }
747  else if (word->uch_set->get_islower (str, *lengths)) {
748  total_alpha_count++;
749  switch (state) {
750  case SUBSEQUENT_LOWER:
751  case FIRST_LOWER:
752  state = SUBSEQUENT_LOWER;
753  lower_string_count++;
754  if (longest_lower_run_len < lower_string_count)
755  longest_lower_run_len = lower_string_count;
756  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
757  alpha_repetition_count++;
758  if (longest_alpha_repetition_count < alpha_repetition_count) {
759  longest_alpha_repetition_count = alpha_repetition_count;
760  }
761  }
762  else {
763  last_char = word->uch_set->unichar_to_id(str, *lengths);
764  alpha_repetition_count = 1;
765  }
766  break;
767  case FIRST_NUM:
768  isolated_digits++;
769  default:
770  state = FIRST_LOWER;
771  last_char = word->uch_set->unichar_to_id(str, *lengths);
772  alpha_repetition_count = 1;
773  lower_string_count = 1;
774  break;
775  }
776  }
777  else if (word->uch_set->get_isdigit (str, *lengths)) {
778  total_digit_count++;
779  switch (state) {
780  case FIRST_NUM:
781  state = SUBSEQUENT_NUM;
782  case SUBSEQUENT_NUM:
783  break;
784  case FIRST_UPPER:
785  case FIRST_LOWER:
786  isolated_alphas++;
787  default:
788  state = FIRST_NUM;
789  break;
790  }
791  }
792  else {
793  if (*lengths == 1 && *str == ' ')
794  tess_rejs++;
795  else
796  bad_char_count++;
797  switch (state) {
798  case FIRST_NUM:
799  isolated_digits++;
800  break;
801  case FIRST_UPPER:
802  case FIRST_LOWER:
803  isolated_alphas++;
804  default:
805  break;
806  }
807  state = JUNK;
808  }
809  }
810 
811  switch (state) {
812  case FIRST_NUM:
813  isolated_digits++;
814  break;
815  case FIRST_UPPER:
816  case FIRST_LOWER:
817  isolated_alphas++;
818  default:
819  break;
820  }
821 
823  total_alpha_count += total_digit_count - isolated_digits;
824  }
825 
826  if (crunch_leave_ok_strings && len >= 4 &&
827  2 * (total_alpha_count - isolated_alphas) > len &&
828  longest_alpha_repetition_count < crunch_long_repetitions) {
829  if ((crunch_accept_ok &&
830  acceptable_word_string(*word->uch_set, str, lengths) !=
831  AC_UNACCEPTABLE) ||
832  longest_lower_run_len > crunch_leave_lc_strings ||
833  longest_upper_run_len > crunch_leave_uc_strings)
834  return G_NEVER_CRUNCH;
835  }
836  if (word->reject_map.length() > 1 &&
837  strpbrk(str, " ") == NULL &&
838  (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
839  word->best_choice->permuter() == FREQ_DAWG_PERM ||
840  word->best_choice->permuter() == USER_DAWG_PERM ||
841  word->best_choice->permuter() == NUMBER_PERM ||
842  acceptable_word_string(*word->uch_set, str, lengths) !=
843  AC_UNACCEPTABLE || ok_dict_word))
844  return G_OK;
845 
846  ok_chars = len - bad_char_count - isolated_digits -
847  isolated_alphas - tess_rejs;
848 
849  if (crunch_debug > 3) {
850  tprintf("garbage_word: \"%s\"\n",
851  word->best_choice->unichar_string().string());
852  tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
853  len,
854  bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
855  }
856  if (bad_char_count == 0 &&
857  tess_rejs == 0 &&
858  (len > isolated_digits + isolated_alphas || len <= 2))
859  return G_OK;
860 
861  if (tess_rejs > ok_chars ||
862  (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
863  return G_TERRIBLE;
864 
865  if (len > 4) {
866  dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
867  isolated_alphas;
868  if (dodgy_chars > 5 || (dodgy_chars / (float) len) > 0.5)
869  return G_DODGY;
870  else
871  return G_OK;
872  } else {
873  dodgy_chars = 2 * tess_rejs + bad_char_count;
874  if ((len == 4 && dodgy_chars > 2) ||
875  (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
876  return G_DODGY;
877  else
878  return G_OK;
879  }
880 }
Unacceptable word.
Definition: control.h:36
int UNICHAR_ID
Definition: unichar.h:33
WERD_CHOICE * best_choice
Definition: pageres.h:219
const STRING & unichar_lengths() const
Definition: ratngs.h:546
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
inT32 length() const
Definition: rejctmap.h:235
uinT8 permuter() const
Definition: ratngs.h:344
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
Definition: docqual.h:28
const STRING & unichar_string() const
Definition: ratngs.h:539
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1690
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:465
const UNICHARSET * uch_set
Definition: pageres.h:192
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:458
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
REJMAP reject_map
Definition: pageres.h:271

◆ get_rep_char()

UNICHAR_ID tesseract::Tesseract::get_rep_char ( WERD_RES * word)

Definition at line 283 of file output.cpp.

283  { // what char is repeated?
284  int i;
285  for (i = 0; ((i < word->reject_map.length()) &&
286  (word->reject_map[i].rejected())); ++i);
287 
288  if (i < word->reject_map.length()) {
289  return word->best_choice->unichar_id(i);
290  } else {
291  return word->uch_set->unichar_to_id(unrecognised_char.string());
292  }
293 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
WERD_CHOICE * best_choice
Definition: pageres.h:219
inT32 length() const
Definition: rejctmap.h:235
const UNICHARSET * uch_set
Definition: pageres.h:192
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
REJMAP reject_map
Definition: pageres.h:271

◆ get_sub_lang()

Tesseract* tesseract::Tesseract::get_sub_lang ( int  index) const
inline

Definition at line 256 of file tesseractclass.h.

256  {
257  return sub_langs_[index];
258  }

◆ GetLineData()

ImageData * tesseract::Tesseract::GetLineData ( const TBOX & line_box,
const GenericVector< TBOX > &  boxes,
const GenericVector< STRING > &  texts,
int  start_box,
int  end_box,
const BLOCK & block 
)

Definition at line 131 of file linerec.cpp.

135  {
136  TBOX revised_box;
137  ImageData* image_data = GetRectImage(line_box, block, kImagePadding,
138  &revised_box);
139  if (image_data == NULL) return NULL;
140  image_data->set_page_number(applybox_page);
141  // Copy the boxes and shift them so they are relative to the image.
142  FCOORD block_rotation(block.re_rotation().x(), -block.re_rotation().y());
143  ICOORD shift = -revised_box.botleft();
144  GenericVector<TBOX> line_boxes;
145  GenericVector<STRING> line_texts;
146  for (int b = start_box; b < end_box; ++b) {
147  TBOX box = boxes[b];
148  box.rotate(block_rotation);
149  box.move(shift);
150  line_boxes.push_back(box);
151  line_texts.push_back(texts[b]);
152  }
153  GenericVector<int> page_numbers;
154  page_numbers.init_to_size(line_boxes.size(), applybox_page);
155  image_data->AddBoxes(line_boxes, line_texts, page_numbers);
156  return image_data;
157 }
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
Definition: linerec.cpp:165
Definition: points.h:189
void init_to_size(int size, T t)
int push_back(T object)
const int kImagePadding
Definition: imagedata.h:37
int size() const
Definition: genericvector.h:72
Definition: rect.h:30
float y() const
Definition: points.h:212
FCOORD re_rotation() const
Definition: ocrblock.h:138
void move(const ICOORD vec)
Definition: rect.h:153
const ICOORD & botleft() const
Definition: rect.h:88
float x() const
Definition: points.h:209
void rotate(const FCOORD &vec)
Definition: rect.h:189
integer coordinate
Definition: points.h:30

◆ GetRectImage()

ImageData * tesseract::Tesseract::GetRectImage ( const TBOX & box,
const BLOCK & block,
int  padding,
TBOX * revised_box 
) const

Definition at line 165 of file linerec.cpp.

166  {
167  TBOX wbox = box;
168  wbox.pad(padding, padding);
169  *revised_box = wbox;
170  // Number of clockwise 90 degree rotations needed to get back to tesseract
171  // coords from the clipped image.
172  int num_rotations = 0;
173  if (block.re_rotation().y() > 0.0f)
174  num_rotations = 1;
175  else if (block.re_rotation().x() < 0.0f)
176  num_rotations = 2;
177  else if (block.re_rotation().y() < 0.0f)
178  num_rotations = 3;
179  // Handle two cases automatically: 1 the box came from the block, 2 the box
180  // came from a box file, and refers to the image, which the block may not.
181  if (block.bounding_box().major_overlap(*revised_box))
182  revised_box->rotate(block.re_rotation());
183  // Now revised_box always refers to the image.
184  // BestPix is never colormapped, but may be of any depth.
185  Pix* pix = BestPix();
186  int width = pixGetWidth(pix);
187  int height = pixGetHeight(pix);
188  TBOX image_box(0, 0, width, height);
189  // Clip to image bounds;
190  *revised_box &= image_box;
191  if (revised_box->null_box()) return NULL;
192  Box* clip_box = boxCreate(revised_box->left(), height - revised_box->top(),
193  revised_box->width(), revised_box->height());
194  Pix* box_pix = pixClipRectangle(pix, clip_box, NULL);
195  if (box_pix == NULL) return NULL;
196  boxDestroy(&clip_box);
197  if (num_rotations > 0) {
198  Pix* rot_pix = pixRotateOrth(box_pix, num_rotations);
199  pixDestroy(&box_pix);
200  box_pix = rot_pix;
201  }
202  // Convert sub-8-bit images to 8 bit.
203  int depth = pixGetDepth(box_pix);
204  if (depth < 8) {
205  Pix* grey;
206  grey = pixConvertTo8(box_pix, false);
207  pixDestroy(&box_pix);
208  box_pix = grey;
209  }
210  bool vertical_text = false;
211  if (num_rotations > 0) {
212  // Rotated the clipped revised box back to internal coordinates.
213  FCOORD rotation(block.re_rotation().x(), -block.re_rotation().y());
214  revised_box->rotate(rotation);
215  if (num_rotations != 2)
216  vertical_text = true;
217  }
218  return new ImageData(vertical_text, box_pix);
219 }
Definition: points.h:189
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59
inT16 left() const
Definition: rect.h:68
Pix * BestPix() const
bool null_box() const
Definition: rect.h:46
void pad(int xpad, int ypad)
Definition: rect.h:127
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
inT16 height() const
Definition: rect.h:104
float y() const
Definition: points.h:212
inT16 width() const
Definition: rect.h:111
FCOORD re_rotation() const
Definition: ocrblock.h:138
float x() const
Definition: points.h:209
void rotate(const FCOORD &vec)
Definition: rect.h:189

◆ GetSubAndSuperscriptCandidates()

void tesseract::Tesseract::GetSubAndSuperscriptCandidates ( const WERD_RES * word,
int * num_rebuilt_leading,
ScriptPos * leading_pos,
float *  leading_certainty,
int * num_rebuilt_trailing,
ScriptPos * trailing_pos,
float *  trailing_certainty,
float *  avg_certainty,
float *  unlikely_threshold 
)

Determine how many characters (rebuilt blobs) on each end of a given word might plausibly be superscripts so SubAndSuperscriptFix can try to re-recognize them. Even if we find no whole blobs at either end, we will set *unlikely_threshold to a certainty that might be used to select "bad enough" outlier characters. If *unlikely_threshold is set to 0, though, there's really no hope.

Parameters
[in] word: The word to examine.
[out] num_rebuilt_leading: The number of rebuilt blobs at the start of the word which are all up or down and seem badly classified.
[out] leading_pos: "super" or "sub" (for debugging).
[out] leading_certainty: The worst certainty in the leading blobs.
[out] num_rebuilt_trailing: The number of rebuilt blobs at the end of the word which are all up or down and seem badly classified.
[out] trailing_pos: "super" or "sub" (for debugging).
[out] trailing_certainty: The worst certainty in the trailing blobs.
[out] avg_certainty: The average certainty of "normal" blobs in the word.
[out] unlikely_threshold: The threshold (on certainty) we used to select "bad enough" outlier characters.

Definition at line 253 of file superscript.cpp.

261  {
262  *avg_certainty = *unlikely_threshold = 0.0f;
263  *num_rebuilt_leading = *num_rebuilt_trailing = 0;
264  *leading_certainty = *trailing_certainty = 0.0f;
265 
266  int super_y_bottom =
268  int sub_y_top =
270 
271  // Step one: Get an average certainty for "normally placed" characters.
272 
273  // Counts here are of blobs in the rebuild_word / unichars in best_choice.
274  *leading_pos = *trailing_pos = SP_NORMAL;
275  int leading_outliers = 0;
276  int trailing_outliers = 0;
277  int num_normal = 0;
278  float normal_certainty_total = 0.0f;
279  float worst_normal_certainty = 0.0f;
280  ScriptPos last_pos = SP_NORMAL;
281  int num_blobs = word->rebuild_word->NumBlobs();
282  for (int b = 0; b < num_blobs; ++b) {
283  TBOX box = word->rebuild_word->blobs[b]->bounding_box();
284  ScriptPos pos = SP_NORMAL;
285  if (box.bottom() >= super_y_bottom) {
286  pos = SP_SUPERSCRIPT;
287  } else if (box.top() <= sub_y_top) {
288  pos = SP_SUBSCRIPT;
289  }
290  if (pos == SP_NORMAL) {
291  if (word->best_choice->unichar_id(b) != 0) {
292  float char_certainty = word->best_choice->certainty(b);
293  if (char_certainty < worst_normal_certainty) {
294  worst_normal_certainty = char_certainty;
295  }
296  num_normal++;
297  normal_certainty_total += char_certainty;
298  }
299  if (trailing_outliers == b) {
300  leading_outliers = trailing_outliers;
301  *leading_pos = last_pos;
302  }
303  trailing_outliers = 0;
304  } else {
305  if (last_pos == pos) {
306  trailing_outliers++;
307  } else {
308  trailing_outliers = 1;
309  }
310  }
311  last_pos = pos;
312  }
313  *trailing_pos = last_pos;
314  if (num_normal >= 3) { // throw out the worst as an outlier.
315  num_normal--;
316  normal_certainty_total -= worst_normal_certainty;
317  }
318  if (num_normal > 0) {
319  *avg_certainty = normal_certainty_total / num_normal;
320  *unlikely_threshold = superscript_worse_certainty * (*avg_certainty);
321  }
322  if (num_normal == 0 ||
323  (leading_outliers == 0 && trailing_outliers == 0)) {
324  return;
325  }
326 
327  // Step two: Try to split off bits of the word that are both outliers
328  // and have much lower certainty than average
329  // Calculate num_leading and leading_certainty.
330  for (*leading_certainty = 0.0f, *num_rebuilt_leading = 0;
331  *num_rebuilt_leading < leading_outliers;
332  (*num_rebuilt_leading)++) {
333  float char_certainty = word->best_choice->certainty(*num_rebuilt_leading);
334  if (char_certainty > *unlikely_threshold) {
335  break;
336  }
337  if (char_certainty < *leading_certainty) {
338  *leading_certainty = char_certainty;
339  }
340  }
341 
342  // Calculate num_trailing and trailing_certainty.
343  for (*trailing_certainty = 0.0f, *num_rebuilt_trailing = 0;
344  *num_rebuilt_trailing < trailing_outliers;
345  (*num_rebuilt_trailing)++) {
346  int blob_idx = num_blobs - 1 - *num_rebuilt_trailing;
347  float char_certainty = word->best_choice->certainty(blob_idx);
348  if (char_certainty > *unlikely_threshold) {
349  break;
350  }
351  if (char_certainty < *trailing_certainty) {
352  *trailing_certainty = char_certainty;
353  }
354  }
355 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
WERD_CHOICE * best_choice
Definition: pageres.h:219
const int kBlnXHeight
Definition: normalis.h:28
TWERD * rebuild_word
Definition: pageres.h:244
double superscript_worse_certainty
const int kBlnBaselineOffset
Definition: normalis.h:29
float certainty() const
Definition: ratngs.h:328
int NumBlobs() const
Definition: blobs.h:425
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
inT16 bottom() const
Definition: rect.h:61

◆ ImageHeight()

int tesseract::Tesseract::ImageHeight ( ) const
inline

Definition at line 230 of file tesseractclass.h.

230  {
231  return pixGetHeight(pix_binary_);
232  }

◆ ImageWidth()

int tesseract::Tesseract::ImageWidth ( ) const
inline

Definition at line 227 of file tesseractclass.h.

227  {
228  return pixGetWidth(pix_binary_);
229  }

◆ init_recog_training()

FILE * tesseract::Tesseract::init_recog_training ( const STRING fname)

Definition at line 36 of file recogtraining.cpp.

36  {
38  tessedit_tess_adaption_mode.set_value(0); // turn off adaption
39  tessedit_enable_doc_dict.set_value(0); // turn off document dictionary
40  // Explore all segmentations.
42  }
43 
44  STRING output_fname = fname;
45  const char *lastdot = strrchr(output_fname.string(), '.');
46  if (lastdot != NULL) output_fname[lastdot - output_fname.string()] = '\0';
47  output_fname += ".txt";
48  FILE *output_file = open_file(output_fname.string(), "a+");
49  return output_file;
50 }
Dict & getDict()
Definition: classify.h:65
const char * string() const
Definition: strngs.cpp:198
FILE * open_file(const char *filename, const char *mode)
Definition: cutil.cpp:82
Definition: strngs.h:45
bool stopper_no_acceptable_choices
Definition: dict.h:625

◆ init_tesseract() [1/2]

int tesseract::Tesseract::init_tesseract ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params,
TessdataManager * mgr 
)

Definition at line 295 of file tessedit.cpp.

301  {
302  GenericVector<STRING> langs_to_load;
303  GenericVector<STRING> langs_not_to_load;
304  ParseLanguageString(language, &langs_to_load, &langs_not_to_load);
305 
306  sub_langs_.delete_data_pointers();
307  sub_langs_.clear();
308  // Find the first loadable lang and load into this.
309  // Add any languages that this language requires
310  bool loaded_primary = false;
311  // Load the rest into sub_langs_.
312  for (int lang_index = 0; lang_index < langs_to_load.size(); ++lang_index) {
313  if (!IsStrInList(langs_to_load[lang_index], langs_not_to_load)) {
314  const char *lang_str = langs_to_load[lang_index].string();
315  Tesseract *tess_to_init;
316  if (!loaded_primary) {
317  tess_to_init = this;
318  } else {
319  tess_to_init = new Tesseract;
320  }
321 
322  int result = tess_to_init->init_tesseract_internal(
323  arg0, textbase, lang_str, oem, configs, configs_size, vars_vec,
324  vars_values, set_only_non_debug_params, mgr);
325  // Forget that language, but keep any reader we were given.
326  mgr->Clear();
327 
328  if (!loaded_primary) {
329  if (result < 0) {
330  tprintf("Failed loading language '%s'\n", lang_str);
331  } else {
332  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
333  &langs_to_load, &langs_not_to_load);
334  loaded_primary = true;
335  }
336  } else {
337  if (result < 0) {
338  tprintf("Failed loading language '%s'\n", lang_str);
339  delete tess_to_init;
340  } else {
341  sub_langs_.push_back(tess_to_init);
342  // Add any languages that this language requires
343  ParseLanguageString(tess_to_init->tessedit_load_sublangs.string(),
344  &langs_to_load, &langs_not_to_load);
345  }
346  }
347  }
348  }
349  if (!loaded_primary) {
350  tprintf("Tesseract couldn't load any languages!\n");
351  return -1; // Couldn't load any language!
352  }
353  if (!sub_langs_.empty()) {
354  // In multilingual mode word ratings have to be directly comparable,
355  // so use the same language model weights for all languages:
356  // use the primary language's params model if
357  // tessedit_use_primary_params_model is set,
358  // otherwise use default language model weights.
360  for (int s = 0; s < sub_langs_.size(); ++s) {
361  sub_langs_[s]->language_model_->getParamsModel().Copy(
363  }
364  tprintf("Using params model of the primary language\n");
365  } else {
367  for (int s = 0; s < sub_langs_.size(); ++s) {
368  sub_langs_[s]->language_model_->getParamsModel().Clear();
369  }
370  }
371  }
372 
374  return 0;
375 }
void SetupUniversalFontIds()
Definition: tessedit.cpp:436
#define tprintf(...)
Definition: tprintf.h:31
void ParseLanguageString(const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load)
Definition: tessedit.cpp:261
int size() const
Definition: genericvector.h:72
ParamsModel & getParamsModel()
LanguageModel * language_model_
Definition: wordrec.h:410

◆ init_tesseract() [2/2]

int tesseract::Tesseract::init_tesseract ( const char *  datapath,
const char *  language,
OcrEngineMode  oem 
)
inline

Definition at line 504 of file tesseractclass.h.

506  {
507  TessdataManager mgr;
508  return init_tesseract(datapath, NULL, language, oem, NULL, 0, NULL, NULL,
509  false, &mgr);
510  }
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:295

◆ init_tesseract_internal()

int tesseract::Tesseract::init_tesseract_internal ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params,
TessdataManager * mgr 
)

Definition at line 393 of file tessedit.cpp.

399  {
400  if (!init_tesseract_lang_data(arg0, textbase, language, oem, configs,
401  configs_size, vars_vec, vars_values,
402  set_only_non_debug_params, mgr)) {
403  return -1;
404  }
406  return 0;
407  }
408  // If only LSTM will be used, skip loading Tesseract classifier's
409  // pre-trained templates and dictionary.
411  program_editup(textbase, init_tesseract ? mgr : nullptr,
412  init_tesseract ? mgr : nullptr);
413  return 0; //Normal exit
414 }
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:91
void program_editup(const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
Definition: tface.cpp:46
int init_tesseract(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:295

◆ init_tesseract_lang_data()

bool tesseract::Tesseract::init_tesseract_lang_data ( const char *  arg0,
const char *  textbase,
const char *  language,
OcrEngineMode  oem,
char **  configs,
int  configs_size,
const GenericVector< STRING > *  vars_vec,
const GenericVector< STRING > *  vars_values,
bool  set_only_init_params,
TessdataManager * mgr 
)

Definition at line 91 of file tessedit.cpp.

96  {
97  // Set the basename, compute the data directory.
98  main_setup(arg0, textbase);
99 
100  // Set the language data path prefix
101  lang = language != NULL ? language : "eng";
105 
106  // Initialize TessdataManager.
107  STRING tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
108  if (!mgr->is_loaded() && !mgr->Init(tessdata_path.string())) {
109  // Try without tessdata.
110  m_data_sub_dir.set_value("");
111  main_setup(arg0, textbase);
115  tessdata_path = language_data_path_prefix + kTrainedDataSuffix;
116  if (!mgr->Init(tessdata_path.string())) {
117  tprintf("Error opening data file %s\n", tessdata_path.string());
118  tprintf(
119  "Please make sure the TESSDATA_PREFIX environment variable is set"
120  " to your \"tessdata\" directory.\n");
121  return false;
122  }
123  }
124  if (oem == OEM_DEFAULT) {
125  // Set the engine mode from availability, which can then be overidden by
126  // the config file when we read it below.
127  if (!mgr->IsLSTMAvailable()) {
129  } else if (!mgr->IsBaseAvailable()) {
131  } else {
133  }
134  }
135 
136  // If a language specific config file (lang.config) exists, load it in.
137  TFile fp;
138  if (mgr->GetComponent(TESSDATA_LANG_CONFIG, &fp)) {
140  this->params());
141  }
142 
143  SetParamConstraint set_params_constraint = set_only_non_debug_params ?
145  // Load tesseract variables from config files. This is done after loading
146  // language-specific variables from [lang].traineddata file, so that custom
147  // config files can override values in [lang].traineddata file.
148  for (int i = 0; i < configs_size; ++i) {
149  read_config_file(configs[i], set_params_constraint);
150  }
151 
152  // Set params specified in vars_vec (done after setting params from config
153  // files, so that params in vars_vec can override those from files).
154  if (vars_vec != NULL && vars_values != NULL) {
155  for (int i = 0; i < vars_vec->size(); ++i) {
156  if (!ParamUtils::SetParam((*vars_vec)[i].string(),
157  (*vars_values)[i].string(),
158  set_params_constraint, this->params())) {
159  tprintf("Error setting param %s\n", (*vars_vec)[i].string());
160  exit(1);
161  }
162  }
163  }
164 
165  if (((STRING &)tessedit_write_params_to_file).length() > 0) {
166  FILE *params_file = fopen(tessedit_write_params_to_file.string(), "wb");
167  if (params_file != NULL) {
168  ParamUtils::PrintParams(params_file, this->params());
169  fclose(params_file);
170  } else {
171  tprintf("Failed to open %s for writing params.\n",
172  tessedit_write_params_to_file.string());
173  }
174  }
175 
176  // Determine which ocr engine(s) should be loaded and used for recognition.
177  if (oem != OEM_DEFAULT) tessedit_ocr_engine_mode.set_value(oem);
178 
179  // If we are only loading the config file (and so not planning on doing any
180  // recognition) then there's nothing else do here.
182  return true;
183  }
184 
185 // The various OcrEngineMode settings (see publictypes.h) determine which
186 // engine-specific data files need to be loaded.
187 // If LSTM_ONLY is requested, the base Tesseract files are *Not* required.
188 #ifndef ANDROID_BUILD
191  if (mgr->GetComponent(TESSDATA_LSTM, &fp)) {
192  lstm_recognizer_ = new LSTMRecognizer;
193  ASSERT_HOST(lstm_recognizer_->DeSerialize(&fp));
194  if (lstm_use_matrix) lstm_recognizer_->LoadDictionary(language, mgr);
195  } else {
196  tprintf("Error: LSTM requested, but not present!! Loading tesseract.\n");
198  }
199  }
200 #endif
201 
202  // Load the unicharset
204  // Avoid requiring a unicharset when we aren't running base tesseract.
205 #ifndef ANDROID_BUILD
206  unicharset.CopyFrom(lstm_recognizer_->GetUnicharset());
207 #endif
208  } else if (!mgr->GetComponent(TESSDATA_UNICHARSET, &fp) ||
209  !unicharset.load_from_file(&fp, false)) {
210  return false;
211  }
212  if (unicharset.size() > MAX_NUM_CLASSES) {
213  tprintf("Error: Size of unicharset is greater than MAX_NUM_CLASSES\n");
214  return false;
215  }
216  right_to_left_ = unicharset.major_right_to_left();
217 
218  // Setup initial unichar ambigs table and read universal ambigs.
219  UNICHARSET encoder_unicharset;
220  encoder_unicharset.CopyFrom(unicharset);
222  unichar_ambigs.LoadUniversal(encoder_unicharset, &unicharset);
223 
224  if (!tessedit_ambigs_training && mgr->GetComponent(TESSDATA_AMBIGS, &fp)) {
225  unichar_ambigs.LoadUnicharAmbigs(encoder_unicharset, &fp,
228  }
229  // Init ParamsModel.
230  // Load pass1 and pass2 weights (for now these two sets are the same, but in
231  // the future separate sets of weights can be generated).
232  for (int p = ParamsModel::PTRAIN_PASS1;
235  static_cast<ParamsModel::PassEnum>(p));
236  if (mgr->GetComponent(TESSDATA_PARAMS_MODEL, &fp)) {
238  return false;
239  }
240  }
241  }
242 
243  return true;
244 }
static bool ReadParamsFromFp(SetParamConstraint constraint, TFile *fp, ParamsVectors *member_params)
Definition: params.cpp:61
void LoadUniversal(const UNICHARSET &encoder_set, UNICHARSET *unicharset)
Definition: ambigs.cpp:67
ParamsVectors * params()
Definition: ccutil.h:62
bool LoadFromFp(const char *lang, TFile *fp)
const UNICHARSET & GetUnicharset() const
#define tprintf(...)
Definition: tprintf.h:31
int ambigs_debug_level
Definition: ccutil.h:85
const char * string() const
Definition: strngs.cpp:198
char * m_data_sub_dir
Definition: ccutil.h:80
STRING language_data_path_prefix
Definition: ccutil.h:67
void main_setup(const char *argv0, const char *basename)
CCUtil::main_setup - set location of tessdata and name of image.
Definition: mainblk.cpp:53
int size() const
Definition: genericvector.h:72
bool LoadDictionary(const char *lang, TessdataManager *mgr)
#define ASSERT_HOST(x)
Definition: errcode.h:84
STRING lang
Definition: ccutil.h:66
bool use_ambigs_for_adaption
Definition: ccutil.h:89
Definition: strngs.h:45
ParamsModel & getParamsModel()
UNICHARSET unicharset
Definition: ccutil.h:68
static bool SetParam(const char *name, const char *value, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:91
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:348
void read_config_file(const char *filename, SetParamConstraint constraint)
Definition: tessedit.cpp:60
SetParamConstraint
Definition: params.h:36
char * tessedit_write_params_to_file
int size() const
Definition: unicharset.h:299
STRING datadir
Definition: ccutil.h:64
LanguageModel * language_model_
Definition: wordrec.h:410
#define MAX_NUM_CLASSES
Definition: matchdefs.h:31
UnicharAmbigs unichar_ambigs
Definition: ccutil.h:69
void InitUnicharAmbigs(const UNICHARSET &unicharset, bool use_ambigs_for_adaption)
Definition: ambigs.cpp:53
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:173
void SetPass(PassEnum pass)
Definition: params_model.h:72
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:423
void LoadUnicharAmbigs(const UNICHARSET &encoder_set, TFile *ambigs_file, int debug_level, bool use_ambigs_for_adaption, UNICHARSET *unicharset)
Definition: ambigs.cpp:74
bool major_right_to_left() const
Definition: unicharset.cpp:933

◆ init_tesseract_lm()

int tesseract::Tesseract::init_tesseract_lm ( const char *  arg0,
const char *  textbase,
const char *  language,
TessdataManager mgr 
)

Definition at line 457 of file tessedit.cpp.

458  {
459  if (!init_tesseract_lang_data(arg0, textbase, language, OEM_TESSERACT_ONLY,
460  NULL, 0, NULL, NULL, false, mgr))
461  return -1;
463  getDict().Load(lang, mgr);
464  getDict().FinishLoad();
465  return 0;
466 }
Dict & getDict()
Definition: classify.h:65
bool init_tesseract_lang_data(const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr)
Definition: tessedit.cpp:91
STRING lang
Definition: ccutil.h:66
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:198
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:206
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:224
bool FinishLoad()
Definition: dict.cpp:327

◆ join_words()

void tesseract::Tesseract::join_words ( WERD_RES word,
WERD_RES word2,
BlamerBundle orig_bb 
) const

Definition at line 240 of file tfacepp.cpp.

242  {
243  TBOX prev_box = word->chopped_word->blobs.back()->bounding_box();
244  TBOX blob_box = word2->chopped_word->blobs[0]->bounding_box();
245  // Tack the word2 outputs onto the end of the word outputs.
246  word->chopped_word->blobs += word2->chopped_word->blobs;
247  word->rebuild_word->blobs += word2->rebuild_word->blobs;
248  word2->chopped_word->blobs.clear();
249  word2->rebuild_word->blobs.clear();
250  TPOINT split_pt;
251  split_pt.x = (prev_box.right() + blob_box.left()) / 2;
252  split_pt.y = (prev_box.top() + prev_box.bottom() +
253  blob_box.top() + blob_box.bottom()) / 4;
254  // Move the word2 seams onto the end of the word1 seam_array.
255  // Since the seam list is one element short, an empty seam marking the
256  // end of the last blob in the first word is needed first.
257  word->seam_array.push_back(new SEAM(0.0f, split_pt));
258  word->seam_array += word2->seam_array;
259  word2->seam_array.truncate(0);
260  // Fix widths and gaps.
261  word->blob_widths += word2->blob_widths;
262  word->blob_gaps += word2->blob_gaps;
263  // Fix the ratings matrix.
264  int rat1 = word->ratings->dimension();
265  int rat2 = word2->ratings->dimension();
266  word->ratings->AttachOnCorner(word2->ratings);
267  ASSERT_HOST(word->ratings->dimension() == rat1 + rat2);
268  word->best_state += word2->best_state;
269  // Append the word choices.
270  *word->raw_choice += *word2->raw_choice;
271 
272  // How many alt choices from each should we try to get?
273  const int kAltsPerPiece = 2;
274  // When do we start throwing away extra alt choices?
275  const int kTooManyAltChoices = 100;
276 
277  // Construct the cartesian product of the best_choices of word(1) and word2.
278  WERD_CHOICE_LIST joined_choices;
279  WERD_CHOICE_IT jc_it(&joined_choices);
280  WERD_CHOICE_IT bc1_it(&word->best_choices);
281  WERD_CHOICE_IT bc2_it(&word2->best_choices);
282  int num_word1_choices = word->best_choices.length();
283  int total_joined_choices = num_word1_choices;
284  // Nota Bene: For the main loop here, we operate only on the 2nd and greater
285  // word2 choices, and put them in the joined_choices list. The 1st word2
286  // choice gets added to the original word1 choices in-place after we have
287  // finished with them.
288  int bc2_index = 1;
289  for (bc2_it.forward(); !bc2_it.at_first(); bc2_it.forward(), ++bc2_index) {
290  if (total_joined_choices >= kTooManyAltChoices &&
291  bc2_index > kAltsPerPiece)
292  break;
293  int bc1_index = 0;
294  for (bc1_it.move_to_first(); bc1_index < num_word1_choices;
295  ++bc1_index, bc1_it.forward()) {
296  if (total_joined_choices >= kTooManyAltChoices &&
297  bc1_index > kAltsPerPiece)
298  break;
299  WERD_CHOICE *wc = new WERD_CHOICE(*bc1_it.data());
300  *wc += *bc2_it.data();
301  jc_it.add_after_then_move(wc);
302  ++total_joined_choices;
303  }
304  }
305  // Now that we've filled in as many alternates as we want, paste the best
306  // choice for word2 onto the original word alt_choices.
307  bc1_it.move_to_first();
308  bc2_it.move_to_first();
309  for (bc1_it.mark_cycle_pt(); !bc1_it.cycled_list(); bc1_it.forward()) {
310  *bc1_it.data() += *bc2_it.data();
311  }
312  bc1_it.move_to_last();
313  bc1_it.add_list_after(&joined_choices);
314 
315  // Restore the pointer to original blamer bundle and combine blamer
316  // information recorded in the splits.
317  if (orig_bb != NULL) {
318  orig_bb->JoinBlames(*word->blamer_bundle, *word2->blamer_bundle,
320  delete word->blamer_bundle;
321  word->blamer_bundle = orig_bb;
322  }
323  word->SetupBoxWord();
324  word->reject_map.initialise(word->box_word->length());
325  delete word2;
326 }
void AttachOnCorner(BandTriMatrix< T > *array2)
Definition: matrix.h:538
GenericVector< int > best_state
Definition: pageres.h:255
BlamerBundle * blamer_bundle
Definition: pageres.h:230
int push_back(T object)
TWERD * rebuild_word
Definition: pageres.h:244
GenericVector< int > blob_widths
Definition: pageres.h:205
void truncate(int size)
tesseract::BoxWord * box_word
Definition: pageres.h:250
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
MATRIX * ratings
Definition: pageres.h:215
Definition: seam.h:44
WERD_CHOICE_LIST best_choices
Definition: pageres.h:227
inT16 x
Definition: blobs.h:71
WERD_CHOICE * raw_choice
Definition: pageres.h:224
inT16 top() const
Definition: rect.h:54
void SetupBoxWord()
Definition: pageres.cpp:843
int dimension() const
Definition: matrix.h:521
inT16 y
Definition: blobs.h:72
Definition: rect.h:30
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
T & back() const
void JoinBlames(const BlamerBundle &bundle1, const BlamerBundle &bundle2, bool debug)
Definition: blamer.cpp:225
Definition: blobs.h:50
inT16 right() const
Definition: rect.h:75
GenericVector< int > blob_gaps
Definition: pageres.h:208
inT16 bottom() const
Definition: rect.h:61
bool wordrec_debug_blamer
Definition: wordrec.h:167
void initialise(inT16 length)
Definition: rejctmap.cpp:318
TBOX bounding_box() const
Definition: blobs.cpp:482
TWERD * chopped_word
Definition: pageres.h:201
REJMAP reject_map
Definition: pageres.h:271
int length() const
Definition: boxword.h:85
GenericVector< SEAM * > seam_array
Definition: pageres.h:203

◆ LSTMRecognizeWord()

void tesseract::Tesseract::LSTMRecognizeWord ( const BLOCK block,
ROW row,
WERD_RES word,
PointerVector< WERD_RES > *  words 
)

Definition at line 224 of file linerec.cpp.

225  {
226  TBOX word_box = word->word->bounding_box();
227  // Get the word image - no frills.
230  // In single word mode, use the whole image without any other row/word
231  // interpretation.
232  word_box = TBOX(0, 0, ImageWidth(), ImageHeight());
233  } else {
234  float baseline = row->base_line((word_box.left() + word_box.right()) / 2);
235  if (baseline + row->descenders() < word_box.bottom())
236  word_box.set_bottom(baseline + row->descenders());
237  if (baseline + row->x_height() + row->ascenders() > word_box.top())
238  word_box.set_top(baseline + row->x_height() + row->ascenders());
239  }
240  ImageData* im_data = GetRectImage(word_box, block, kImagePadding, &word_box);
241  if (im_data == NULL) return;
242  lstm_recognizer_->RecognizeLine(*im_data, true, classify_debug_level > 0,
244  lstm_use_matrix, &unicharset, word_box, 2.0,
245  false, words);
246  delete im_data;
247  SearchWords(words);
248 }
ImageData * GetRectImage(const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const
Definition: linerec.cpp:165
void RecognizeLine(const ImageData &image_data, bool invert, bool debug, double worst_dict_cert, bool use_alternates, const UNICHARSET *target_unicharset, const TBOX &line_box, float score_ratio, bool one_word, PointerVector< WERD_RES > *words)
Treat the image as a single word.
Definition: publictypes.h:162
const int kImagePadding
Definition: imagedata.h:37
float x_height() const
Definition: ocrrow.h:61
const float kWorstDictCertainty
Definition: linerec.cpp:40
TBOX bounding_box() const
Definition: werd.cpp:160
inT16 left() const
Definition: rect.h:68
void set_top(int y)
Definition: rect.h:57
void SearchWords(PointerVector< WERD_RES > *words)
Definition: linerec.cpp:253
const float kCertaintyScale
Definition: linerec.cpp:38
UNICHARSET unicharset
Definition: ccutil.h:68
inT16 top() const
Definition: rect.h:54
float ascenders() const
Definition: ocrrow.h:79
Definition: rect.h:30
WERD * word
Definition: pageres.h:175
inT16 right() const
Definition: rect.h:75
inT16 bottom() const
Definition: rect.h:61
void set_bottom(int y)
Definition: rect.h:64
float base_line(float xpos) const
Definition: ocrrow.h:56
float descenders() const
Definition: ocrrow.h:82

◆ make_reject_map()

void tesseract::Tesseract::make_reject_map ( WERD_RES word,
ROW row,
inT16  pass 
)

◆ match_current_words()

void tesseract::Tesseract::match_current_words ( WERD_RES_LIST &  words,
ROW row,
BLOCK block 
)

Definition at line 196 of file fixspace.cpp.

197  {
198  WERD_RES_IT word_it(&words);
199  WERD_RES *word;
200  // Since we are not using PAGE_RES to iterate over words, we need to update
201  // prev_word_best_choice_ before calling classify_word_pass2().
202  prev_word_best_choice_ = NULL;
203  for (word_it.mark_cycle_pt(); !word_it.cycled_list(); word_it.forward()) {
204  word = word_it.data();
205  if ((!word->part_of_combo) && (word->box_word == NULL)) {
206  WordData word_data(block, row, word);
207  SetupWordPassN(2, &word_data);
208  classify_word_and_language(2, NULL, &word_data);
209  }
211  }
212 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1285
tesseract::BoxWord * box_word
Definition: pageres.h:250
BOOL8 part_of_combo
Definition: pageres.h:319
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:174
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:415

◆ match_word_pass_n()

void tesseract::Tesseract::match_word_pass_n ( int  pass_n,
WERD_RES word,
ROW row,
BLOCK block 
)

match_word_pass_n

Baseline-normalize the word and pass it to Tess for the given pass (pass_n).

Definition at line 1576 of file control.cpp.

1577  {
1578  if (word->tess_failed) return;
1579  tess_segment_pass_n(pass_n, word);
1580 
1581  if (!word->tess_failed) {
1582  if (!word->word->flag (W_REP_CHAR)) {
1583  word->fix_quotes();
1585  word->fix_hyphens();
1586  /* Don't trust fix_quotes! - though I think I've fixed the bug */
1587  if (word->best_choice->length() != word->box_word->length()) {
1588  tprintf("POST FIX_QUOTES FAIL String:\"%s\"; Strlen=%d;"
1589  " #Blobs=%d\n",
1590  word->best_choice->debug_string().string(),
1591  word->best_choice->length(),
1592  word->box_word->length());
1593 
1594  }
1595  word->tess_accepted = tess_acceptable_word(word);
1596 
1597  // Also sets word->done flag
1598  make_reject_map(word, row, pass_n);
1599  }
1600  }
1601  set_word_fonts(word);
1602 
1603  ASSERT_HOST(word->raw_choice != NULL);
1604 }
BOOL8 tess_failed
Definition: pageres.h:272
WERD_CHOICE * best_choice
Definition: pageres.h:219
int length() const
Definition: ratngs.h:301
const STRING debug_string() const
Definition: ratngs.h:503
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
void fix_hyphens()
Definition: pageres.cpp:1042
tesseract::BoxWord * box_word
Definition: pageres.h:250
void set_word_fonts(WERD_RES *word)
Definition: control.cpp:1907
#define ASSERT_HOST(x)
Definition: errcode.h:84
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
void fix_quotes()
Definition: pageres.cpp:1013
bool tess_acceptable_word(WERD_RES *word)
Definition: tessbox.cpp:69
WERD_CHOICE * raw_choice
Definition: pageres.h:224
BOOL8 tess_accepted
Definition: pageres.h:280
WERD * word
Definition: pageres.h:175
void tess_segment_pass_n(int pass_n, WERD_RES *word)
Definition: tessbox.cpp:39
void make_reject_map(WERD_RES *word, ROW *row, inT16 pass)
int length() const
Definition: boxword.h:85

◆ MaximallyChopWord()

void tesseract::Tesseract::MaximallyChopWord ( const GenericVector< TBOX > &  boxes,
BLOCK block,
ROW row,
WERD_RES word_res 
)

Tests the chopper by exhaustively running chop_one_blob. The word_res will contain filled chopped_word, seam_array, denorm, box_word and best_state for the maximally chopped word.

Definition at line 253 of file applybox.cpp.

255  {
256  if (!word_res->SetupForRecognition(unicharset, this, BestPix(),
261  row, block)) {
262  word_res->CloneChoppedToRebuild();
263  return;
264  }
265  if (chop_debug) {
266  tprintf("Maximally chopping word at:");
267  word_res->word->bounding_box().print();
268  }
269  GenericVector<BLOB_CHOICE*> blob_choices;
270  ASSERT_HOST(!word_res->chopped_word->blobs.empty());
271  float rating = static_cast<float>(MAX_INT8);
272  for (int i = 0; i < word_res->chopped_word->NumBlobs(); ++i) {
273  // The rating and certainty are not quite arbitrary. Since
274  // select_blob_to_chop uses the worst certainty to choose, they all have
275  // to be different, so starting with MAX_INT8, subtract 1/8 for each blob
276  // in here, and then divide by e each time they are chopped, which
277  // should guarantee a set of unequal values for the whole tree of blobs
278  // produced, however much chopping is required. The chops are thus only
279  // limited by the ability of the chopper to find suitable chop points,
280  // and not by the value of the certainties.
281  BLOB_CHOICE* choice =
282  new BLOB_CHOICE(0, rating, -rating, -1, 0.0f, 0.0f, 0.0f, BCC_FAKE);
283  blob_choices.push_back(choice);
284  rating -= 0.125f;
285  }
286  const double e = exp(1.0); // The base of natural logs.
287  int blob_number;
288  int right_chop_index = 0;
290  // We only chop if the language is not fixed pitch like CJK.
291  SEAM* seam = NULL;
292  while ((seam = chop_one_blob(boxes, blob_choices, word_res,
293  &blob_number)) != NULL) {
294  word_res->InsertSeam(blob_number, seam);
295  BLOB_CHOICE* left_choice = blob_choices[blob_number];
296  rating = left_choice->rating() / e;
297  left_choice->set_rating(rating);
298  left_choice->set_certainty(-rating);
299  // combine confidence w/ serial #
300  BLOB_CHOICE* right_choice = new BLOB_CHOICE(++right_chop_index,
301  rating - 0.125f, -rating, -1,
302  0.0f, 0.0f, 0.0f, BCC_FAKE);
303  blob_choices.insert(right_choice, blob_number + 1);
304  }
305  }
306  word_res->CloneChoppedToRebuild();
307  word_res->FakeClassifyWord(blob_choices.size(), &blob_choices[0]);
308 }
bool classify_bln_numeric_mode
Definition: classify.h:499
int push_back(T object)
float rating() const
Definition: ratngs.h:79
#define tprintf(...)
Definition: tprintf.h:31
bool empty() const
Definition: genericvector.h:90
void CloneChoppedToRebuild()
Definition: pageres.cpp:828
int size() const
Definition: genericvector.h:72
TBOX bounding_box() const
Definition: werd.cpp:160
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:161
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:410
void FakeClassifyWord(int blob_count, BLOB_CHOICE **choices)
Definition: pageres.cpp:872
void insert(T t, int index)
Definition: seam.h:44
Pix * BestPix() const
void set_certainty(float newrat)
Definition: ratngs.h:150
UNICHARSET unicharset
Definition: ccutil.h:68
int NumBlobs() const
Definition: blobs.h:425
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
WERD * word
Definition: pageres.h:175
SEAM * chop_one_blob(const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE *> &blob_choices, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:373
void print() const
Definition: rect.h:270
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:294
#define MAX_INT8
Definition: host.h:60
void set_rating(float newrat)
Definition: ratngs.h:147
TWERD * chopped_word
Definition: pageres.h:201

◆ mutable_pix_binary()

Pix** tesseract::Tesseract::mutable_pix_binary ( )
inline

Definition at line 185 of file tesseractclass.h.

185  {
186  pixDestroy(&pix_binary_);
187  return &pix_binary_;
188  }

◆ mutable_textord()

Textord* tesseract::Tesseract::mutable_textord ( )
inline

Definition at line 246 of file tesseractclass.h.

246  {
247  return &textord_;
248  }

◆ nn_match_word()

void tesseract::Tesseract::nn_match_word ( WERD_RES word,
ROW row 
)

◆ nn_recover_rejects()

void tesseract::Tesseract::nn_recover_rejects ( WERD_RES word,
ROW row 
)

◆ noise_outlines()

BOOL8 tesseract::Tesseract::noise_outlines ( TWERD word)

Definition at line 982 of file docqual.cpp.

982  {
983  TBOX box; // BB of outline
984  inT16 outline_count = 0;
985  inT16 small_outline_count = 0;
986  inT16 max_dimension;
987  float small_limit = kBlnXHeight * crunch_small_outlines_size;
988 
989  for (int b = 0; b < word->NumBlobs(); ++b) {
990  TBLOB* blob = word->blobs[b];
991  for (TESSLINE* ol = blob->outlines; ol != NULL; ol = ol->next) {
992  outline_count++;
993  box = ol->bounding_box();
994  if (box.height() > box.width())
995  max_dimension = box.height();
996  else
997  max_dimension = box.width();
998  if (max_dimension < small_limit)
999  small_outline_count++;
1000  }
1001  }
1002  return small_outline_count >= outline_count;
1003 }
TESSLINE * next
Definition: blobs.h:258
TESSLINE * outlines
Definition: blobs.h:377
const int kBlnXHeight
Definition: normalis.h:28
int16_t inT16
Definition: host.h:36
int NumBlobs() const
Definition: blobs.h:425
Definition: rect.h:30
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
Definition: blobs.h:261
inT16 height() const
Definition: rect.h:104
inT16 width() const
Definition: rect.h:111

◆ non_0_digit()

BOOL8 tesseract::Tesseract::non_0_digit ( const UNICHARSET ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 789 of file reject.cpp.

789  {
790  return ch_set.get_isdigit(unichar_id) && !ch_set.eq(unichar_id, "0");
791 }
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:656

◆ non_O_upper()

BOOL8 tesseract::Tesseract::non_O_upper ( const UNICHARSET ch_set,
UNICHAR_ID  unichar_id 
)

Definition at line 785 of file reject.cpp.

785  {
786  return ch_set.get_isupper(unichar_id) && !ch_set.eq(unichar_id, "O");
787 }
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:656
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:465

◆ num_sub_langs()

int tesseract::Tesseract::num_sub_langs ( ) const
inline

Definition at line 253 of file tesseractclass.h.

253  {
254  return sub_langs_.size();
255  }

◆ one_ell_conflict()

BOOL8 tesseract::Tesseract::one_ell_conflict ( WERD_RES word_res,
BOOL8  update_map 
)

Definition at line 292 of file reject.cpp.

292  {
293  const char *word;
294  const char *lengths;
295  inT16 word_len; //its length
296  inT16 first_alphanum_index_;
297  inT16 first_alphanum_offset_;
298  inT16 i;
299  inT16 offset;
300  BOOL8 non_conflict_set_char; //non conf set a/n?
301  BOOL8 conflict = FALSE;
302  BOOL8 allow_1s;
303  ACCEPTABLE_WERD_TYPE word_type;
304  BOOL8 dict_perm_type;
305  BOOL8 dict_word_ok;
306  int dict_word_type;
307 
308  word = word_res->best_choice->unichar_string().string ();
309  lengths = word_res->best_choice->unichar_lengths().string();
310  word_len = strlen (lengths);
311  /*
312  If there are no occurrences of the conflict set characters then the word
313  is OK.
314  */
315  if (strpbrk (word, conflict_set_I_l_1.string ()) == NULL)
316  return FALSE;
317 
318  /*
319  There is a conflict if there are NO other (confirmed) alphanumerics apart
320  from those in the conflict set.
321  */
322 
323  for (i = 0, offset = 0, non_conflict_set_char = FALSE;
324  (i < word_len) && !non_conflict_set_char; offset += lengths[i++])
325  non_conflict_set_char =
326  (word_res->uch_set->get_isalpha(word + offset, lengths[i]) ||
327  word_res->uch_set->get_isdigit(word + offset, lengths[i])) &&
328  !STRING (conflict_set_I_l_1).contains (word[offset]);
329  if (!non_conflict_set_char) {
330  if (update_map)
331  reject_I_1_L(word_res);
332  return TRUE;
333  }
334 
335  /*
336  If the word is accepted by a dawg permuter, and the first alpha character
337  is "I" or "l", check to see if the alternative is also a dawg word. If it
338  is, then there is a potential error otherwise the word is ok.
339  */
340 
341  dict_perm_type = (word_res->best_choice->permuter () == SYSTEM_DAWG_PERM) ||
342  (word_res->best_choice->permuter () == USER_DAWG_PERM) ||
344  (word_res->best_choice->permuter () == DOC_DAWG_PERM)) ||
345  (word_res->best_choice->permuter () == FREQ_DAWG_PERM);
346  dict_word_type = dict_word(*(word_res->best_choice));
347  dict_word_ok = (dict_word_type > 0) &&
348  (rej_trust_doc_dawg || (dict_word_type != DOC_DAWG_PERM));
349 
350  if ((rej_1Il_use_dict_word && dict_word_ok) ||
351  (rej_1Il_trust_permuter_type && dict_perm_type) ||
352  (dict_perm_type && dict_word_ok)) {
353  first_alphanum_index_ = first_alphanum_index (word, lengths);
354  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
355  if (lengths[first_alphanum_index_] == 1 &&
356  word[first_alphanum_offset_] == 'I') {
357  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
358  if (safe_dict_word(word_res) > 0) {
359  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
360  if (update_map)
361  word_res->reject_map[first_alphanum_index_].
362  setrej_1Il_conflict();
363  return TRUE;
364  }
365  else {
366  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
367  return FALSE;
368  }
369  }
370 
371  if (lengths[first_alphanum_index_] == 1 &&
372  word[first_alphanum_offset_] == 'l') {
373  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
374  if (safe_dict_word(word_res) > 0) {
375  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
376  if (update_map)
377  word_res->reject_map[first_alphanum_index_].
378  setrej_1Il_conflict();
379  return TRUE;
380  }
381  else {
382  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
383  return FALSE;
384  }
385  }
386  return FALSE;
387  }
388 
389  /*
390  NEW 1Il code. The old code relied on permuter types too much. In fact,
391  tess will use TOP_CHOICE permute for good things like "palette".
392  In this code the string is examined independently to see if it looks like
393  a well formed word.
394  */
395 
396  /*
397  REGARDLESS OF PERMUTER, see if flipping a leading I/l generates a
398  dictionary word.
399  */
400  first_alphanum_index_ = first_alphanum_index (word, lengths);
401  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
402  if (lengths[first_alphanum_index_] == 1 &&
403  word[first_alphanum_offset_] == 'l') {
404  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
405  if (safe_dict_word(word_res) > 0)
406  return FALSE;
407  else
408  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
409  }
410  else if (lengths[first_alphanum_index_] == 1 &&
411  word[first_alphanum_offset_] == 'I') {
412  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'l';
413  if (safe_dict_word(word_res) > 0)
414  return FALSE;
415  else
416  word_res->best_choice->unichar_string()[first_alphanum_offset_] = 'I';
417  }
418  /*
419  For strings containing digits:
420  If there are no alphas OR the numeric permuter liked the word,
421  reject any non 1 conflict chs
422  Else reject all conflict chs
423  */
424  if (word_contains_non_1_digit (word, lengths)) {
425  allow_1s = (alpha_count (word, lengths) == 0) ||
426  (word_res->best_choice->permuter () == NUMBER_PERM);
427 
428  inT16 offset;
429  conflict = FALSE;
430  for (i = 0, offset = 0; word[offset] != '\0';
431  offset += word_res->best_choice->unichar_lengths()[i++]) {
432  if ((!allow_1s || (word[offset] != '1')) &&
433  STRING (conflict_set_I_l_1).contains (word[offset])) {
434  if (update_map)
435  word_res->reject_map[i].setrej_1Il_conflict ();
436  conflict = TRUE;
437  }
438  }
439  return conflict;
440  }
441  /*
442  For anything else. See if it conforms to an acceptable word type. If so,
443  treat accordingly.
444  */
445  word_type = acceptable_word_string(*word_res->uch_set, word, lengths);
446  if ((word_type == AC_LOWER_CASE) || (word_type == AC_INITIAL_CAP)) {
447  first_alphanum_index_ = first_alphanum_index (word, lengths);
448  first_alphanum_offset_ = first_alphanum_offset (word, lengths);
449  if (STRING (conflict_set_I_l_1).contains (word[first_alphanum_offset_])) {
450  if (update_map)
451  word_res->reject_map[first_alphanum_index_].
452  setrej_1Il_conflict ();
453  return TRUE;
454  }
455  else
456  return FALSE;
457  }
458  else if (word_type == AC_UPPER_CASE) {
459  return FALSE;
460  }
461  else {
462  if (update_map)
463  reject_I_1_L(word_res);
464  return TRUE;
465  }
466 }
#define TRUE
Definition: capi.h:45
inT16 first_alphanum_offset(const char *word, const char *word_lengths)
Definition: reject.cpp:482
WERD_CHOICE * best_choice
Definition: pageres.h:219
const STRING & unichar_lengths() const
Definition: ratngs.h:546
ACCEPTABLE_WERD_TYPE
Definition: control.h:34
inT16 first_alphanum_index(const char *word, const char *word_lengths)
Definition: reject.cpp:469
const char * string() const
Definition: strngs.cpp:198
voidpf uLong offset
Definition: ioapi.h:42
int16_t inT16
Definition: host.h:36
void reject_I_1_L(WERD_RES *word)
Definition: reject.cpp:191
uinT8 permuter() const
Definition: ratngs.h:344
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:607
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
BOOL8 word_contains_non_1_digit(const char *word, const char *word_lengths)
Definition: reject.cpp:509
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
unsigned char BOOL8
Definition: host.h:44
Definition: strngs.h:45
#define FALSE
Definition: capi.h:46
inT16 alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:495
BOOL8 contains(const char c) const
Definition: strngs.cpp:189
const STRING & unichar_string() const
Definition: ratngs.h:539
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1690
ALL but initial lc.
Definition: control.h:39
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:128
ALL lower case.
Definition: control.h:37
ALL upper case.
Definition: control.h:38
const UNICHARSET * uch_set
Definition: pageres.h:192
REJMAP reject_map
Definition: pageres.h:271

◆ output_pass()

void tesseract::Tesseract::output_pass ( PAGE_RES_IT page_res_it,
const TBOX target_word_box 
)

Definition at line 68 of file output.cpp.

70  {
71  BLOCK_RES *block_of_last_word;
72  BOOL8 force_eol; //During output
73  BLOCK *nextblock; //block of next word
74  WERD *nextword; //next word
75 
76  page_res_it.restart_page ();
77  block_of_last_word = NULL;
78  while (page_res_it.word () != NULL) {
79  check_debug_pt (page_res_it.word (), 120);
80 
81  if (target_word_box) {
82  TBOX current_word_box = page_res_it.word()->word->bounding_box();
83  FCOORD center_pt(
84  (current_word_box.right() + current_word_box.left()) / 2,
85  (current_word_box.bottom() + current_word_box.top()) / 2);
86  if (!target_word_box->contains(center_pt)) {
87  page_res_it.forward();
88  continue;
89  }
90  }
92  block_of_last_word != page_res_it.block ()) {
93  block_of_last_word = page_res_it.block ();
94  }
95 
96  force_eol = (tessedit_write_block_separators &&
97  (page_res_it.block () != page_res_it.next_block ())) ||
98  (page_res_it.next_word () == NULL);
99 
100  if (page_res_it.next_word () != NULL)
101  nextword = page_res_it.next_word ()->word;
102  else
103  nextword = NULL;
104  if (page_res_it.next_block () != NULL)
105  nextblock = page_res_it.next_block ()->block;
106  else
107  nextblock = NULL;
108  //regardless of tilde crunching
109  write_results(page_res_it,
110  determine_newline_type(page_res_it.word()->word,
111  page_res_it.block()->block,
112  nextword, nextblock), force_eol);
113  page_res_it.forward();
114  }
115 }
WERD_RES * next_word() const
Definition: pageres.h:745
Definition: points.h:189
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1794
BLOCK * block
Definition: pageres.h:99
TBOX bounding_box() const
Definition: werd.cpp:160
inT16 left() const
Definition: rect.h:68
WERD_RES * restart_page()
Definition: pageres.h:683
WERD_RES * forward()
Definition: pageres.h:716
unsigned char BOOL8
Definition: host.h:44
void write_results(PAGE_RES_IT &page_res_it, char newline_type, BOOL8 force_eol)
Definition: output.cpp:130
bool contains(const FCOORD pt) const
Definition: rect.h:323
inT16 top() const
Definition: rect.h:54
BLOCK_RES * next_block() const
Definition: pageres.h:751
Definition: rect.h:30
WERD * word
Definition: pageres.h:175
inT16 right() const
Definition: rect.h:75
WERD_RES * word() const
Definition: pageres.h:736
inT16 bottom() const
Definition: rect.h:61
Definition: werd.h:60
char determine_newline_type(WERD *word, BLOCK *block, WERD *next_word, BLOCK *next_block)
Definition: output.cpp:245
Definition: ocrblock.h:30
BLOCK_RES * block() const
Definition: pageres.h:742

◆ ParseLanguageString()

void tesseract::Tesseract::ParseLanguageString ( const char *  lang_str,
GenericVector< STRING > *  to_load,
GenericVector< STRING > *  not_to_load 
)

Definition at line 261 of file tessedit.cpp.

// Splits lang_str on '+' into individual language codes. A code prefixed
// with '~' is appended to not_to_load; every other code is appended to
// to_load. A code is only appended if IsStrInList() reports it is not
// already in the chosen vector.
263  {
264  STRING remains(lang_str);
265  while (remains.length() > 0) {
266  // Find the start of the lang code and which vector to add to.
267  const char* start = remains.string();
268  while (*start == '+')
269  ++start;
270  GenericVector<STRING>* target = to_load;
271  if (*start == '~') {
272  target = not_to_load;
273  ++start;
274  }
275  // Find the index of the end of the lang code in string start.
276  int end = strlen(start);
277  const char* plus = strchr(start, '+');
// When plus is non-NULL it points inside start, so plus - start is always
// smaller than strlen(start); the second test is therefore redundant.
278  if (plus != NULL && plus - start < end)
279  end = plus - start;
280  STRING lang_code(start);
281  lang_code.truncate_at(end);
// Continue with whatever follows this code (starting at the next '+', if any).
282  STRING next(start + end);
283  remains = next;
284  // Check whether lang_code is already in the target vector and add.
// NOTE(review): if the input ends in '+' (or is only "+"/"~"), lang_code is
// empty here and an empty string appears to be appended -- confirm whether
// callers tolerate that.
285  if (!IsStrInList(lang_code, *target)) {
286  target->push_back(lang_code);
287  }
288  }
289 }
int push_back(T object)
Definition: strngs.h:45

◆ pgeditor_main()

void tesseract::Tesseract::pgeditor_main ( int  width,
int  height,
PAGE_RES *  page_res 
)

pgeditor_main()

Top-level editor operation: set up a new window and a corresponding event handler.

Definition at line 337 of file pgedit.cpp.

// Stores page_res in current_page_res, builds the image window and menu,
// registers a PGEventHandler on image_win and makes the window visible.
// Returns immediately when the page has no block results.
// NOTE(review): listing lines 346-347, 354 and 361 are absent from this
// extract, so some steps between the visible statements are not shown here.
337  {
338  current_page_res = page_res;
339  if (current_page_res->block_res_list.empty())
340  return;
341 
342  recog_done = false;
343  stillRunning = true;
344 
345  build_image_window(width, height);
// The parameter editor is only created when graphics support is compiled in.
348 #ifndef GRAPHICS_DISABLED
349  pe = new ParamsEditor(this, image_win);
350 #endif
// pgEventHandler is a local object; its address is handed to image_win below
// and the handler is reset to NULL on line 362 before this function returns.
351  PGEventHandler pgEventHandler(this);
352 
353  image_win->AddEventHandler(&pgEventHandler);
355 
356  SVMenuNode* svMenuRoot = build_menu_new();
357 
358  svMenuRoot->BuildMenu(image_win);
359  image_win->SetVisible(true);
360 
362  image_win->AddEventHandler(NULL);
363 }
BLOCK_RES_LIST block_res_list
Definition: pageres.h:62
void build_image_window(int width, int height)
Definition: pgedit.cpp:193
ScrollView * image_win
Definition: pgedit.cpp:107
SVMenuNode * build_menu_new()
Definition: pgedit.cpp:257
ParamsEditor * pe
Definition: pgedit.cpp:108
BOOL8 word_set_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:945
void AddEventHandler(SVEventHandler *listener)
Add an Event Listener to this ScrollView Window.
Definition: scrollview.cpp:418
bool recog_done
Definition: pgedit.cpp:118
bool stillRunning
Definition: pgedit.cpp:109
void do_re_display(BOOL8(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
Definition: pgedit.cpp:308
PAGE_RES * current_page_res
Definition: pgedit.cpp:128
void SetVisible(bool visible)
Definition: scrollview.cpp:555
void turn_on_bit(uinT8 bit_num)
Definition: bits16.h:37
BITS16 word_display_mode
Definition: pgedit.cpp:122
void BuildMenu(ScrollView *sv, bool menu_bar=true)
Definition: svmnode.cpp:121
void AddMessageBox()
Definition: scrollview.cpp:584
SVEvent * AwaitEvent(SVEventType type)
Definition: scrollview.cpp:449

◆ pix_binary()

Pix* tesseract::Tesseract::pix_binary ( ) const
inline

Definition at line 189 of file tesseractclass.h.

// Returns the pix_binary_ member pointer unchanged.
189  {
190  return pix_binary_;
191  }

◆ pix_grey()

Pix* tesseract::Tesseract::pix_grey ( ) const
inline

Definition at line 192 of file tesseractclass.h.

// Returns the pix_grey_ member pointer unchanged.
192  {
193  return pix_grey_;
194  }

◆ pix_original()

Pix* tesseract::Tesseract::pix_original ( ) const
inline

Definition at line 199 of file tesseractclass.h.

// Returns the pix_original_ member pointer unchanged.
199 { return pix_original_; }

◆ potential_word_crunch()

BOOL8 tesseract::Tesseract::potential_word_crunch ( WERD_RES *  word,
GARBAGE_LEVEL  garbage_level,
BOOL8  ok_dict_word 
)

Definition at line 546 of file docqual.cpp.

// Counts up to three "poor quality" indicators for the word (rating per
// character, a certainty-based test, and garbage_level != G_OK) and returns
// whether that count reaches crunch_pot_indicators.
// NOTE(review): listing lines 558 and 576 are absent from this extract, so
// the start of the acceptable_word_string() call and the second half of the
// certainty condition are not visible here.
548  {
549  float rating_per_ch;
550  int adjusted_len;
551  const char *str = word->best_choice->unichar_string().string();
552  const char *lengths = word->best_choice->unichar_lengths().string();
553  BOOL8 word_crunchable;
554  int poor_indicator_count = 0;
555 
556  word_crunchable = !crunch_leave_accept_strings ||
557  word->reject_map.length() < 3 ||
559  str, lengths) == AC_UNACCEPTABLE &&
560  !ok_dict_word);
561 
// The length used to normalise the rating is capped at 10.
562  adjusted_len = word->reject_map.length();
563  if (adjusted_len > 10)
564  adjusted_len = 10;
// NOTE(review): adjusted_len is not checked for 0 before the division below
// -- confirm that reject_map is never empty when this is called.
565  rating_per_ch = word->best_choice->rating() / adjusted_len;
566 
567  if (rating_per_ch > crunch_pot_poor_rate) {
568  if (crunch_debug > 2) {
569  tprintf("Potential poor rating on \"%s\"\n",
570  word->best_choice->unichar_string().string());
571  }
572  poor_indicator_count++;
573  }
574 
575  if (word_crunchable &&
577  if (crunch_debug > 2) {
578  tprintf("Potential poor cert on \"%s\"\n",
579  word->best_choice->unichar_string().string());
580  }
581  poor_indicator_count++;
582  }
583 
584  if (garbage_level != G_OK) {
585  if (crunch_debug > 2) {
586  tprintf("Potential garbage on \"%s\"\n",
587  word->best_choice->unichar_string().string());
588  }
589  poor_indicator_count++;
590  }
591  return poor_indicator_count >= crunch_pot_indicators;
592 }
Unacceptable word.
Definition: control.h:36
WERD_CHOICE * best_choice
Definition: pageres.h:219
const STRING & unichar_lengths() const
Definition: ratngs.h:546
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
inT32 length() const
Definition: rejctmap.h:235
unsigned char BOOL8
Definition: host.h:44
Definition: docqual.h:28
float certainty() const
Definition: ratngs.h:328
const STRING & unichar_string() const
Definition: ratngs.h:539
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1690
const UNICHARSET * uch_set
Definition: pageres.h:192
REJMAP reject_map
Definition: pageres.h:271
float rating() const
Definition: ratngs.h:325

◆ PreenXHeights()

void tesseract::Tesseract::PreenXHeights ( BLOCK_LIST *  block_list)

Any row xheight that is significantly different from the median is set to the median.

Definition at line 193 of file applybox.cpp.

// Computes the median x-height via MedianXHeight(block_list) and overwrites
// the x-height of every row whose absolute difference from that median
// exceeds kMaxXHeightDeviationFraction * median.
193  {
194  double median_xheight = MedianXHeight(block_list);
195  double max_deviation = kMaxXHeightDeviationFraction * median_xheight;
196  // Visit every row of every block and reset outlying x-heights to the median.
197  BLOCK_IT b_it(block_list);
198  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
199  BLOCK* block = b_it.data();
200  ROW_IT r_it(block->row_list());
201  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
202  ROW* row = r_it.data();
203  float diff = fabs(row->x_height() - median_xheight);
204  if (diff > max_deviation) {
// Report the replaced value only when applybox_debug is set.
205  if (applybox_debug) {
206  tprintf("row xheight=%g, but median xheight = %g\n",
207  row->x_height(), median_xheight);
208  }
209  row->set_x_height(static_cast<float>(median_xheight));
210  }
211  }
212  }
213 }
#define tprintf(...)
Definition: tprintf.h:31
float x_height() const
Definition: ocrrow.h:61
const double kMaxXHeightDeviationFraction
Definition: applybox.cpp:43
void set_x_height(float new_xheight)
Definition: ocrrow.h:64
Definition: ocrrow.h:32
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:120
Definition: ocrblock.h:30

◆ PrepareForPageseg()

void tesseract::Tesseract::PrepareForPageseg ( )

Definition at line 688 of file tesseractclass.cpp.

// Picks the largest pageseg split strategy across this instance and its
// sub-languages, gives every sub-language a clone of the binary image, then
// runs the splitter and, if it produced a split image, replaces pix_binary_
// with a clone of that image.
// NOTE(review): listing lines 689, 692-693 and 696 are absent from this
// extract, so the initial value of max_pageseg_strategy and the start of the
// per-language cast expression are not visible here.
688  {
690  // Find the max splitter strategy over all langs.
691  ShiroRekhaSplitter::SplitStrategy max_pageseg_strategy =
694  for (int i = 0; i < sub_langs_.size(); ++i) {
695  ShiroRekhaSplitter::SplitStrategy pageseg_strategy =
697  static_cast<inT32>(sub_langs_[i]->pageseg_devanagari_split_strategy));
698  if (pageseg_strategy > max_pageseg_strategy)
699  max_pageseg_strategy = pageseg_strategy;
// Drop the sub-language's current binary image and share this one instead.
700  pixDestroy(&sub_langs_[i]->pix_binary_);
701  sub_langs_[i]->pix_binary_ = pixClone(pix_binary());
702  }
703  // Perform shiro-rekha (top-line) splitting and replace the current image by
704  // the newly splitted image.
705  splitter_.set_orig_pix(pix_binary());
706  splitter_.set_pageseg_split_strategy(max_pageseg_strategy);
707  if (splitter_.Split(true, &pixa_debug_)) {
708  ASSERT_HOST(splitter_.splitted_image());
709  pixDestroy(&pix_binary_);
710  pix_binary_ = pixClone(splitter_.splitted_image());
711  }
712 }
int32_t inT32
Definition: host.h:38
void set_use_cjk_fp_model(bool flag)
Definition: textord.h:95
Pix * pix_binary() const
#define ASSERT_HOST(x)
Definition: errcode.h:84
void set_pageseg_split_strategy(SplitStrategy strategy)
bool Split(bool split_for_pageseg, DebugPixa *pixa_debug)

◆ PrepareForTessOCR()

void tesseract::Tesseract::PrepareForTessOCR ( BLOCK_LIST *  block_list,
Tesseract *  osd_tess,
OSResults *  osr 
)

Definition at line 719 of file tesseractclass.cpp.

// Picks the largest OCR split strategy across this instance and its
// sub-languages, runs the splitter in OCR mode, restores pix_binary_ to a
// clone of the splitter's original image and, when the splitter reports
// different pageseg/OCR strategies, re-extracts blobs and refreshes the
// segmentation with them. The splitter is cleared at the end.
// osd_tess and osr are not referenced in the visible lines.
// NOTE(review): listing lines 723 and 727 are absent from this extract.
720  {
721  // Find the max splitter strategy over all langs.
722  ShiroRekhaSplitter::SplitStrategy max_ocr_strategy =
724  static_cast<inT32>(ocr_devanagari_split_strategy));
725  for (int i = 0; i < sub_langs_.size(); ++i) {
726  ShiroRekhaSplitter::SplitStrategy ocr_strategy =
728  static_cast<inT32>(sub_langs_[i]->ocr_devanagari_split_strategy));
729  if (ocr_strategy > max_ocr_strategy)
730  max_ocr_strategy = ocr_strategy;
731  }
732  // Utilize the segmentation information available.
733  splitter_.set_segmentation_block_list(block_list);
734  splitter_.set_ocr_split_strategy(max_ocr_strategy);
735  // Run the splitter for OCR
736  bool split_for_ocr = splitter_.Split(false, &pixa_debug_);
737  // Restore pix_binary to the binarized original pix for future reference.
738  ASSERT_HOST(splitter_.orig_pix());
739  pixDestroy(&pix_binary_);
740  pix_binary_ = pixClone(splitter_.orig_pix());
741  // If the pageseg and ocr strategies are different, refresh the block list
742  // (from the last SegmentImage call) with blobs from the real image to be used
743  // for OCR.
744  if (splitter_.HasDifferentSplitStrategies()) {
// Temporary block spanning the whole image, used only to collect blobs.
745  BLOCK block("", TRUE, 0, 0, 0, 0, pixGetWidth(pix_binary_),
746  pixGetHeight(pix_binary_));
// Use the split image if the OCR split succeeded, otherwise the original.
747  Pix* pix_for_ocr = split_for_ocr ? splitter_.splitted_image() :
748  splitter_.orig_pix();
749  extract_edges(pix_for_ocr, &block);
750  splitter_.RefreshSegmentationWithNewBlobs(block.blob_list());
751  }
752  // The splitter isn't needed any more after this, so save memory by clearing.
753  splitter_.Clear();
754 }
#define TRUE
Definition: capi.h:45
void set_segmentation_block_list(BLOCK_LIST *block_list)
int32_t inT32
Definition: host.h:38
void set_ocr_split_strategy(SplitStrategy strategy)
void RefreshSegmentationWithNewBlobs(C_BLOB_LIST *new_blobs)
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool Split(bool split_for_pageseg, DebugPixa *pixa_debug)
void extract_edges(Pix *pix, BLOCK *block)
Definition: edgblob.cpp:334
Definition: ocrblock.h:30

◆ PrerecAllWordsPar()

void tesseract::Tesseract::PrerecAllWordsPar ( const GenericVector< WordData > &  words)

Definition at line 39 of file par_control.cpp.

// Collects one BlobData entry per chopped blob for every word whose ratings
// matrix exists but has no entry at (0, 0), then calls classify_blob() on
// each collected entry and stores the result through its choices pointer.
// NOTE(review): listing line 41 (presumably the declaration of blobs) is
// absent from this extract.
39  {
40  // Prepare all the blobs.
42  for (int w = 0; w < words.size(); ++w) {
43  if (words[w].word->ratings != NULL &&
44  words[w].word->ratings->get(0, 0) == NULL) {
45  for (int s = 0; s < words[w].lang_words.size(); ++s) {
// Indices below sub_langs_.size() map to that sub-language; any further
// index maps to this instance.
46  Tesseract* sub = s < sub_langs_.size() ? sub_langs_[s] : this;
47  const WERD_RES& word = *words[w].lang_words[s];
48  for (int b = 0; b < word.chopped_word->NumBlobs(); ++b) {
49  blobs.push_back(BlobData(b, sub, word));
50  }
51  }
52  }
53  }
54  // Pre-classify all the blobs.
// Both branches run the same loop; the only difference is that the first is
// annotated with an OpenMP parallel-for (fixed at 10 threads) when _OPENMP
// is defined.
55  if (tessedit_parallelize > 1) {
56 #ifdef _OPENMP
57 #pragma omp parallel for num_threads(10)
58 #endif // _OPENMP
59  for (int b = 0; b < blobs.size(); ++b) {
60  *blobs[b].choices =
61  blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, NULL);
62  }
63  } else {
64  // TODO(AMD) parallelize this.
65  for (int b = 0; b < blobs.size(); ++b) {
66  *blobs[b].choices =
67  blobs[b].tesseract->classify_blob(blobs[b].blob, "par", White, NULL);
68  }
69  }
70 }
Definition: callcpp.h:34
int push_back(T object)
int size() const
Definition: genericvector.h:72
T & get(int index) const
int NumBlobs() const
Definition: blobs.h:425
TWERD * chopped_word
Definition: pageres.h:201

◆ process_cmd_win_event()

BOOL8 tesseract::Tesseract::process_cmd_win_event ( inT32  cmd_event,
char *  new_value 
)

Definition at line 397 of file pgedit.cpp.

// Handles a command-window event identified by cmd_event, with new_value
// carrying the associated text (its first character 'T' is tested for the
// toggle-style events). Returns TRUE only for QUIT_CMD_EVENT, FALSE otherwise.
// NOTE(review): many listing lines (case labels and the statements inside
// several branches) are absent from this extract, so the visible text is not
// the complete function body.
400  {
401  char msg[160];
402  BOOL8 exit = FALSE;
403 
404  color_mode = CM_RAINBOW;
405 
406  // Run recognition on the full page if needed.
407  switch (cmd_event) {
408  case BLAMER_CMD_EVENT:
412  case SHOW_BOLD_CMD_EVENT:
// Recognition is run at most once; recog_done remembers that it happened.
418  if (!recog_done) {
419  recog_all_words(current_page_res, NULL, NULL, NULL, 0);
420  recog_done = true;
421  }
422  break;
423  default:
424  break;
425  }
426 
427  switch (cmd_event) {
428  case NULL_CMD_EVENT:
429  break;
430 
// These events only record the event id in mode.
432  case DUMP_WERD_CMD_EVENT:
435  case RECOG_WERDS:
436  case RECOG_PSEUDO:
437  case SHOW_BLOB_FEATURES:
438  mode =(CMD_EVENTS) cmd_event;
439  break;
442  word_config_ = image_win->ShowInputDialog("Config File Name");
443  break;
445  if (new_value[0] == 'T')
447  else
450  break;
451  case BLAMER_CMD_EVENT:
452  if (new_value[0] == 'T')
454  else
458  break;
460  if (new_value[0] == 'T')
462  else
465  break;
466  case POLYGONAL_CMD_EVENT:
467  if (new_value[0] == 'T')
469  else
472  break;
473  case BL_NORM_CMD_EVENT:
474  if (new_value[0] == 'T')
476  else
479  break;
480  case BITMAP_CMD_EVENT:
481  if (new_value[0] == 'T')
483  else
486  break;
489  break;
490  case IMAGE_CMD_EVENT:
491  display_image =(new_value[0] == 'T');
493  break;
494  case BLOCKS_CMD_EVENT:
495  display_blocks =(new_value[0] == 'T');
497  break;
498  case BASELINES_CMD_EVENT:
499  display_baselines =(new_value[0] == 'T');
501  break;
503  color_mode = CM_SUBSCRIPT;
505  break;
507  color_mode = CM_SUPERSCRIPT;
509  break;
511  color_mode = CM_ITALIC;
513  break;
514  case SHOW_BOLD_CMD_EVENT:
515  color_mode = CM_BOLD;
517  break;
519  color_mode = CM_UNDERLINE;
521  break;
523  color_mode = CM_FIXEDPITCH;
525  break;
527  color_mode = CM_SERIF;
529  break;
531  color_mode = CM_SMALLCAPS;
533  break;
535  color_mode = CM_DROPCAPS;
537  break;
538  case REFRESH_CMD_EVENT:
540  break;
541  case QUIT_CMD_EVENT:
542  exit = TRUE;
544  break;
545 
546  default:
// NOTE(review): new_value is written into the 160-byte msg buffer with an
// unbounded sprintf -- confirm its length is limited by the caller.
547  sprintf(msg, "Unrecognised event %" PRId32 "(%s)", cmd_event, new_value);
548  image_win->AddMessage(msg);
549  break;
550  }
551  return exit;
552 }
#define TRUE
Definition: capi.h:45
Definition: werd.h:55
Definition: werd.h:50
BOOL8 display_baselines
Definition: pgedit.cpp:126
ScrollView * image_win
Definition: pgedit.cpp:107
void turn_off_bit(uinT8 bit_num)
Definition: bits16.h:42
char * ShowInputDialog(const char *msg)
Definition: scrollview.cpp:740
BOOL8 display_image
Definition: pgedit.cpp:124
BOOL8 word_set_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:945
unsigned char BOOL8
Definition: host.h:44
#define FALSE
Definition: capi.h:46
const char int mode
Definition: ioapi.h:38
static void Exit()
Definition: scrollview.cpp:589
bool recog_done
Definition: pgedit.cpp:118
BOOL8 word_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:760
void do_re_display(BOOL8(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it))
Definition: pgedit.cpp:308
BOOL8 display_blocks
Definition: pgedit.cpp:125
PAGE_RES * current_page_res
Definition: pgedit.cpp:128
void turn_on_bit(uinT8 bit_num)
Definition: bits16.h:37
bool recog_all_words(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses)
Definition: control.cpp:300
Definition: werd.h:51
BITS16 word_display_mode
Definition: pgedit.cpp:122
void AddMessage(const char *format,...)
Definition: scrollview.cpp:567

◆ process_image_event()

void tesseract::Tesseract::process_image_event ( const SVEvent event)

process_image_event()

User has done something in the image window - mouse down or up. Work out what it is and do something with it. If DOWN - just remember where it was. If UP - for each word in the selected area do the operation defined by the current mode.

Definition at line 564 of file pgedit.cpp.

// Handles an image-window event. Only SVET_SELECTION is acted on: the
// selection rectangle is built from the event's position and size, and the
// operation chosen by the current mode is applied to it.
// NOTE(review): many listing lines (case labels and the first/last lines of
// several process_selected_words() calls) are absent from this extract.
565  {
566  // The following variable should remain static, since it is used by
567  // debug editor, which uses a single Tesseract instance.
568  static ICOORD down;
569  ICOORD up;
570  TBOX selection_box;
571  char msg[80];
572 
573  switch(event.type) {
574 
575  case SVET_SELECTION:
// This test repeats the case label above, so it is always true here.
576  if (event.type == SVET_SELECTION) {
577  down.set_x(event.x + event.x_size);
578  down.set_y(event.y + event.y_size);
579  if (mode == SHOW_POINT_CMD_EVENT)
580  show_point(current_page_res, event.x, event.y);
581  }
582 
583  up.set_x(event.x);
584  up.set_y(event.y);
585 
586  selection_box = TBOX(down, up);
587 
588  switch(mode) {
592  selection_box,
594  break;
595  case DUMP_WERD_CMD_EVENT:
597  selection_box,
599  break;
602  selection_box,
604  break;
606  debug_word(current_page_res, selection_box);
607  break;
609  break; // ignore up event
610 
611  case RECOG_WERDS:
612  image_win->AddMessage("Recogging selected words");
614  selection_box,
616  break;
617  case RECOG_PSEUDO:
618  image_win->AddMessage("Recogging selected blobs");
619  recog_pseudo_word(current_page_res, selection_box);
620  break;
621  case SHOW_BLOB_FEATURES:
622  blob_feature_display(current_page_res, selection_box);
623  break;
624 
625  default:
626  sprintf(msg, "Mode %d not yet implemented", mode);
627  image_win->AddMessage(msg);
628  break;
629  }
// No break is visible after the inner switch, so control continues into the
// default label below, which only breaks.
630  default:
631  break;
632  }
633 }
SVEventType type
Definition: scrollview.h:64
void set_x(inT16 xin)
rewrite function
Definition: points.h:61
void blob_feature_display(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:959
void debug_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: pgedit.cpp:640
ScrollView * image_win
Definition: pgedit.cpp:107
BOOL8 word_bln_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:728
BOOL8 word_blank_and_set_display(PAGE_RES_IT *pr_its)
Definition: pgedit.cpp:716
void process_selected_words(PAGE_RES *page_res, TBOX &selection_box, BOOL8(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it))
Definition: pagewalk.cpp:30
int y_size
Definition: scrollview.h:69
BOOL8 recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:82
const char int mode
Definition: ioapi.h:38
int y
Definition: scrollview.h:67
int x
Definition: scrollview.h:66
Definition: rect.h:30
BOOL8 word_dumper(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:921
void recog_pseudo_word(PAGE_RES *page_res, TBOX &selection_box)
Definition: control.cpp:67
void show_point(PAGE_RES *page_res, float x, float y)
Definition: pgedit.cpp:654
PAGE_RES * current_page_res
Definition: pgedit.cpp:128
int x_size
Definition: scrollview.h:68
void AddMessage(const char *format,...)
Definition: scrollview.cpp:567
void set_y(inT16 yin)
rewrite function
Definition: points.h:65
integer coordinate
Definition: points.h:30

◆ process_selected_words()

void tesseract::Tesseract::process_selected_words ( PAGE_RES *  page_res,
TBOX &  selection_box,
BOOL8(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it)  word_processor 
)

Definition at line 30 of file pagewalk.cpp.

// Walks every word of page_res and calls the member function word_processor
// on each word whose bounding box overlaps selection_box. The walk stops
// as soon as word_processor returns a false value.
33  {
34  for (PAGE_RES_IT page_res_it(page_res); page_res_it.word() != NULL;
35  page_res_it.forward()) {
36  WERD* word = page_res_it.word()->word;
37  if (word->bounding_box().overlap(selection_box)) {
// Invoke the pointer-to-member on this instance with the current iterator.
38  if (!(this->*word_processor)(&page_res_it))
39  return;
40  }
41  }
42 }
bool overlap(const TBOX &box) const
Definition: rect.h:345
TBOX bounding_box() const
Definition: werd.cpp:160
WERD_RES * word() const
Definition: pageres.h:736
Definition: werd.h:60

◆ ProcessTargetWord()

bool tesseract::Tesseract::ProcessTargetWord ( const TBOX word_box,
const TBOX target_word_box,
const char *  word_config,
int  pass 
)

Definition at line 121 of file control.cpp.

// With a word_config: when word_box has a major overlap with
// target_word_box and no backup exists yet, the current params are written
// to kBackUpConfigFile and word_config is read; when there is no such
// overlap and a backup exists, the backup is read back and the backup marker
// cleared. In both cases true is returned.
// Without a word_config: returns false when pass > 1 and word_box has no
// major overlap with target_word_box, true otherwise.
// NOTE(review): listing lines 133 and 139 (the middle argument of each
// ReadParamsFile call) are absent from this extract.
124  {
125  if (word_config != NULL) {
126  if (word_box.major_overlap(target_word_box)) {
127  if (backup_config_file_ == NULL) {
128  backup_config_file_ = kBackUpConfigFile;
// NOTE(review): the fopen() result is passed to PrintParams() and fclose()
// without a NULL check -- confirm the backup file is always writable.
129  FILE* config_fp = fopen(backup_config_file_, "wb");
130  ParamUtils::PrintParams(config_fp, params());
131  fclose(config_fp);
132  ParamUtils::ReadParamsFile(word_config,
134  params());
135  }
136  } else {
137  if (backup_config_file_ != NULL) {
138  ParamUtils::ReadParamsFile(backup_config_file_,
140  params());
141  backup_config_file_ = NULL;
142  }
143  }
144  } else if (pass > 1 && !word_box.major_overlap(target_word_box)) {
145  return false;
146  }
147  return true;
148 }
ParamsVectors * params()
Definition: ccutil.h:62
const char *const kBackUpConfigFile
Definition: control.cpp:54
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:40
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
static void PrintParams(FILE *fp, const ParamsVectors *member_params)
Definition: params.cpp:173

◆ quality_based_rejection()

void tesseract::Tesseract::quality_based_rejection ( PAGE_RES_IT page_res_it,
BOOL8  good_quality_doc 
)

Definition at line 143 of file docqual.cpp.

// Runs the quality-driven rejection steps in order: optional un-rejection of
// good-quality words (only when tessedit_good_quality_unrej is set and
// good_quality_doc is true), then doc_and_block_rejection(), then
// tilde_crunch() and tilde_delete() when unlv_tilde_crunching is set.
144  {
145  if ((tessedit_good_quality_unrej && good_quality_doc))
146  unrej_good_quality_words(page_res_it);
147  doc_and_block_rejection(page_res_it, good_quality_doc);
148  if (unlv_tilde_crunching) {
149  tilde_crunch(page_res_it);
150  tilde_delete(page_res_it);
151  }
152 }
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:237
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:422
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:594
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:165

◆ read_config_file()

void tesseract::Tesseract::read_config_file ( const char *  filename,
SetParamConstraint  constraint 
)

Definition at line 60 of file tessedit.cpp.

// Resolves filename to a readable path and loads it with
// ParamUtils::ReadParamsFile(). Candidates are tried in this order:
// datadir + "configs/" + filename, datadir + "tessconfigs/" + filename,
// and finally filename on its own (used without an existence check).
61  {
62  STRING path = datadir;
63  path += "configs/";
64  path += filename;
65  FILE* fp;
// Each fopen/fclose pair is only an existence probe; the file is reopened
// by ReadParamsFile() below.
66  if ((fp = fopen(path.string(), "rb")) != NULL) {
67  fclose(fp);
68  } else {
69  path = datadir;
70  path += "tessconfigs/";
71  path += filename;
72  if ((fp = fopen(path.string(), "rb")) != NULL) {
73  fclose(fp);
74  } else {
75  path = filename;
76  }
77  }
78  ParamUtils::ReadParamsFile(path.string(), constraint, this->params());
79 }
ParamsVectors * params()
Definition: ccutil.h:62
const char * string() const
Definition: strngs.cpp:198
Definition: strngs.h:45
static bool ReadParamsFile(const char *file, SetParamConstraint constraint, ParamsVectors *member_params)
Definition: params.cpp:40
const char * filename
Definition: ioapi.h:38
STRING datadir
Definition: ccutil.h:64

◆ ReassignDiacritics()

bool tesseract::Tesseract::ReassignDiacritics ( int  pass,
PAGE_RES_IT *  pr_it,
bool *  make_next_word_fuzzy 
)

Definition at line 927 of file control.cpp.

928  {
929  *make_next_word_fuzzy = false;
930  WERD* real_word = pr_it->word()->word;
931  if (real_word->rej_cblob_list()->empty() ||
932  real_word->cblob_list()->empty() ||
933  real_word->rej_cblob_list()->length() > noise_maxperword)
934  return false;
935  real_word->rej_cblob_list()->sort(&C_BLOB::SortByXMiddle);
936  // Get the noise outlines into a vector with matching bool map.
937  GenericVector<C_OUTLINE*> outlines;
938  real_word->GetNoiseOutlines(&outlines);
939  GenericVector<bool> word_wanted;
940  GenericVector<bool> overlapped_any_blob;
941  GenericVector<C_BLOB*> target_blobs;
942  AssignDiacriticsToOverlappingBlobs(outlines, pass, real_word, pr_it,
943  &word_wanted, &overlapped_any_blob,
944  &target_blobs);
945  // Filter the outlines that overlapped any blob and put them into the word
946  // now. This simplifies the remaining task and also makes it more accurate
947  // as it has more completed blobs to work on.
948  GenericVector<bool> wanted;
949  GenericVector<C_BLOB*> wanted_blobs;
950  GenericVector<C_OUTLINE*> wanted_outlines;
951  int num_overlapped = 0;
952  int num_overlapped_used = 0;
953  for (int i = 0; i < overlapped_any_blob.size(); ++i) {
954  if (overlapped_any_blob[i]) {
955  ++num_overlapped;
956  if (word_wanted[i]) ++num_overlapped_used;
957  wanted.push_back(word_wanted[i]);
958  wanted_blobs.push_back(target_blobs[i]);
959  wanted_outlines.push_back(outlines[i]);
960  outlines[i] = NULL;
961  }
962  }
963  real_word->AddSelectedOutlines(wanted, wanted_blobs, wanted_outlines, NULL);
964  AssignDiacriticsToNewBlobs(outlines, pass, real_word, pr_it, &word_wanted,
965  &target_blobs);
966  int non_overlapped = 0;
967  int non_overlapped_used = 0;
968  for (int i = 0; i < word_wanted.size(); ++i) {
969  if (word_wanted[i]) ++non_overlapped_used;
970  if (outlines[i] != NULL) ++non_overlapped_used;
971  }
972  if (debug_noise_removal) {
973  tprintf("Used %d/%d overlapped %d/%d non-overlaped diacritics on word:",
974  num_overlapped_used, num_overlapped, non_overlapped_used,
975  non_overlapped);
976  real_word->bounding_box().print();
977  }
978  // Now we have decided which outlines we want, put them into the real_word.
979  if (real_word->AddSelectedOutlines(word_wanted, target_blobs, outlines,
980  make_next_word_fuzzy)) {
981  pr_it->MakeCurrentWordFuzzy();
982  }
983  // TODO(rays) Parts of combos have a deep copy of the real word, and need
984  // to have their noise outlines moved/assigned in the same way!!
985  return num_overlapped_used != 0 || non_overlapped_used != 0;
986 }
void AssignDiacriticsToNewBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB *> *target_blobs)
Definition: control.cpp:1046
int push_back(T object)
#define tprintf(...)
Definition: tprintf.h:31
int size() const
Definition: genericvector.h:72
TBOX bounding_box() const
Definition: werd.cpp:160
static int SortByXMiddle(const void *v1, const void *v2)
Definition: stepblob.h:119
void AssignDiacriticsToOverlappingBlobs(const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB *> *target_blobs)
Definition: control.cpp:993
void GetNoiseOutlines(GenericVector< C_OUTLINE *> *outlines)
Definition: werd.cpp:530
bool AddSelectedOutlines(const GenericVector< bool > &wanted, const GenericVector< C_BLOB *> &target_blobs, const GenericVector< C_OUTLINE *> &outlines, bool *make_next_word_fuzzy)
Definition: werd.cpp:548
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
WERD * word
Definition: pageres.h:175
void print() const
Definition: rect.h:270
C_BLOB_LIST * rej_cblob_list()
Definition: werd.h:95
WERD_RES * word() const
Definition: pageres.h:736
Definition: werd.h:60
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1484

◆ recog_all_words()

bool tesseract::Tesseract::recog_all_words ( PAGE_RES *  page_res,
ETEXT_DESC *  monitor,
const TBOX *  target_word_box,
const char *  word_config,
int  dopasses 
)

recog_all_words()

Walk the page_res, recognizing all the words. If monitor is not null, it is used as a progress monitor/timeout/cancel. If dopasses is 0, all recognition passes are run; if 1, just pass 1; if 2, pass 2 and higher. If target_word_box is not null, special things are done to words that overlap the target_word_box: if word_config is not null, the word config file is read for just the target word(s); otherwise, on pass 2 and beyond ONLY the target words are processed (Jetsoft modification). Returns false if we cancelled prematurely.

Parameters
page_res: page structure
monitor: progress monitor
word_config: word_config file
target_word_box: specifies just to extract a rectangle
dopasses: 0 - all, 1 - just pass 1, 2 - pass 2 and higher

Definition at line 300 of file control.cpp.

// Runs the recognition passes over page_res, then post-processes the result
// (optional output pass, single-row cleanup, removal of empty words).
// Returns false if RecogAllWordsPassN() returns false for pass 1 or pass 2,
// true otherwise (including the early return after pass 1 when
// dopasses == 1).
// NOTE(review): several listing lines (307, 321, 323, 335, 378, 381, 395,
// 397-398, 402-403, 418) are absent from this extract, so some conditions
// and declarations are not visible here.
304  {
305  PAGE_RES_IT page_res_it(page_res);
306 
308  tessedit_test_adaption.set_value (TRUE);
309  tessedit_minimal_rejection.set_value (TRUE);
310  }
311 
312  if (dopasses==0 || dopasses==1) {
313  page_res_it.restart_page();
314  // ****************** Pass 1 *******************
315 
316  // If the adaptive classifier is full switch to one we prepared earlier,
317  // ie on the previous page. If the current adaptive classifier is non-empty,
318  // prepare a backup starting at this page, in case it fills up. Do all this
319  // independently for each language.
320  if (AdaptiveClassifierIsFull()) {
322  } else if (!AdaptiveClassifierIsEmpty()) {
324  }
325  // Now check the sub-langs as well.
326  for (int i = 0; i < sub_langs_.size(); ++i) {
327  if (sub_langs_[i]->AdaptiveClassifierIsFull()) {
328  sub_langs_[i]->SwitchAdaptiveClassifier();
329  } else if (!sub_langs_[i]->AdaptiveClassifierIsEmpty()) {
330  sub_langs_[i]->StartBackupAdaptiveClassifier();
331  }
332  }
333  // Set up all words ready for recognition, so that if parallelism is on
334  // all the input and output classes are ready to run the classifier.
336  SetupAllWordsPassN(1, target_word_box, word_config, page_res, &words);
337  if (tessedit_parallelize) {
338  PrerecAllWordsPar(words);
339  }
340 
// Reset the per-page statistics before pass 1 runs.
341  stats_.word_count = words.size();
342 
343  stats_.dict_words = 0;
344  stats_.doc_blob_quality = 0;
345  stats_.doc_outline_errs = 0;
346  stats_.doc_char_quality = 0;
347  stats_.good_char_count = 0;
348  stats_.doc_good_char_quality = 0;
349 
350  most_recently_used_ = this;
351  // Run pass 1 word recognition.
352  if (!RecogAllWordsPassN(1, monitor, &page_res_it, &words)) return false;
353  // Pass 1 post-processing.
354  for (page_res_it.restart_page(); page_res_it.word() != NULL;
355  page_res_it.forward()) {
// Words flagged W_REP_CHAR are handed to fix_rep_char() and skip the
// counting and logging below.
356  if (page_res_it.word()->word->flag(W_REP_CHAR)) {
357  fix_rep_char(&page_res_it);
358  continue;
359  }
360 
361  // Count dict words.
362  if (page_res_it.word()->best_choice->permuter() == USER_DAWG_PERM)
363  ++(stats_.dict_words);
364 
365  // Update misadaption log (we only need to do it on pass 1, since
366  // adaption only happens on this pass).
367  if (page_res_it.word()->blamer_bundle != NULL &&
368  page_res_it.word()->blamer_bundle->misadaption_debug().length() > 0) {
369  page_res->misadaption_log.push_back(
370  page_res_it.word()->blamer_bundle->misadaption_debug());
371  }
372  }
373  }
374 
375  if (dopasses == 1) return true;
376 
377  // ****************** Pass 2 *******************
379  AnyTessLang()) {
380  page_res_it.restart_page();
382  SetupAllWordsPassN(2, target_word_box, word_config, page_res, &words);
383  if (tessedit_parallelize) {
384  PrerecAllWordsPar(words);
385  }
386  most_recently_used_ = this;
387  // Run pass 2 word recognition.
388  if (!RecogAllWordsPassN(2, monitor, &page_res_it, &words)) return false;
389  }
390 
391  // The next passes are only required for Tess-only.
392  if (AnyTessLang() && !AnyLSTMLang()) {
393  // ****************** Pass 3 *******************
394  // Fix fuzzy spaces.
396 
399  fix_fuzzy_spaces(monitor, stats_.word_count, page_res);
400 
401  // ****************** Pass 4 *******************
404 
405  // ****************** Pass 5,6 *******************
406  rejection_passes(page_res, monitor, target_word_box, word_config);
407 
408  // ****************** Pass 8 *******************
409  font_recognition_pass(page_res);
410 
411  // ****************** Pass 9 *******************
412  // Check the correctness of the final results.
413  blamer_pass(page_res);
414  script_pos_pass(page_res);
415  }
416 
417  // Write results pass.
419  // This is now redundant, but retained commented so show how to obtain
420  // bounding boxes and style information.
421 
422  // changed by jetsoft
423  // needed for dll to output memory structure
424  if ((dopasses == 0 || dopasses == 2) && (monitor || tessedit_write_unlv))
425  output_pass(page_res_it, target_word_box);
426  // end jetsoft
427  PageSegMode pageseg_mode = static_cast<PageSegMode>(
428  static_cast<int>(tessedit_pageseg_mode));
429  textord_.CleanupSingleRowResult(pageseg_mode, page_res);
430 
431  // Remove empty words, as these mess up the result iterators.
432  for (page_res_it.restart_page(); page_res_it.word() != NULL;
433  page_res_it.forward()) {
434  WERD_RES* word = page_res_it.word();
435  POLY_BLOCK* pb = page_res_it.block()->block != NULL
436  ? page_res_it.block()->block->poly_block()
437  : NULL;
// A word is deleted when it has no best choice, an empty best choice, or an
// all-space best choice in a block that has no poly block or is text.
438  if (word->best_choice == NULL || word->best_choice->length() == 0 ||
439  (word->best_choice->IsAllSpaces() && (pb == NULL || pb->IsText()))) {
440  page_res_it.DeleteCurrentWord();
441  }
442  }
443 
444  if (monitor != NULL) {
445  monitor->progress = 100;
446  }
447  return true;
448 }
void output_pass(PAGE_RES_IT &page_res_it, const TBOX *target_word_box)
Definition: output.cpp:68
#define LOC_WRITE_RESULTS
Definition: errcode.h:54
bool RecogAllWordsPassN(int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words)
Definition: control.cpp:210
#define TRUE
Definition: capi.h:45
void fix_fuzzy_spaces(ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res)
Definition: fixspace.cpp:48
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:79
WERD_CHOICE * best_choice
Definition: pageres.h:219
int length() const
Definition: ratngs.h:301
void CleanupSingleRowResult(PageSegMode pageseg_mode, PAGE_RES *page_res)
Definition: textord.cpp:325
bool IsAllSpaces() const
Definition: ratngs.h:519
void font_recognition_pass(PAGE_RES *page_res)
Definition: control.cpp:1985
void script_pos_pass(PAGE_RES *page_res)
Definition: control.cpp:718
int push_back(T object)
bool AdaptiveClassifierIsFull() const
Definition: classify.h:283
int size() const
Definition: genericvector.h:72
void bigram_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:450
bool IsText() const
Definition: polyblk.h:52
void dictionary_correction_pass(PAGE_RES *page_res)
Definition: control.cpp:2042
bool AdaptiveClassifierIsEmpty() const
Definition: classify.h:284
void blamer_pass(PAGE_RES *page_res)
Definition: control.cpp:694
bool tessedit_enable_bigram_correction
void rejection_passes(PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config)
Definition: control.cpp:598
void StartBackupAdaptiveClassifier()
Definition: adaptmatch.cpp:630
void fix_rep_char(PAGE_RES_IT *page_res_it)
Definition: control.cpp:1651
void SetupAllWordsPassN(int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words)
Definition: control.cpp:151
void SwitchAdaptiveClassifier()
Definition: adaptmatch.cpp:614
#define LOC_FUZZY_SPACE
Definition: errcode.h:50
WERD * word
Definition: pageres.h:175
bool AnyTessLang() const
inT16 progress
percent complete, increasing (0-100)
Definition: ocrclass.h:118
bool right_to_left() const
GenericVector< STRING > misadaption_log
Definition: pageres.h:73
bool AnyLSTMLang() const
void PrerecAllWordsPar(const GenericVector< WordData > &words)
Definition: par_control.cpp:39

◆ recog_interactive()

BOOL8 tesseract::Tesseract::recog_interactive ( PAGE_RES_IT * pr_it)

Recognize a single word in interactive mode.

Parameters
pr_it: the page results iterator

Definition at line 82 of file control.cpp.

82  {
83  inT16 char_qual;
84  inT16 good_char_qual;
85 
86  WordData word_data(*pr_it);
87  SetupWordPassN(2, &word_data);
88  // LSTM doesn't run on pass2, but we want to run pass2 for tesseract.
89  if (lstm_recognizer_ == NULL) {
90  classify_word_and_language(2, pr_it, &word_data);
91  } else {
92  classify_word_and_language(1, pr_it, &word_data);
93  }
95  WERD_RES* word_res = pr_it->word();
96  word_char_quality(word_res, pr_it->row()->row, &char_qual, &good_char_qual);
97  tprintf("\n%d chars; word_blob_quality: %d; outline_errs: %d; "
98  "char_quality: %d; good_char_quality: %d\n",
99  word_res->reject_map.length(),
100  word_blob_quality(word_res, pr_it->row()->row),
101  word_outline_errs(word_res), char_qual, good_char_qual);
102  }
103  return TRUE;
104 }
#define TRUE
Definition: capi.h:45
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1285
#define tprintf(...)
Definition: tprintf.h:31
ROW * row
Definition: pageres.h:127
inT16 word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:65
int16_t inT16
Definition: host.h:36
ROW_RES * row() const
Definition: pageres.h:739
inT32 length() const
Definition: rejctmap.h:235
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:97
inT16 word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:77
WERD_RES * word() const
Definition: pageres.h:736
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:174
REJMAP reject_map
Definition: pageres.h:271

◆ recog_pseudo_word()

void tesseract::Tesseract::recog_pseudo_word ( PAGE_RES * page_res,
TBOX selection_box 
)

Definition at line 67 of file control.cpp.

68  {
69  PAGE_RES_IT* it = make_pseudo_word(page_res, selection_box);
70  if (it != NULL) {
72  it->DeleteCurrentWord();
73  delete it;
74  }
75 }
PAGE_RES_IT * make_pseudo_word(PAGE_RES *page_res, const TBOX &selection_box)
Definition: werdit.cpp:31
BOOL8 recog_interactive(PAGE_RES_IT *pr_it)
Definition: control.cpp:82
void DeleteCurrentWord()
Definition: pageres.cpp:1451

◆ recog_training_segmented()

void tesseract::Tesseract::recog_training_segmented ( const STRING fname,
PAGE_RES page_res,
volatile ETEXT_DESC monitor,
FILE *  output_file 
)

Definition at line 79 of file recogtraining.cpp.

82  {
83  STRING box_fname = fname;
84  const char *lastdot = strrchr(box_fname.string(), '.');
85  if (lastdot != NULL) box_fname[lastdot - box_fname.string()] = '\0';
86  box_fname += ".box";
87  // ReadNextBox() will close box_file
88  FILE *box_file = open_file(box_fname.string(), "r");
89 
90  PAGE_RES_IT page_res_it;
91  page_res_it.page_res = page_res;
92  page_res_it.restart_page();
93  STRING label;
94 
95  // Process all the words on this page.
96  TBOX tbox; // tesseract-identified box
97  TBOX bbox; // box from the box file
98  bool keep_going;
99  int line_number = 0;
100  int examined_words = 0;
101  do {
102  keep_going = read_t(&page_res_it, &tbox);
103  keep_going &= ReadNextBox(applybox_page, &line_number, box_file, &label,
104  &bbox);
105  // Align bottom left points of the TBOXes.
106  while (keep_going &&
107  !NearlyEqual<int>(tbox.bottom(), bbox.bottom(), kMaxBoxEdgeDiff)) {
108  if (bbox.bottom() < tbox.bottom()) {
109  page_res_it.forward();
110  keep_going = read_t(&page_res_it, &tbox);
111  } else {
112  keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label,
113  &bbox);
114  }
115  }
116  while (keep_going &&
117  !NearlyEqual<int>(tbox.left(), bbox.left(), kMaxBoxEdgeDiff)) {
118  if (bbox.left() > tbox.left()) {
119  page_res_it.forward();
120  keep_going = read_t(&page_res_it, &tbox);
121  } else {
122  keep_going = ReadNextBox(applybox_page, &line_number, box_file, &label,
123  &bbox);
124  }
125  }
126  // OCR the word if top right points of the TBOXes are similar.
127  if (keep_going &&
128  NearlyEqual<int>(tbox.right(), bbox.right(), kMaxBoxEdgeDiff) &&
129  NearlyEqual<int>(tbox.top(), bbox.top(), kMaxBoxEdgeDiff)) {
130  ambigs_classify_and_output(label.string(), &page_res_it, output_file);
131  examined_words++;
132  }
133  page_res_it.forward();
134  } while (keep_going);
135 
136  // Set up scripts on all of the words that did not get sent to
137  // ambigs_classify_and_output. They all should have, but if all the
138  // werd_res's don't get uch_sets, tesseract will crash when you try
139  // to iterate over them. :-(
140  int total_words = 0;
141  for (page_res_it.restart_page(); page_res_it.block() != NULL;
142  page_res_it.forward()) {
143  if (page_res_it.word()) {
144  if (page_res_it.word()->uch_set == NULL)
145  page_res_it.word()->SetupFake(unicharset);
146  total_words++;
147  }
148  }
149  if (examined_words < 0.85 * total_words) {
150  tprintf("TODO(antonova): clean up recog_training_segmented; "
151  " It examined only a small fraction of the ambigs image.\n");
152  }
153  tprintf("recog_training_segmented: examined %d / %d words.\n",
154  examined_words, total_words);
155 }
void ambigs_classify_and_output(const char *label, PAGE_RES_IT *pr_it, FILE *output_file)
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
inT16 left() const
Definition: rect.h:68
FILE * open_file(const char *filename, const char *mode)
Definition: cutil.cpp:82
bool ReadNextBox(int *line_number, FILE *box_file, STRING *utf8_str, TBOX *bounding_box)
Definition: boxread.cpp:119
bool read_t(PAGE_RES_IT *page_res_it, TBOX *tbox)
PAGE_RES * page_res
Definition: pageres.h:661
Definition: strngs.h:45
UNICHARSET unicharset
Definition: ccutil.h:68
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
const inT16 kMaxBoxEdgeDiff
inT16 right() const
Definition: rect.h:75
inT16 bottom() const
Definition: rect.h:61

◆ recog_word()

void tesseract::Tesseract::recog_word ( WERD_RES * word)

Definition at line 46 of file tfacepp.cpp.

46  {
47  if (wordrec_skip_no_truth_words && (word->blamer_bundle == NULL ||
49  if (classify_debug_level) tprintf("No truth for word - skipping\n");
50  word->tess_failed = true;
51  return;
52  }
55  word->SetupBoxWord();
56  if (word->best_choice->length() != word->box_word->length()) {
57  tprintf("recog_word ASSERT FAIL String:\"%s\"; "
58  "Strlen=%d; #Blobs=%d\n",
59  word->best_choice->debug_string().string(),
60  word->best_choice->length(), word->box_word->length());
61  }
62  ASSERT_HOST(word->best_choice->length() == word->box_word->length());
63  // Check that the ratings matrix size matches the sum of all the
64  // segmentation states.
65  if (!word->StatesAllValid()) {
66  tprintf("Not all words have valid states relative to ratings matrix!!");
67  word->DebugWordChoices(true, NULL);
68  ASSERT_HOST(word->StatesAllValid());
69  }
71  /* Override the permuter type if a straight dictionary check disagrees. */
72  uint8_t perm_type = word->best_choice->permuter();
73  if ((perm_type != SYSTEM_DAWG_PERM) &&
74  (perm_type != FREQ_DAWG_PERM) && (perm_type != USER_DAWG_PERM)) {
75  uint8_t real_dict_perm_type = dict_word(*word->best_choice);
76  if (((real_dict_perm_type == SYSTEM_DAWG_PERM) ||
77  (real_dict_perm_type == FREQ_DAWG_PERM) ||
78  (real_dict_perm_type == USER_DAWG_PERM)) &&
80  word->best_choice->unichar_lengths().string()) > 0)) {
81  word->best_choice->set_permuter(real_dict_perm_type); // use dict perm
82  }
83  }
85  perm_type != word->best_choice->permuter()) {
86  tprintf("Permuter Type Flipped from %d to %d\n",
87  perm_type, word->best_choice->permuter());
88  }
89  }
90  // Factored out from control.cpp
91  ASSERT_HOST((word->best_choice == NULL) == (word->raw_choice == NULL));
92  if (word->best_choice == NULL || word->best_choice->length() == 0 ||
93  static_cast<int>(strspn(word->best_choice->unichar_string().string(),
94  " ")) == word->best_choice->length()) {
95  word->tess_failed = true;
96  word->reject_map.initialise(word->box_word->length());
98  } else {
99  word->tess_failed = false;
100  }
101 }
bool wordrec_skip_no_truth_words
Definition: wordrec.h:166
BOOL8 tess_failed
Definition: pageres.h:272
WERD_CHOICE * best_choice
Definition: pageres.h:219
int length() const
Definition: ratngs.h:301
const STRING & unichar_lengths() const
Definition: ratngs.h:546
BlamerBundle * blamer_bundle
Definition: pageres.h:230
const STRING debug_string() const
Definition: ratngs.h:503
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
bool StatesAllValid()
Definition: pageres.cpp:450
bool empty() const
Definition: genericvector.h:90
tesseract::BoxWord * box_word
Definition: pageres.h:250
#define ASSERT_HOST(x)
Definition: errcode.h:84
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:106
uinT8 permuter() const
Definition: ratngs.h:344
inT16 alpha_count(const char *word, const char *word_lengths)
Definition: reject.cpp:495
WERD_CHOICE * raw_choice
Definition: pageres.h:224
void SetupBoxWord()
Definition: pageres.cpp:843
const STRING & unichar_string() const
Definition: ratngs.h:539
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:110
void rej_word_tess_failure()
Definition: rejctmap.cpp:422
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:128
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:472
void initialise(inT16 length)
Definition: rejctmap.cpp:318
TWERD * chopped_word
Definition: pageres.h:201
REJMAP reject_map
Definition: pageres.h:271
void set_permuter(uinT8 perm)
Definition: ratngs.h:373
int length() const
Definition: boxword.h:85

◆ recog_word_recursive()

void tesseract::Tesseract::recog_word_recursive ( WERD_RES * word)

Definition at line 110 of file tfacepp.cpp.

110  {
111  int word_length = word->chopped_word->NumBlobs(); // no of blobs
112  if (word_length > MAX_UNDIVIDED_LENGTH) {
113  return split_and_recog_word(word);
114  }
115  cc_recog(word);
116  word_length = word->rebuild_word->NumBlobs(); // No of blobs in output.
117 
118  // Do sanity checks and minor fixes on best_choice.
119  if (word->best_choice->length() > word_length) {
120  word->best_choice->make_bad(); // should never happen
121  tprintf("recog_word: Discarded long string \"%s\""
122  " (%d characters vs %d blobs)\n",
123  word->best_choice->unichar_string().string(),
124  word->best_choice->length(), word_length);
125  tprintf("Word is at:");
126  word->word->bounding_box().print();
127  }
128  if (word->best_choice->length() < word_length) {
129  UNICHAR_ID space_id = unicharset.unichar_to_id(" ");
130  while (word->best_choice->length() < word_length) {
131  word->best_choice->append_unichar_id(space_id, 1, 0.0,
132  word->best_choice->certainty());
133  }
134  }
135 }
int UNICHAR_ID
Definition: unichar.h:33
WERD_CHOICE * best_choice
Definition: pageres.h:219
int length() const
Definition: ratngs.h:301
TWERD * rebuild_word
Definition: pageres.h:244
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:446
TBOX bounding_box() const
Definition: werd.cpp:160
void split_and_recog_word(WERD_RES *word)
Definition: tfacepp.cpp:144
UNICHARSET unicharset
Definition: ccutil.h:68
#define MAX_UNDIVIDED_LENGTH
Definition: tfacepp.cpp:35
float certainty() const
Definition: ratngs.h:328
int NumBlobs() const
Definition: blobs.h:425
const STRING & unichar_string() const
Definition: ratngs.h:539
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:441
WERD * word
Definition: pageres.h:175
void print() const
Definition: rect.h:270
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
TWERD * chopped_word
Definition: pageres.h:201
void cc_recog(WERD_RES *word)
Definition: tface.cpp:113

◆ RecogAllWordsPassN()

bool tesseract::Tesseract::RecogAllWordsPassN ( int  pass_n,
ETEXT_DESC * monitor,
PAGE_RES_IT * pr_it,
GenericVector< WordData > *  words 
)

Definition at line 210 of file control.cpp.

212  {
213  // TODO(rays) Before this loop can be parallelized (it would yield a massive
214  // speed-up) all remaining member globals need to be converted to local/heap
215  // (eg set_pass1 and set_pass2) and an intermediate adaption pass needs to be
216  // added. The results will be significantly different with adaption on, and
217  // deterioration will need investigation.
218  pr_it->restart_page();
219  for (int w = 0; w < words->size(); ++w) {
220  WordData* word = &(*words)[w];
221  if (w > 0) word->prev_word = &(*words)[w - 1];
222  if (monitor != NULL) {
223  monitor->ocr_alive = TRUE;
224  if (pass_n == 1) {
225  monitor->progress = 70 * w / words->size();
226  if (monitor->progress_callback != NULL) {
227  TBOX box = pr_it->word()->word->bounding_box();
228  (*monitor->progress_callback)(monitor->progress, box.left(),
229  box.right(), box.top(), box.bottom());
230  }
231  } else {
232  monitor->progress = 70 + 30 * w / words->size();
233  if (monitor->progress_callback != NULL) {
234  (*monitor->progress_callback)(monitor->progress, 0, 0, 0, 0);
235  }
236  }
237  if (monitor->deadline_exceeded() ||
238  (monitor->cancel != NULL && (*monitor->cancel)(monitor->cancel_this,
239  words->size()))) {
240  // Timeout. Fake out the rest of the words.
241  for (; w < words->size(); ++w) {
242  (*words)[w].word->SetupFake(unicharset);
243  }
244  return false;
245  }
246  }
247  if (word->word->tess_failed) {
248  int s;
249  for (s = 0; s < word->lang_words.size() &&
250  word->lang_words[s]->tess_failed; ++s) {}
251  // If all are failed, skip it. Image words are skipped by this test.
252  if (s > word->lang_words.size()) continue;
253  }
254  // Sync pr_it with the wth WordData.
255  while (pr_it->word() != NULL && pr_it->word() != word->word)
256  pr_it->forward();
257  ASSERT_HOST(pr_it->word() != NULL);
258  bool make_next_word_fuzzy = false;
259  if (!AnyLSTMLang() &&
260  ReassignDiacritics(pass_n, pr_it, &make_next_word_fuzzy)) {
261  // Needs to be setup again to see the new outlines in the chopped_word.
262  SetupWordPassN(pass_n, word);
263  }
264 
265  classify_word_and_language(pass_n, pr_it, word);
267  tprintf("Pass%d: %s [%s]\n", pass_n,
268  word->word->best_choice->unichar_string().string(),
269  word->word->best_choice->debug_string().string());
270  }
271  pr_it->forward();
272  if (make_next_word_fuzzy && pr_it->word() != NULL) {
273  pr_it->MakeCurrentWordFuzzy();
274  }
275  }
276  return true;
277 }
bool deadline_exceeded() const
Definition: ocrclass.h:158
#define TRUE
Definition: capi.h:45
volatile inT8 ocr_alive
true if not last
Definition: ocrclass.h:123
void classify_word_and_language(int pass_n, PAGE_RES_IT *pr_it, WordData *word_data)
Definition: control.cpp:1285
#define tprintf(...)
Definition: tprintf.h:31
void * cancel_this
called whenever progress increases
Definition: ocrclass.h:127
bool ReassignDiacritics(int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy)
Definition: control.cpp:927
int size() const
Definition: genericvector.h:72
TBOX bounding_box() const
Definition: werd.cpp:160
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
WERD_RES * restart_page()
Definition: pageres.h:683
PROGRESS_FUNC progress_callback
called whenever progress increases
Definition: ocrclass.h:126
WERD_RES * forward()
Definition: pageres.h:716
UNICHARSET unicharset
Definition: ccutil.h:68
CANCEL_FUNC cancel
returns true to cancel
Definition: ocrclass.h:125
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
WERD * word
Definition: pageres.h:175
inT16 right() const
Definition: rect.h:75
WERD_RES * word() const
Definition: pageres.h:736
inT16 bottom() const
Definition: rect.h:61
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:174
inT16 progress
percent complete, increasing (0-100)
Definition: ocrclass.h:118
void MakeCurrentWordFuzzy()
Definition: pageres.cpp:1484
bool AnyLSTMLang() const

◆ recognize_page()

void tesseract::Tesseract::recognize_page ( STRING image_name)

◆ reject_edge_blobs()

void tesseract::Tesseract::reject_edge_blobs ( WERD_RES * word)

Definition at line 263 of file reject.cpp.

263  {
264  TBOX word_box = word->word->bounding_box();
265  // Use the box_word as it is already denormed back to image coordinates.
266  int blobcount = word->box_word->length();
267 
268  if (word_box.left() < tessedit_image_border ||
269  word_box.bottom() < tessedit_image_border ||
270  word_box.right() + tessedit_image_border > ImageWidth() - 1 ||
271  word_box.top() + tessedit_image_border > ImageHeight() - 1) {
272  ASSERT_HOST(word->reject_map.length() == blobcount);
273  for (int blobindex = 0; blobindex < blobcount; blobindex++) {
274  TBOX blob_box = word->box_word->BlobBox(blobindex);
275  if (blob_box.left() < tessedit_image_border ||
276  blob_box.bottom() < tessedit_image_border ||
277  blob_box.right() + tessedit_image_border > ImageWidth() - 1 ||
278  blob_box.top() + tessedit_image_border > ImageHeight() - 1) {
279  word->reject_map[blobindex].setrej_edge_char();
280  // Close to edge
281  }
282  }
283  }
284 }
const TBOX & BlobBox(int index) const
Definition: boxword.h:86
TBOX bounding_box() const
Definition: werd.cpp:160
tesseract::BoxWord * box_word
Definition: pageres.h:250
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
inT32 length() const
Definition: rejctmap.h:235
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
WERD * word
Definition: pageres.h:175
inT16 right() const
Definition: rect.h:75
inT16 bottom() const
Definition: rect.h:61
REJMAP reject_map
Definition: pageres.h:271
int length() const
Definition: boxword.h:85

◆ reject_I_1_L()

void tesseract::Tesseract::reject_I_1_L ( WERD_RES * word)

Definition at line 191 of file reject.cpp.

191  {
192  inT16 i;
193  inT16 offset;
194 
195  for (i = 0, offset = 0; word->best_choice->unichar_string()[offset] != '\0';
196  offset += word->best_choice->unichar_lengths()[i], i += 1) {
198  contains (word->best_choice->unichar_string()[offset])) {
199  //rej 1Il conflict
200  word->reject_map[i].setrej_1Il_conflict ();
201  }
202  }
203 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
const STRING & unichar_lengths() const
Definition: ratngs.h:546
voidpf uLong offset
Definition: ioapi.h:42
int16_t inT16
Definition: host.h:36
Definition: strngs.h:45
const STRING & unichar_string() const
Definition: ratngs.h:539
REJMAP reject_map
Definition: pageres.h:271

◆ reject_mostly_rejects()

void tesseract::Tesseract::reject_mostly_rejects ( WERD_RES * word)

Definition at line 573 of file reject.cpp.

573  {
574  /* Reject the whole of the word if the fraction of rejects exceeds a limit */
575 
576  if ((float) word->reject_map.reject_count() / word->reject_map.length() >=
579 }
double rej_whole_of_mostly_reject_word_fract
void rej_word_mostly_rej()
Definition: rejctmap.cpp:476
inT32 length() const
Definition: rejctmap.h:235
inT16 reject_count()
Definition: rejctmap.h:241
REJMAP reject_map
Definition: pageres.h:271

◆ rejection_passes()

void tesseract::Tesseract::rejection_passes ( PAGE_RES * page_res,
ETEXT_DESC * monitor,
const TBOX * target_word_box,
const char *  word_config 
)

Definition at line 598 of file control.cpp.

601  {
602  PAGE_RES_IT page_res_it(page_res);
603  // ****************** Pass 5 *******************
604  // Gather statistics on rejects.
605  int word_index = 0;
606  while (!tessedit_test_adaption && page_res_it.word() != NULL) {
608  WERD_RES* word = page_res_it.word();
609  word_index++;
610  if (monitor != NULL) {
611  monitor->ocr_alive = TRUE;
612  monitor->progress = 95 + 5 * word_index / stats_.word_count;
613  }
614  if (word->rebuild_word == NULL) {
615  // Word was not processed by tesseract.
616  page_res_it.forward();
617  continue;
618  }
619  check_debug_pt(word, 70);
620 
621  // changed by jetsoft
622  // specific to its needs to extract one word when need
623  if (target_word_box &&
625  *target_word_box, word_config, 4)) {
626  page_res_it.forward();
627  continue;
628  }
629  // end jetsoft
630 
631  page_res_it.rej_stat_word();
632  int chars_in_word = word->reject_map.length();
633  int rejects_in_word = word->reject_map.reject_count();
634 
635  int blob_quality = word_blob_quality(word, page_res_it.row()->row);
636  stats_.doc_blob_quality += blob_quality;
637  int outline_errs = word_outline_errs(word);
638  stats_.doc_outline_errs += outline_errs;
639  inT16 all_char_quality;
640  inT16 accepted_all_char_quality;
641  word_char_quality(word, page_res_it.row()->row,
642  &all_char_quality, &accepted_all_char_quality);
643  stats_.doc_char_quality += all_char_quality;
644  uint8_t permuter_type = word->best_choice->permuter();
645  if ((permuter_type == SYSTEM_DAWG_PERM) ||
646  (permuter_type == FREQ_DAWG_PERM) ||
647  (permuter_type == USER_DAWG_PERM)) {
648  stats_.good_char_count += chars_in_word - rejects_in_word;
649  stats_.doc_good_char_quality += accepted_all_char_quality;
650  }
651  check_debug_pt(word, 80);
653  (blob_quality == 0) && (outline_errs >= chars_in_word))
655  check_debug_pt(word, 90);
656  page_res_it.forward();
657  }
658 
660  tprintf
661  ("QUALITY: num_chs= %d num_rejs= %d %5.3f blob_qual= %d %5.3f"
662  " outline_errs= %d %5.3f char_qual= %d %5.3f good_ch_qual= %d %5.3f\n",
663  page_res->char_count, page_res->rej_count,
664  page_res->rej_count / static_cast<float>(page_res->char_count),
665  stats_.doc_blob_quality,
666  stats_.doc_blob_quality / static_cast<float>(page_res->char_count),
667  stats_.doc_outline_errs,
668  stats_.doc_outline_errs / static_cast<float>(page_res->char_count),
669  stats_.doc_char_quality,
670  stats_.doc_char_quality / static_cast<float>(page_res->char_count),
671  stats_.doc_good_char_quality,
672  (stats_.good_char_count > 0) ?
673  (stats_.doc_good_char_quality /
674  static_cast<float>(stats_.good_char_count)) : 0.0);
675  }
676  BOOL8 good_quality_doc =
677  ((page_res->rej_count / static_cast<float>(page_res->char_count)) <=
678  quality_rej_pc) &&
679  (stats_.doc_blob_quality / static_cast<float>(page_res->char_count) >=
680  quality_blob_pc) &&
681  (stats_.doc_outline_errs / static_cast<float>(page_res->char_count) <=
683  (stats_.doc_char_quality / static_cast<float>(page_res->char_count) >=
685 
686  // ****************** Pass 6 *******************
687  // Do whole document or whole block rejection pass
688  if (!tessedit_test_adaption) {
690  quality_based_rejection(page_res_it, good_quality_doc);
691  }
692 }
inT32 char_count
Definition: pageres.h:60
#define TRUE
Definition: capi.h:45
volatile inT8 ocr_alive
true if not last
Definition: ocrclass.h:123
void set_global_loc_code(int loc_code)
Definition: globaloc.cpp:79
WERD_CHOICE * best_choice
Definition: pageres.h:219
inT32 rej_count
Definition: pageres.h:61
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1794
TWERD * rebuild_word
Definition: pageres.h:244
#define tprintf(...)
Definition: tprintf.h:31
void quality_based_rejection(PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc)
Definition: docqual.cpp:143
#define LOC_MM_ADAPT
Definition: errcode.h:52
inT16 word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:65
TBOX bounding_box() const
Definition: werd.cpp:160
int16_t inT16
Definition: host.h:36
inT32 length() const
Definition: rejctmap.h:235
uinT8 permuter() const
Definition: ratngs.h:344
inT16 reject_count()
Definition: rejctmap.h:241
unsigned char BOOL8
Definition: host.h:44
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:97
void rej_word_bad_quality()
Definition: rejctmap.cpp:485
#define LOC_DOC_BLK_REJ
Definition: errcode.h:53
inT16 word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:77
WERD * word
Definition: pageres.h:175
inT16 progress
percent complete, increasing (0-100)
Definition: ocrclass.h:118
REJMAP reject_map
Definition: pageres.h:271
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:121

◆ repeated_nonalphanum_wd()

BOOL8 tesseract::Tesseract::repeated_nonalphanum_wd ( WERD_RES * word,
ROW * row
)

Definition at line 582 of file reject.cpp.

582  {
583  inT16 char_quality;
584  inT16 accepted_char_quality;
585 
586  if (word->best_choice->unichar_lengths().length() <= 1)
587  return FALSE;
588 
590  contains(word->best_choice->unichar_string()[0]))
591  return FALSE;
592 
593  UNICHAR_ID uch_id = word->best_choice->unichar_id(0);
594  for (int i = 1; i < word->best_choice->length(); ++i) {
595  if (word->best_choice->unichar_id(i) != uch_id) return FALSE;
596  }
597 
598  word_char_quality(word, row, &char_quality, &accepted_char_quality);
599 
600  if ((word->best_choice->unichar_lengths().length () == char_quality) &&
601  (char_quality == accepted_char_quality))
602  return TRUE;
603  else
604  return FALSE;
605 }
#define TRUE
Definition: capi.h:45
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
int UNICHAR_ID
Definition: unichar.h:33
WERD_CHOICE * best_choice
Definition: pageres.h:219
int length() const
Definition: ratngs.h:301
const STRING & unichar_lengths() const
Definition: ratngs.h:546
char * ok_repeated_ch_non_alphanum_wds
inT32 length() const
Definition: strngs.cpp:193
int16_t inT16
Definition: host.h:36
Definition: strngs.h:45
#define FALSE
Definition: capi.h:46
void word_char_quality(WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count)
Definition: docqual.cpp:97
const STRING & unichar_string() const
Definition: ratngs.h:539

◆ ReportFailedBox()

void tesseract::Tesseract::ReportFailedBox ( int  boxfile_lineno,
TBOX  box,
const char *  box_ch,
const char *  err_msg 
)

Logs a bad box by line in the box file and box coords.

Definition at line 764 of file applybox.cpp.

765  {
766  tprintf("APPLY_BOXES: boxfile line %d/%s ((%d,%d),(%d,%d)): %s\n",
767  boxfile_lineno + 1, box_ch,
768  box.left(), box.bottom(), box.right(), box.top(), err_msg);
769 }
#define tprintf(...)
Definition: tprintf.h:31
inT16 left() const
Definition: rect.h:68
inT16 top() const
Definition: rect.h:54
inT16 right() const
Definition: rect.h:75
inT16 bottom() const
Definition: rect.h:61

◆ ReportXhtFixResult()

void tesseract::Tesseract::ReportXhtFixResult ( bool  accept_new_word,
float  new_x_ht,
WERD_RES * word,
WERD_RES * new_word
)

Definition at line 1413 of file control.cpp.

1414  {
1415  tprintf("New XHT Match:%s = %s ",
1416  word->best_choice->unichar_string().string(),
1417  word->best_choice->debug_string().string());
1418  word->reject_map.print(debug_fp);
1419  tprintf(" -> %s = %s ",
1420  new_word->best_choice->unichar_string().string(),
1421  new_word->best_choice->debug_string().string());
1422  new_word->reject_map.print(debug_fp);
1423  tprintf(" %s->%s %s %s\n",
1424  word->guessed_x_ht ? "GUESS" : "CERT",
1425  new_word->guessed_x_ht ? "GUESS" : "CERT",
1426  new_x_ht > 0.1 ? "STILL DOUBT" : "OK",
1427  accept_new_word ? "ACCEPTED" : "");
1428 }
void print(FILE *fp)
Definition: rejctmap.cpp:391
WERD_CHOICE * best_choice
Definition: pageres.h:219
const STRING debug_string() const
Definition: ratngs.h:503
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
FILE * debug_fp
Definition: tessvars.cpp:24
BOOL8 guessed_x_ht
Definition: pageres.h:292
const STRING & unichar_string() const
Definition: ratngs.h:539
REJMAP reject_map
Definition: pageres.h:271

◆ ReSegmentByClassification()

void tesseract::Tesseract::ReSegmentByClassification ( PAGE_RES * page_res)

Resegments the words by running the classifier in an attempt to find the correct segmentation that produces the required string.

Definition at line 509 of file applybox.cpp.

509  {
510  PAGE_RES_IT pr_it(page_res);
511  WERD_RES* word_res;
512  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
513  WERD* word = word_res->word;
514  if (word->text() == NULL || word->text()[0] == '\0')
515  continue; // Ignore words that have no text.
516  // Convert the correct text to a vector of UNICHAR_ID
517  GenericVector<UNICHAR_ID> target_text;
518  if (!ConvertStringToUnichars(word->text(), &target_text)) {
519  tprintf("APPLY_BOX: FAILURE: can't find class_id for '%s'\n",
520  word->text());
521  pr_it.DeleteCurrentWord();
522  continue;
523  }
524  if (!FindSegmentation(target_text, word_res)) {
525  tprintf("APPLY_BOX: FAILURE: can't find segmentation for '%s'\n",
526  word->text());
527  pr_it.DeleteCurrentWord();
528  continue;
529  }
530  }
531 }
#define tprintf(...)
Definition: tprintf.h:31
const char * text() const
Definition: werd.h:125
WERD * word
Definition: pageres.h:175
Definition: werd.h:60
bool FindSegmentation(const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res)
Definition: applybox.cpp:559
bool ConvertStringToUnichars(const char *utf8, GenericVector< UNICHAR_ID > *class_ids)
Definition: applybox.cpp:535

◆ ResegmentCharBox()

bool tesseract::Tesseract::ResegmentCharBox ( PAGE_RES page_res,
const TBOX prev_box,
const TBOX box,
const TBOX next_box,
const char *  correct_text 
)

Gather consecutive blobs that match the given box into the best_state and corresponding correct_text.

Fights over which box owns which blobs are settled by pre-chopping and applying the blobs to box or next_box with the least non-overlap.

Returns
false if the box was in error, which can only be caused by failing to find an appropriate blob for a box.

This means that occasionally, blobs may be incorrectly segmented if the chopper fails to find a suitable chop point.

Definition at line 340 of file applybox.cpp.

342  {
343  if (applybox_debug > 1) {
344  tprintf("\nAPPLY_BOX: in ResegmentCharBox() for %s\n", correct_text);
345  }
346  PAGE_RES_IT page_res_it(page_res);
347  WERD_RES* word_res;
348  for (word_res = page_res_it.word(); word_res != NULL;
349  word_res = page_res_it.forward()) {
350  if (!word_res->box_word->bounding_box().major_overlap(box))
351  continue;
352  if (applybox_debug > 1) {
353  tprintf("Checking word box:");
354  word_res->box_word->bounding_box().print();
355  }
356  int word_len = word_res->box_word->length();
357  for (int i = 0; i < word_len; ++i) {
358  TBOX char_box = TBOX();
359  int blob_count = 0;
360  for (blob_count = 0; i + blob_count < word_len; ++blob_count) {
361  TBOX blob_box = word_res->box_word->BlobBox(i + blob_count);
362  if (!blob_box.major_overlap(box))
363  break;
364  if (word_res->correct_text[i + blob_count].length() > 0)
365  break; // Blob is claimed already.
366  double current_box_miss_metric = BoxMissMetric(blob_box, box);
367  double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
368  if (applybox_debug > 2) {
369  tprintf("Checking blob:");
370  blob_box.print();
371  tprintf("Current miss metric = %g, next = %g\n",
372  current_box_miss_metric, next_box_miss_metric);
373  }
374  if (current_box_miss_metric > next_box_miss_metric)
375  break; // Blob is a better match for next box.
376  char_box += blob_box;
377  }
378  if (blob_count > 0) {
379  if (applybox_debug > 1) {
380  tprintf("Index [%d, %d) seem good.\n", i, i + blob_count);
381  }
382  if (!char_box.almost_equal(box, 3) &&
383  (box.x_gap(next_box) < -3 ||
384  (prev_box != NULL && prev_box->x_gap(box) < -3))) {
385  return false;
386  }
387  // We refine just the box_word, best_state and correct_text here.
388  // The rebuild_word is made in TidyUp.
389  // blob_count blobs are put together to match the box. Merge the
390  // box_word boxes, save the blob_count in the state and the text.
391  word_res->box_word->MergeBoxes(i, i + blob_count);
392  word_res->best_state[i] = blob_count;
393  word_res->correct_text[i] = correct_text;
394  if (applybox_debug > 2) {
395  tprintf("%d Blobs match: blob box:", blob_count);
396  word_res->box_word->BlobBox(i).print();
397  tprintf("Matches box:");
398  box.print();
399  tprintf("With next box:");
400  next_box.print();
401  }
402  // Eliminated best_state and correct_text entries for the consumed
403  // blobs.
404  for (int j = 1; j < blob_count; ++j) {
405  word_res->best_state.remove(i + 1);
406  word_res->correct_text.remove(i + 1);
407  }
408  // Assume that no box spans multiple source words, so we are done with
409  // this box.
410  if (applybox_debug > 1) {
411  tprintf("Best state = ");
412  for (int j = 0; j < word_res->best_state.size(); ++j) {
413  tprintf("%d ", word_res->best_state[j]);
414  }
415  tprintf("\n");
416  tprintf("Correct text = [[ ");
417  for (int j = 0; j < word_res->correct_text.size(); ++j) {
418  tprintf("%s ", word_res->correct_text[j].string());
419  }
420  tprintf("]]\n");
421  }
422  return true;
423  }
424  }
425  }
426  if (applybox_debug > 0) {
427  tprintf("FAIL!\n");
428  }
429  return false; // Failure.
430 }
const TBOX & BlobBox(int index) const
Definition: boxword.h:86
GenericVector< int > best_state
Definition: pageres.h:255
void MergeBoxes(int start, int end)
Definition: boxword.cpp:134
void remove(int index)
GenericVector< STRING > correct_text
Definition: pageres.h:259
#define tprintf(...)
Definition: tprintf.h:31
const TBOX & bounding_box() const
Definition: boxword.h:82
int size() const
Definition: genericvector.h:72
tesseract::BoxWord * box_word
Definition: pageres.h:250
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
int length() const
Definition: genericvector.h:85
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:258
Definition: rect.h:30
WERD * word
Definition: pageres.h:175
void print() const
Definition: rect.h:270
int length() const
Definition: boxword.h:85
int x_gap(const TBOX &box) const
Definition: rect.h:217

◆ ResegmentWordBox()

bool tesseract::Tesseract::ResegmentWordBox ( BLOCK_LIST *  block_list,
const TBOX & box,
const TBOX & next_box,
const char *  correct_text 
)

Consume all source blobs that strongly overlap the given box, putting them into a new word, with the correct_text label. Fights over which box owns which blobs are settled by applying the blobs to box or next_box with the least non-overlap.

Returns
false if the box was in error, which can only be caused by failing to find an overlapping blob for a box.

Definition at line 438 of file applybox.cpp.

440  {
441  if (applybox_debug > 1) {
442  tprintf("\nAPPLY_BOX: in ResegmentWordBox() for %s\n", correct_text);
443  }
444  WERD* new_word = NULL;
445  BLOCK_IT b_it(block_list);
446  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
447  BLOCK* block = b_it.data();
448  if (!box.major_overlap(block->bounding_box()))
449  continue;
450  ROW_IT r_it(block->row_list());
451  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward()) {
452  ROW* row = r_it.data();
453  if (!box.major_overlap(row->bounding_box()))
454  continue;
455  WERD_IT w_it(row->word_list());
456  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
457  WERD* word = w_it.data();
458  if (applybox_debug > 2) {
459  tprintf("Checking word:");
460  word->bounding_box().print();
461  }
462  if (word->text() != NULL && word->text()[0] != '\0')
463  continue; // Ignore words that are already done.
464  if (!box.major_overlap(word->bounding_box()))
465  continue;
466  C_BLOB_IT blob_it(word->cblob_list());
467  for (blob_it.mark_cycle_pt(); !blob_it.cycled_list();
468  blob_it.forward()) {
469  C_BLOB* blob = blob_it.data();
470  TBOX blob_box = blob->bounding_box();
471  if (!blob_box.major_overlap(box))
472  continue;
473  double current_box_miss_metric = BoxMissMetric(blob_box, box);
474  double next_box_miss_metric = BoxMissMetric(blob_box, next_box);
475  if (applybox_debug > 2) {
476  tprintf("Checking blob:");
477  blob_box.print();
478  tprintf("Current miss metric = %g, next = %g\n",
479  current_box_miss_metric, next_box_miss_metric);
480  }
481  if (current_box_miss_metric > next_box_miss_metric)
482  continue; // Blob is a better match for next box.
483  if (applybox_debug > 2) {
484  tprintf("Blob match: blob:");
485  blob_box.print();
486  tprintf("Matches box:");
487  box.print();
488  tprintf("With next box:");
489  next_box.print();
490  }
491  if (new_word == NULL) {
492  // Make a new word with a single blob.
493  new_word = word->shallow_copy();
494  new_word->set_text(correct_text);
495  w_it.add_to_end(new_word);
496  }
497  C_BLOB_IT new_blob_it(new_word->cblob_list());
498  new_blob_it.add_to_end(blob_it.extract());
499  }
500  }
501  }
502  }
503  if (new_word == NULL && applybox_debug > 0) tprintf("FAIL!\n");
504  return new_word != NULL;
505 }
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59
#define tprintf(...)
Definition: tprintf.h:31
void set_text(const char *new_text)
Definition: werd.h:126
WERD_LIST * word_list()
Definition: ocrrow.h:52
TBOX bounding_box() const
Definition: werd.cpp:160
const char * text() const
Definition: werd.h:125
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
TBOX bounding_box() const
Definition: ocrrow.h:85
WERD * shallow_copy()
Definition: werd.cpp:352
Definition: rect.h:30
TBOX bounding_box() const
Definition: stepblob.cpp:250
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
void print() const
Definition: rect.h:270
Definition: werd.h:60
Definition: ocrrow.h:32
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:120
Definition: ocrblock.h:30

◆ ResetAdaptiveClassifier()

void tesseract::Tesseract::ResetAdaptiveClassifier ( )

Definition at line 658 of file tesseractclass.cpp.

658  {
660  for (int i = 0; i < sub_langs_.size(); ++i) {
661  sub_langs_[i]->ResetAdaptiveClassifierInternal();
662  }
663 }
void ResetAdaptiveClassifierInternal()
Definition: adaptmatch.cpp:599

◆ ResetDocumentDictionary()

void tesseract::Tesseract::ResetDocumentDictionary ( )

Definition at line 666 of file tesseractclass.cpp.

666  {
668  for (int i = 0; i < sub_langs_.size(); ++i) {
669  sub_langs_[i]->getDict().ResetDocumentDictionary();
670  }
671 }
Dict & getDict()
Definition: classify.h:65
void ResetDocumentDictionary()
Definition: dict.h:310

◆ reskew()

const FCOORD& tesseract::Tesseract::reskew ( ) const
inline

Definition at line 181 of file tesseractclass.h.

181  {
182  return reskew_;
183  }

◆ RetryWithLanguage()

int tesseract::Tesseract::RetryWithLanguage ( const WordData word_data,
WordRecognizer  recognizer,
bool  debug,
WERD_RES **  in_word,
PointerVector< WERD_RES > *  best_words 
)

Definition at line 888 of file control.cpp.

891  {
892  if (debug) {
893  tprintf("Trying word using lang %s, oem %d\n",
894  lang.string(), static_cast<int>(tessedit_ocr_engine_mode));
895  }
896  // Run the recognizer on the word.
897  PointerVector<WERD_RES> new_words;
898  (this->*recognizer)(word_data, in_word, &new_words);
899  if (new_words.empty()) {
900  // Transfer input word to new_words, as the classifier must have put
901  // the result back in the input.
902  new_words.push_back(*in_word);
903  *in_word = NULL;
904  }
905  if (debug) {
906  for (int i = 0; i < new_words.size(); ++i)
907  new_words[i]->DebugTopChoice("Lang result");
908  }
909  // Initial version is a bit of a hack based on better certainty and rating
910  // or a dictionary vs non-dictionary word.
911  return SelectBestWords(classify_max_rating_ratio,
913  debug, &new_words, best_words);
914 }
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
STRING lang
Definition: ccutil.h:66
double classify_max_rating_ratio
Definition: classify.h:401
double classify_max_certainty_margin
Definition: classify.h:403

◆ right_to_left()

bool tesseract::Tesseract::right_to_left ( ) const
inline

Definition at line 250 of file tesseractclass.h.

250  {
251  return right_to_left_;
252  }

◆ RunOldFixXht()

bool tesseract::Tesseract::RunOldFixXht ( WERD_RES word,
BLOCK block,
ROW row 
)

◆ safe_dict_word()

inT16 tesseract::Tesseract::safe_dict_word ( const WERD_RES * werd_res)

Definition at line 607 of file reject.cpp.

607  {
608  const WERD_CHOICE &word = *werd_res->best_choice;
609  int dict_word_type = werd_res->tesseract->dict_word(word);
610  return dict_word_type == DOC_DAWG_PERM ? 0 : dict_word_type;
611 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
tesseract::Tesseract * tesseract
Definition: pageres.h:266
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:128

◆ scaled_color()

Pix* tesseract::Tesseract::scaled_color ( ) const
inline

Definition at line 233 of file tesseractclass.h.

233  {
234  return scaled_color_;
235  }

◆ scaled_factor()

int tesseract::Tesseract::scaled_factor ( ) const
inline

Definition at line 236 of file tesseractclass.h.

236  {
237  return scaled_factor_;
238  }

◆ script_pos_pass()

void tesseract::Tesseract::script_pos_pass ( PAGE_RES * page_res)

Definition at line 718 of file control.cpp.

718  {
719  PAGE_RES_IT page_res_it(page_res);
720  for (page_res_it.restart_page(); page_res_it.word() != NULL;
721  page_res_it.forward()) {
722  WERD_RES* word = page_res_it.word();
723  if (word->word->flag(W_REP_CHAR)) {
724  page_res_it.forward();
725  continue;
726  }
727  float x_height = page_res_it.block()->block->x_height();
728  float word_x_height = word->x_height;
729  if (word_x_height < word->best_choice->min_x_height() ||
730  word_x_height > word->best_choice->max_x_height()) {
731  word_x_height = (word->best_choice->min_x_height() +
732  word->best_choice->max_x_height()) / 2.0f;
733  }
734  // Test for small caps. Word capheight must be close to block xheight,
735  // and word must contain no lower case letters, and at least one upper case.
736  double small_cap_xheight = x_height * kXHeightCapRatio;
737  double small_cap_delta = (x_height - small_cap_xheight) / 2.0;
738  if (word->uch_set->script_has_xheight() &&
739  small_cap_xheight - small_cap_delta <= word_x_height &&
740  word_x_height <= small_cap_xheight + small_cap_delta) {
741  // Scan for upper/lower.
742  int num_upper = 0;
743  int num_lower = 0;
744  for (int i = 0; i < word->best_choice->length(); ++i) {
745  if (word->uch_set->get_isupper(word->best_choice->unichar_id(i)))
746  ++num_upper;
747  else if (word->uch_set->get_islower(word->best_choice->unichar_id(i)))
748  ++num_lower;
749  }
750  if (num_upper > 0 && num_lower == 0)
751  word->small_caps = true;
752  }
753  word->SetScriptPositions();
754  }
755 }
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
float max_x_height() const
Definition: ratngs.h:337
float min_x_height() const
Definition: ratngs.h:334
static const double kXHeightCapRatio
Definition: ccstruct.h:37
WERD_CHOICE * best_choice
Definition: pageres.h:219
int length() const
Definition: ratngs.h:301
void SetScriptPositions()
Definition: pageres.cpp:853
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
bool small_caps
Definition: pageres.h:283
WERD * word
Definition: pageres.h:175
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:465
const UNICHARSET * uch_set
Definition: pageres.h:192
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:458
float x_height
Definition: pageres.h:295
bool script_has_xheight() const
Definition: unicharset.h:863

◆ SearchForText()

void tesseract::Tesseract::SearchForText ( const GenericVector< BLOB_CHOICE_LIST *> *  choices,
int  choices_pos,
int  choices_length,
const GenericVector< UNICHAR_ID > &  target_text,
int  text_index,
float  rating,
GenericVector< int > *  segmentation,
float *  best_rating,
GenericVector< int > *  best_segmentation 
)

Recursive helper to find a match to the target_text (from text_index position) in the choices (from choices_pos position).

Parameters
choices: is an array of GenericVectors, of length choices_length, with each element representing a starting position in the word, and the GenericVector holding classification results for a sequence of consecutive blobs, with index 0 being a single blob, index 1 being 2 blobs etc.
choices_pos
choices_length
target_text
text_index
rating
segmentation
best_rating
best_segmentation

Definition at line 629 of file applybox.cpp.

635  {
637  for (int length = 1; length <= choices[choices_pos].size(); ++length) {
638  // Rating of matching choice or worst choice if no match.
639  float choice_rating = 0.0f;
640  // Find the corresponding best BLOB_CHOICE.
641  BLOB_CHOICE_IT choice_it(choices[choices_pos][length - 1]);
642  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
643  choice_it.forward()) {
644  BLOB_CHOICE* choice = choice_it.data();
645  choice_rating = choice->rating();
646  UNICHAR_ID class_id = choice->unichar_id();
647  if (class_id == target_text[text_index]) {
648  break;
649  }
650  // Search ambigs table.
651  if (class_id < table.size() && table[class_id] != NULL) {
652  AmbigSpec_IT spec_it(table[class_id]);
653  for (spec_it.mark_cycle_pt(); !spec_it.cycled_list();
654  spec_it.forward()) {
655  const AmbigSpec *ambig_spec = spec_it.data();
656  // We'll only do 1-1.
657  if (ambig_spec->wrong_ngram[1] == INVALID_UNICHAR_ID &&
658  ambig_spec->correct_ngram_id == target_text[text_index])
659  break;
660  }
661  if (!spec_it.cycled_list())
662  break; // Found an ambig.
663  }
664  }
665  if (choice_it.cycled_list())
666  continue; // No match.
667  segmentation->push_back(length);
668  if (choices_pos + length == choices_length &&
669  text_index + 1 == target_text.size()) {
670  // This is a complete match. If the rating is good record a new best.
671  if (applybox_debug > 2) {
672  tprintf("Complete match, rating = %g, best=%g, seglength=%d, best=%d\n",
673  rating + choice_rating, *best_rating, segmentation->size(),
674  best_segmentation->size());
675  }
676  if (best_segmentation->empty() || rating + choice_rating < *best_rating) {
677  *best_segmentation = *segmentation;
678  *best_rating = rating + choice_rating;
679  }
680  } else if (choices_pos + length < choices_length &&
681  text_index + 1 < target_text.size()) {
682  if (applybox_debug > 3) {
683  tprintf("Match found for %d=%s:%s, at %d+%d, recursing...\n",
684  target_text[text_index],
685  unicharset.id_to_unichar(target_text[text_index]),
686  choice_it.data()->unichar_id() == target_text[text_index]
687  ? "Match" : "Ambig",
688  choices_pos, length);
689  }
690  SearchForText(choices, choices_pos + length, choices_length, target_text,
691  text_index + 1, rating + choice_rating, segmentation,
692  best_rating, best_segmentation);
693  if (applybox_debug > 3) {
694  tprintf("End recursion for %d=%s\n", target_text[text_index],
695  unicharset.id_to_unichar(target_text[text_index]));
696  }
697  }
698  segmentation->truncate(segmentation->size() - 1);
699  }
700 }
int UNICHAR_ID
Definition: unichar.h:33
void SearchForText(const GenericVector< BLOB_CHOICE_LIST *> *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation)
Definition: applybox.cpp:629
Dict & getDict()
Definition: classify.h:65
int push_back(T object)
float rating() const
Definition: ratngs.h:79
#define tprintf(...)
Definition: tprintf.h:31
bool empty() const
Definition: genericvector.h:90
void truncate(int size)
int size() const
Definition: genericvector.h:72
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:266
const UnicharAmbigsVector & dang_ambigs() const
Definition: ambigs.h:154
UNICHARSET unicharset
Definition: ccutil.h:68
const UnicharAmbigs & getUnicharAmbigs() const
Definition: dict.h:103
GenericVector< AmbigSpec_LIST * > UnicharAmbigsVector
Definition: ambigs.h:143
UNICHAR_ID unichar_id() const
Definition: ratngs.h:76

◆ SearchWords()

void tesseract::Tesseract::SearchWords ( PointerVector< WERD_RES > *  words)

Definition at line 253 of file linerec.cpp.

253  {
254  // Run the segmentation search on the network outputs and make a BoxWord
255  // for each of the output words.
256  // If we drop a word as junk, then there is always a space in front of the
257  // next.
258  const Dict* stopper_dict = lstm_recognizer_->GetDict();
259  if (stopper_dict == nullptr) stopper_dict = &getDict();
260  bool any_nonspace_delimited = false;
261  for (int w = 0; w < words->size(); ++w) {
262  WERD_RES* word = (*words)[w];
263  if (word->best_choice != nullptr &&
265  any_nonspace_delimited = true;
266  break;
267  }
268  }
269  for (int w = 0; w < words->size(); ++w) {
270  WERD_RES* word = (*words)[w];
271  if (word->best_choice == NULL) {
272  // If we are using the beam search, the unicharset had better match!
274  WordSearch(word);
275  } else if (word->best_choice->unicharset() == &unicharset &&
276  !lstm_recognizer_->IsRecoding()) {
277  // We set up the word without using the dictionary, so set the permuter
278  // now, but we can only do it because the unicharsets match.
279  word->best_choice->set_permuter(
280  getDict().valid_word(*word->best_choice, true));
281  }
282  if (word->best_choice == NULL) {
283  // It is a dud.
284  word->SetupFake(lstm_recognizer_->GetUnicharset());
285  } else {
286  // Set the best state.
287  for (int i = 0; i < word->best_choice->length(); ++i) {
288  int length = word->best_choice->state(i);
289  word->best_state.push_back(length);
290  }
291  word->reject_map.initialise(word->best_choice->length());
292  word->tess_failed = false;
293  word->tess_accepted = true;
294  word->tess_would_adapt = false;
295  word->done = true;
296  word->tesseract = this;
297  float word_certainty = MIN(word->space_certainty,
298  word->best_choice->certainty());
299  word_certainty *= kCertaintyScale;
300  // Arbitrary ding factor for non-dictionary words.
301  if (!lstm_recognizer_->IsRecoding() &&
303  word_certainty -= kNonDictionaryPenalty;
304  if (getDict().stopper_debug_level >= 1) {
305  tprintf("Best choice certainty=%g, space=%g, scaled=%g, final=%g\n",
306  word->best_choice->certainty(), word->space_certainty,
307  MIN(word->space_certainty, word->best_choice->certainty()) *
309  word_certainty);
310  word->best_choice->print();
311  }
312  word->best_choice->set_certainty(word_certainty);
313  // Discard words that are impossibly bad, but allow a bit more for
314  // dictionary words, and keep bad words in non-space-delimited langs.
315  if (word_certainty >= RecodeBeamSearch::kMinCertainty ||
316  any_nonspace_delimited ||
317  (word_certainty >= kWorstDictCertainty &&
319  word->tess_accepted = stopper_dict->AcceptableResult(word);
320  } else {
321  if (getDict().stopper_debug_level >= 1) {
322  tprintf("Deleting word with certainty %g\n", word_certainty);
323  word->best_choice->print();
324  }
325  // It is a dud.
326  word->SetupFake(lstm_recognizer_->GetUnicharset());
327  }
328  }
329  }
330 }
void set_certainty(float new_val)
Definition: ratngs.h:370
bool ContainsAnyNonSpaceDelimited() const
Definition: ratngs.h:512
const float kNonDictionaryPenalty
Definition: linerec.cpp:36
void print() const
Definition: ratngs.h:578
BOOL8 tess_failed
Definition: pageres.h:272
GenericVector< int > best_state
Definition: pageres.h:255
static const float kMinCertainty
Definition: recodebeam.h:213
WERD_CHOICE * best_choice
Definition: pageres.h:219
Dict & getDict()
Definition: classify.h:65
int length() const
Definition: ratngs.h:301
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:455
int push_back(T object)
const UNICHARSET & GetUnicharset() const
#define tprintf(...)
Definition: tprintf.h:31
const float kWorstDictCertainty
Definition: linerec.cpp:40
int size() const
Definition: genericvector.h:72
tesseract::Tesseract * tesseract
Definition: pageres.h:266
uinT8 permuter() const
Definition: ratngs.h:344
void SetupWordScript(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:376
int stopper_debug_level
Definition: dict.h:622
BOOL8 tess_would_adapt
Definition: pageres.h:281
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:344
BOOL8 tess_accepted
Definition: pageres.h:280
const float kCertaintyScale
Definition: linerec.cpp:38
const Dict * GetDict() const
UNICHARSET unicharset
Definition: ccutil.h:68
float certainty() const
Definition: ratngs.h:328
const UNICHARSET * unicharset() const
Definition: ratngs.h:298
int state(int index) const
Definition: ratngs.h:317
#define MIN(x, y)
Definition: ndminx.h:28
float space_certainty
Definition: pageres.h:300
void initialise(inT16 length)
Definition: rejctmap.cpp:318
REJMAP reject_map
Definition: pageres.h:271
void set_permuter(uinT8 perm)
Definition: ratngs.h:373
void WordSearch(WERD_RES *word_res)
Definition: segsearch.cpp:130
BOOL8 done
Definition: pageres.h:282

◆ SegmentPage()

int tesseract::Tesseract::SegmentPage ( const STRING * input_file,
BLOCK_LIST *  blocks,
Tesseract * osd_tess,
OSResults * osr 
)

Segment the page according to the current value of tessedit_pageseg_mode. pix_binary_ is used as the source image and should not be NULL. On return the blocks list owns all the constructed page layout.

Definition at line 103 of file pagesegmain.cpp.

104  {
105  ASSERT_HOST(pix_binary_ != NULL);
106  int width = pixGetWidth(pix_binary_);
107  int height = pixGetHeight(pix_binary_);
108  // Get page segmentation mode.
109  PageSegMode pageseg_mode = static_cast<PageSegMode>(
110  static_cast<int>(tessedit_pageseg_mode));
111  // If a UNLV zone file can be found, use that instead of segmentation.
112  if (!PSM_COL_FIND_ENABLED(pageseg_mode) &&
113  input_file != NULL && input_file->length() > 0) {
114  STRING name = *input_file;
115  const char* lastdot = strrchr(name.string(), '.');
116  if (lastdot != NULL)
117  name[lastdot - name.string()] = '\0';
118  read_unlv_file(name, width, height, blocks);
119  }
120  if (blocks->empty()) {
121  // No UNLV file present. Work according to the PageSegMode.
122  // First make a single block covering the whole image.
123  BLOCK_IT block_it(blocks);
124  BLOCK* block = new BLOCK("", TRUE, 0, 0, 0, 0, width, height);
126  block_it.add_to_end(block);
127  } else {
128  // UNLV file present. Use PSM_SINGLE_BLOCK.
129  pageseg_mode = PSM_SINGLE_BLOCK;
130  }
131  // The diacritic_blobs holds noise blobs that may be diacritics. They
132  // are separated out on areas of the image that seem noisy and short-circuit
133  // the layout process, going straight from the initial partition creation
134  // right through to after word segmentation, where they are added to the
135  // rej_cblobs list of the most appropriate word. From there classification
136  // will determine whether they are used.
137  BLOBNBOX_LIST diacritic_blobs;
138  int auto_page_seg_ret_val = 0;
139  TO_BLOCK_LIST to_blocks;
140  if (PSM_OSD_ENABLED(pageseg_mode) || PSM_BLOCK_FIND_ENABLED(pageseg_mode) ||
141  PSM_SPARSE(pageseg_mode)) {
142  auto_page_seg_ret_val = AutoPageSeg(
143  pageseg_mode, blocks, &to_blocks,
144  enable_noise_removal ? &diacritic_blobs : NULL, osd_tess, osr);
145  if (pageseg_mode == PSM_OSD_ONLY)
146  return auto_page_seg_ret_val;
147  // To create blobs from the image region bounds uncomment this line:
148  // to_blocks.clear(); // Uncomment to go back to the old mode.
149  } else {
150  deskew_ = FCOORD(1.0f, 0.0f);
151  reskew_ = FCOORD(1.0f, 0.0f);
152  if (pageseg_mode == PSM_CIRCLE_WORD) {
153  Pix* pixcleaned = RemoveEnclosingCircle(pix_binary_);
154  if (pixcleaned != NULL) {
155  pixDestroy(&pix_binary_);
156  pix_binary_ = pixcleaned;
157  }
158  }
159  }
160 
161  if (auto_page_seg_ret_val < 0) {
162  return -1;
163  }
164 
165  if (blocks->empty()) {
167  tprintf("Empty page\n");
168  return 0; // AutoPageSeg found an empty page.
169  }
170  bool splitting =
172  bool cjk_mode = textord_use_cjk_fp_model;
173 
174  textord_.TextordPage(pageseg_mode, reskew_, width, height, pix_binary_,
175  pix_thresholds_, pix_grey_, splitting || cjk_mode,
176  &diacritic_blobs, blocks, &to_blocks);
177  return auto_page_seg_ret_val;
178 }
bool PSM_SPARSE(int pageseg_mode)
Definition: publictypes.h:188
bool read_unlv_file(STRING name, inT32 xsize, inT32 ysize, BLOCK_LIST *blocks)
Definition: blread.cpp:36
#define TRUE
Definition: capi.h:45
Definition: points.h:189
void set_right_to_left(bool value)
Definition: ocrblock.h:86
bool PSM_BLOCK_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:191
void TextordPage(PageSegMode pageseg_mode, const FCOORD &reskew, int width, int height, Pix *binary_pix, Pix *thresholds_pix, Pix *grey_pix, bool use_box_bottoms, BLOBNBOX_LIST *diacritic_blobs, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: textord.cpp:233
bool PSM_OSD_ENABLED(int pageseg_mode)
Definition: publictypes.h:179
int AutoPageSeg(PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr)
#define tprintf(...)
Definition: tprintf.h:31
Assume a single uniform block of text. (Default.)
Definition: publictypes.h:160
const char * string() const
Definition: strngs.cpp:198
inT32 length() const
Definition: strngs.cpp:193
#define ASSERT_HOST(x)
Definition: errcode.h:84
int textord_debug_tabfind
Definition: alignedblob.cpp:27
Definition: strngs.h:45
Treat the image as a single word in a circle.
Definition: publictypes.h:163
bool PSM_COL_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:185
bool right_to_left() const
Orientation and script detection only.
Definition: publictypes.h:152
Definition: ocrblock.h:30

◆ SelectGoodDiacriticOutlines()

bool tesseract::Tesseract::SelectGoodDiacriticOutlines ( int  pass,
float  certainty_threshold,
PAGE_RES_IT * pr_it,
C_BLOB * blob,
const GenericVector< C_OUTLINE *> &  outlines,
int  num_outlines,
GenericVector< bool > *  ok_outlines 
)

Definition at line 1122 of file control.cpp.

1125  {
1126  STRING best_str;
1127  float target_cert = certainty_threshold;
1128  if (blob != NULL) {
1129  float target_c2;
1130  target_cert = ClassifyBlobAsWord(pass, pr_it, blob, &best_str, &target_c2);
1131  if (debug_noise_removal) {
1132  tprintf("No Noise blob classified as %s=%g(%g) at:", best_str.string(),
1133  target_cert, target_c2);
1134  blob->bounding_box().print();
1135  }
1136  target_cert -= (target_cert - certainty_threshold) * noise_cert_factor;
1137  }
1138  GenericVector<bool> test_outlines = *ok_outlines;
1139  // Start with all the outlines in.
1140  STRING all_str;
1141  GenericVector<bool> best_outlines = *ok_outlines;
1142  float best_cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1143  pr_it, blob, &all_str);
1144  if (debug_noise_removal) {
1145  TBOX ol_box;
1146  for (int i = 0; i < test_outlines.size(); ++i) {
1147  if (test_outlines[i]) ol_box += outlines[i]->bounding_box();
1148  }
1149  tprintf("All Noise blob classified as %s=%g, delta=%g at:",
1150  all_str.string(), best_cert, best_cert - target_cert);
1151  ol_box.print();
1152  }
1153  // Iteratively zero out the bit that improves the certainty the most, until
1154  // we get past the threshold, have zero bits, or fail to improve.
1155  int best_index = 0; // To zero out.
1156  while (num_outlines > 1 && best_index >= 0 &&
1157  (blob == NULL || best_cert < target_cert || blob != NULL)) {
1158  // Find the best bit to zero out.
1159  best_index = -1;
1160  for (int i = 0; i < outlines.size(); ++i) {
1161  if (test_outlines[i]) {
1162  test_outlines[i] = false;
1163  STRING str;
1164  float cert = ClassifyBlobPlusOutlines(test_outlines, outlines, pass,
1165  pr_it, blob, &str);
1166  if (debug_noise_removal) {
1167  TBOX ol_box;
1168  for (int j = 0; j < outlines.size(); ++j) {
1169  if (test_outlines[j]) ol_box += outlines[j]->bounding_box();
1170  tprintf("%d", test_outlines[j]);
1171  }
1172  tprintf(" blob classified as %s=%g, delta=%g) at:", str.string(),
1173  cert, cert - target_cert);
1174  ol_box.print();
1175  }
1176  if (cert > best_cert) {
1177  best_cert = cert;
1178  best_index = i;
1179  best_outlines = test_outlines;
1180  }
1181  test_outlines[i] = true;
1182  }
1183  }
1184  if (best_index >= 0) {
1185  test_outlines[best_index] = false;
1186  --num_outlines;
1187  }
1188  }
1189  if (best_cert >= target_cert) {
1190  // Save the best combination.
1191  *ok_outlines = best_outlines;
1192  if (debug_noise_removal) {
1193  tprintf("%s noise combination ", blob ? "Adding" : "New");
1194  for (int i = 0; i < best_outlines.size(); ++i) {
1195  tprintf("%d", best_outlines[i]);
1196  }
1197  tprintf(" yields certainty %g, beating target of %g\n", best_cert,
1198  target_cert);
1199  }
1200  return true;
1201  }
1202  return false;
1203 }
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
int size() const
Definition: genericvector.h:72
Definition: strngs.h:45
float ClassifyBlobPlusOutlines(const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE *> &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str)
Definition: control.cpp:1207
float ClassifyBlobAsWord(int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2)
Definition: control.cpp:1249
Definition: rect.h:30
TBOX bounding_box() const
Definition: stepblob.cpp:250
void print() const
Definition: rect.h:270

◆ set_done()

void tesseract::Tesseract::set_done ( WERD_RES word,
inT16  pass 
)

◆ set_pix_grey()

void tesseract::Tesseract::set_pix_grey ( Pix *  grey_pix)
inline

Definition at line 195 of file tesseractclass.h.

195  {
196  pixDestroy(&pix_grey_);
197  pix_grey_ = grey_pix;
198  }

◆ set_pix_original()

void tesseract::Tesseract::set_pix_original ( Pix *  original_pix)
inline

Definition at line 201 of file tesseractclass.h.

201  {
202  pixDestroy(&pix_original_);
203  pix_original_ = original_pix;
204  // Clone to sublangs as well.
205  for (int i = 0; i < sub_langs_.size(); ++i)
206  sub_langs_[i]->set_pix_original(original_pix ? pixClone(original_pix)
207  : nullptr);
208  }
void set_pix_original(Pix *original_pix)

◆ set_pix_thresholds()

void tesseract::Tesseract::set_pix_thresholds ( Pix *  thresholds)
inline

Definition at line 217 of file tesseractclass.h.

217  {
218  pixDestroy(&pix_thresholds_);
219  pix_thresholds_ = thresholds;
220  }

◆ set_source_resolution()

void tesseract::Tesseract::set_source_resolution ( int  ppi)
inline

Definition at line 224 of file tesseractclass.h.

224  {
225  source_resolution_ = ppi;
226  }

◆ set_unlv_suspects()

void tesseract::Tesseract::set_unlv_suspects ( WERD_RES word)

Definition at line 305 of file output.cpp.

305  {
306  int len = word_res->reject_map.length();
307  const WERD_CHOICE &word = *(word_res->best_choice);
308  const UNICHARSET &uchset = *word.unicharset();
309  int i;
310  float rating_per_ch;
311 
312  if (suspect_level == 0) {
313  for (i = 0; i < len; i++) {
314  if (word_res->reject_map[i].rejected())
315  word_res->reject_map[i].setrej_minimal_rej_accept();
316  }
317  return;
318  }
319 
320  if (suspect_level >= 3)
321  return; //Use defaults
322 
323  /* NOW FOR LEVELS 1 and 2 Find some stuff to unreject*/
324 
325  if (safe_dict_word(word_res) &&
326  (count_alphas(word) > suspect_short_words)) {
327  /* Unreject alphas in dictionary words */
328  for (i = 0; i < len; ++i) {
329  if (word_res->reject_map[i].rejected() &&
330  uchset.get_isalpha(word.unichar_id(i)))
331  word_res->reject_map[i].setrej_minimal_rej_accept();
332  }
333  }
334 
335  rating_per_ch = word.rating() / word_res->reject_map.length();
336 
337  if (rating_per_ch >= suspect_rating_per_ch)
338  return; // Don't touch bad ratings
339 
340  if ((word_res->tess_accepted) || (rating_per_ch < suspect_accept_rating)) {
341  /* Unreject any Tess Acceptable word - but NOT tess reject chs*/
342  for (i = 0; i < len; ++i) {
343  if (word_res->reject_map[i].rejected() &&
344  (!uchset.eq(word.unichar_id(i), " ")))
345  word_res->reject_map[i].setrej_minimal_rej_accept();
346  }
347  }
348 
349  for (i = 0; i < len; i++) {
350  if (word_res->reject_map[i].rejected()) {
351  if (word_res->reject_map[i].flag(R_DOC_REJ))
352  word_res->reject_map[i].setrej_minimal_rej_accept();
353  if (word_res->reject_map[i].flag(R_BLOCK_REJ))
354  word_res->reject_map[i].setrej_minimal_rej_accept();
355  if (word_res->reject_map[i].flag(R_ROW_REJ))
356  word_res->reject_map[i].setrej_minimal_rej_accept();
357  }
358  }
359 
360  if (suspect_level == 2)
361  return;
362 
363  if (!suspect_constrain_1Il ||
364  (word_res->reject_map.length() <= suspect_short_words)) {
365  for (i = 0; i < len; i++) {
366  if (word_res->reject_map[i].rejected()) {
367  if ((word_res->reject_map[i].flag(R_1IL_CONFLICT) ||
368  word_res->reject_map[i].flag(R_POSTNN_1IL)))
369  word_res->reject_map[i].setrej_minimal_rej_accept();
370 
371  if (!suspect_constrain_1Il &&
372  word_res->reject_map[i].flag(R_MM_REJECT))
373  word_res->reject_map[i].setrej_minimal_rej_accept();
374  }
375  }
376  }
377 
378  if (acceptable_word_string(*word_res->uch_set,
379  word.unichar_string().string(),
380  word.unichar_lengths().string()) !=
381  AC_UNACCEPTABLE ||
382  acceptable_number_string(word.unichar_string().string(),
383  word.unichar_lengths().string())) {
384  if (word_res->reject_map.length() > suspect_short_words) {
385  for (i = 0; i < len; i++) {
386  if (word_res->reject_map[i].rejected() &&
387  (!word_res->reject_map[i].perm_rejected() ||
388  word_res->reject_map[i].flag (R_1IL_CONFLICT) ||
389  word_res->reject_map[i].flag (R_POSTNN_1IL) ||
390  word_res->reject_map[i].flag (R_MM_REJECT))) {
391  word_res->reject_map[i].setrej_minimal_rej_accept();
392  }
393  }
394  }
395  }
396 }
inT16 count_alphas(const WERD_CHOICE &word)
Definition: output.cpp:398
Unacceptable word.
Definition: control.h:36
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
const STRING & unichar_lengths() const
Definition: ratngs.h:546
const char * string() const
Definition: strngs.cpp:198
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:607
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:451
const UNICHARSET * unicharset() const
Definition: ratngs.h:298
BOOL8 acceptable_number_string(const char *s, const char *lengths)
Definition: output.cpp:419
const STRING & unichar_string() const
Definition: ratngs.h:539
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1690
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:656
float rating() const
Definition: ratngs.h:325

◆ set_word_fonts()

void tesseract::Tesseract::set_word_fonts ( WERD_RES word)

set_word_fonts

Get the fonts for the word.

Definition at line 1907 of file control.cpp.

1907  {
1908  // Don't try to set the word fonts for an lstm word, as the configs
1909  // will be meaningless.
1910  if (word->chopped_word == NULL) return;
1911  ASSERT_HOST(word->best_choice != NULL);
1912 
1913  int fontinfo_size = get_fontinfo_table().size();
1914  if (fontinfo_size == 0) return;
1915  GenericVector<int> font_total_score;
1916  font_total_score.init_to_size(fontinfo_size, 0);
1917 
1918  word->italic = 0;
1919  word->bold = 0;
1920  // Compute the font scores for the word
1921  if (tessedit_debug_fonts) {
1922  tprintf("Examining fonts in %s\n",
1923  word->best_choice->debug_string().string());
1924  }
1925  for (int b = 0; b < word->best_choice->length(); ++b) {
1926  BLOB_CHOICE* choice = word->GetBlobChoice(b);
1927  if (choice == NULL) continue;
1928  const GenericVector<ScoredFont>& fonts = choice->fonts();
1929  for (int f = 0; f < fonts.size(); ++f) {
1930  int fontinfo_id = fonts[f].fontinfo_id;
1931  if (0 <= fontinfo_id && fontinfo_id < fontinfo_size) {
1932  font_total_score[fontinfo_id] += fonts[f].score;
1933  }
1934  }
1935  }
1936  // Find the top and 2nd choice for the word.
1937  int score1 = 0, score2 = 0;
1938  inT16 font_id1 = -1, font_id2 = -1;
1939  for (int f = 0; f < fontinfo_size; ++f) {
1940  if (tessedit_debug_fonts && font_total_score[f] > 0) {
1941  tprintf("Font %s, total score = %d\n",
1942  fontinfo_table_.get(f).name, font_total_score[f]);
1943  }
1944  if (font_total_score[f] > score1) {
1945  score2 = score1;
1946  font_id2 = font_id1;
1947  score1 = font_total_score[f];
1948  font_id1 = f;
1949  } else if (font_total_score[f] > score2) {
1950  score2 = font_total_score[f];
1951  font_id2 = f;
1952  }
1953  }
1954  word->fontinfo = font_id1 >= 0 ? &fontinfo_table_.get(font_id1) : NULL;
1955  word->fontinfo2 = font_id2 >= 0 ? &fontinfo_table_.get(font_id2) : NULL;
1956  // Each score has a limit of MAX_UINT16, so divide by that to get the number
1957  // of "votes" for that font, ie number of perfect scores.
1958  word->fontinfo_id_count = ClipToRange(score1 / MAX_UINT16, 1, MAX_INT8);
1959  word->fontinfo_id2_count = ClipToRange(score2 / MAX_UINT16, 0, MAX_INT8);
1960  if (score1 > 0) {
1961  FontInfo fi = fontinfo_table_.get(font_id1);
1962  if (tessedit_debug_fonts) {
1963  if (word->fontinfo_id2_count > 0) {
1964  tprintf("Word modal font=%s, score=%d, 2nd choice %s/%d\n",
1965  fi.name, word->fontinfo_id_count,
1966  fontinfo_table_.get(font_id2).name,
1967  word->fontinfo_id2_count);
1968  } else {
1969  tprintf("Word modal font=%s, score=%d. No 2nd choice\n",
1970  fi.name, word->fontinfo_id_count);
1971  }
1972  }
1973  word->italic = (fi.is_italic() ? 1 : -1) * word->fontinfo_id_count;
1974  word->bold = (fi.is_bold() ? 1 : -1) * word->fontinfo_id_count;
1975  }
1976 }
const GenericVector< tesseract::ScoredFont > & fonts() const
Definition: ratngs.h:91
void init_to_size(int size, T t)
const FontInfo * fontinfo2
Definition: pageres.h:289
WERD_CHOICE * best_choice
Definition: pageres.h:219
int length() const
Definition: ratngs.h:301
inT8 bold
Definition: pageres.h:286
bool is_bold() const
Definition: fontinfo.h:112
const STRING debug_string() const
Definition: ratngs.h:503
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
inT8 italic
Definition: pageres.h:285
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:344
int size() const
Definition: genericvector.h:72
int16_t inT16
Definition: host.h:36
#define ASSERT_HOST(x)
Definition: errcode.h:84
const FontInfo * fontinfo
Definition: pageres.h:288
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:122
inT8 fontinfo_id2_count
Definition: pageres.h:291
inT8 fontinfo_id_count
Definition: pageres.h:290
BLOB_CHOICE * GetBlobChoice(int index) const
Definition: pageres.cpp:742
bool is_italic() const
Definition: fontinfo.h:111
#define MAX_INT8
Definition: host.h:60
TWERD * chopped_word
Definition: pageres.h:201
UnicityTable< FontInfo > fontinfo_table_
Definition: classify.h:487
#define MAX_UINT16
Definition: host.h:64

◆ SetBlackAndWhitelist()

void tesseract::Tesseract::SetBlackAndWhitelist ( )

Definition at line 673 of file tesseractclass.cpp.

673  {
674  // Set the white and blacklists (if any)
676  tessedit_char_whitelist.string(),
677  tessedit_char_unblacklist.string());
678  // Black and white lists should apply to all loaded classifiers.
679  for (int i = 0; i < sub_langs_.size(); ++i) {
680  sub_langs_[i]->unicharset.set_black_and_whitelist(
682  tessedit_char_unblacklist.string());
683  }
684 }
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
Definition: unicharset.cpp:950
UNICHARSET unicharset
Definition: ccutil.h:68

◆ SetEquationDetect()

void tesseract::Tesseract::SetEquationDetect ( EquationDetect detector)

Definition at line 652 of file tesseractclass.cpp.

652  {
653  equ_detect_ = detector;
654  equ_detect_->SetLangTesseract(this);
655 }
void SetLangTesseract(Tesseract *lang_tesseract)

◆ SetScaledColor()

void tesseract::Tesseract::SetScaledColor ( int  factor,
Pix *  color 
)
inline

Definition at line 239 of file tesseractclass.h.

239  {
240  scaled_factor_ = factor;
241  scaled_color_ = color;
242  }

◆ SetupAllWordsPassN()

void tesseract::Tesseract::SetupAllWordsPassN ( int  pass_n,
const TBOX target_word_box,
const char *  word_config,
PAGE_RES page_res,
GenericVector< WordData > *  words 
)

If tesseract is to be run, sets the words up ready for it.

Definition at line 151 of file control.cpp.

155  {
156  // Prepare all the words.
157  PAGE_RES_IT page_res_it(page_res);
158  for (page_res_it.restart_page(); page_res_it.word() != NULL;
159  page_res_it.forward()) {
160  if (target_word_box == NULL ||
161  ProcessTargetWord(page_res_it.word()->word->bounding_box(),
162  *target_word_box, word_config, 1)) {
163  words->push_back(WordData(page_res_it));
164  }
165  }
166  // Setup all the words for recognition with polygonal approximation.
167  for (int w = 0; w < words->size(); ++w) {
168  SetupWordPassN(pass_n, &(*words)[w]);
169  if (w > 0) (*words)[w].prev_word = &(*words)[w - 1];
170  }
171 }
int push_back(T object)
int size() const
Definition: genericvector.h:72
void SetupWordPassN(int pass_n, WordData *word)
Definition: control.cpp:174
bool ProcessTargetWord(const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass)
Definition: control.cpp:121

◆ SetupApplyBoxes()

PAGE_RES * tesseract::Tesseract::SetupApplyBoxes ( const GenericVector< TBOX > &  boxes,
BLOCK_LIST *  block_list 
)

Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: All fuzzy spaces are removed, and all the words are maximally chopped.

Definition at line 217 of file applybox.cpp.

218  {
219  PreenXHeights(block_list);
220  // Strip all fuzzy space markers to simplify the PAGE_RES.
221  BLOCK_IT b_it(block_list);
222  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
223  BLOCK* block = b_it.data();
224  ROW_IT r_it(block->row_list());
225  for (r_it.mark_cycle_pt(); !r_it.cycled_list(); r_it.forward ()) {
226  ROW* row = r_it.data();
227  WERD_IT w_it(row->word_list());
228  for (w_it.mark_cycle_pt(); !w_it.cycled_list(); w_it.forward()) {
229  WERD* word = w_it.data();
230  if (word->cblob_list()->empty()) {
231  delete w_it.extract();
232  } else {
233  word->set_flag(W_FUZZY_SP, false);
234  word->set_flag(W_FUZZY_NON, false);
235  }
236  }
237  }
238  }
239  PAGE_RES* page_res = new PAGE_RES(false, block_list, NULL);
240  PAGE_RES_IT pr_it(page_res);
241  WERD_RES* word_res;
242  while ((word_res = pr_it.word()) != NULL) {
243  MaximallyChopWord(boxes, pr_it.block()->block,
244  pr_it.row()->row, word_res);
245  pr_it.forward();
246  }
247  return page_res;
248 }
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
WERD_LIST * word_list()
Definition: ocrrow.h:52
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
WERD * word
Definition: pageres.h:175
void PreenXHeights(BLOCK_LIST *block_list)
Definition: applybox.cpp:193
void MaximallyChopWord(const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res)
Definition: applybox.cpp:253
Definition: werd.h:60
Definition: ocrrow.h:32
ROW_LIST * row_list()
get rows
Definition: ocrblock.h:120
Definition: ocrblock.h:30

◆ SetupPageSegAndDetectOrientation()

ColumnFinder * tesseract::Tesseract::SetupPageSegAndDetectOrientation ( PageSegMode  pageseg_mode,
BLOCK_LIST *  blocks,
Tesseract osd_tess,
OSResults osr,
TO_BLOCK_LIST *  to_blocks,
Pix **  photo_mask_pix,
Pix **  music_mask_pix 
)

Sets up auto page segmentation, determines the orientation, and corrects it. Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to facilitate testing. photo_mask_pix is a pointer to a NULL pointer that will be filled on return with the leptonica photo mask, which must be pixDestroyed by the caller. to_blocks is an empty list that will be filled with (usually a single) block that is used during layout analysis. This ugly API is required because of the possibility of a UNLV zone file. TODO(rays) clean this up. See AutoPageSeg for other arguments. The returned ColumnFinder must be deleted after use.

Definition at line 274 of file pagesegmain.cpp.

277  {
278  int vertical_x = 0;
279  int vertical_y = 1;
280  TabVector_LIST v_lines;
281  TabVector_LIST h_lines;
282  ICOORD bleft(0, 0);
283 
284  ASSERT_HOST(pix_binary_ != NULL);
286  pixa_debug_.AddPix(pix_binary_, "PageSegInput");
287  }
288  // Leptonica is used to find the rule/separator lines in the input.
289  LineFinder::FindAndRemoveLines(source_resolution_,
290  textord_tabfind_show_vlines, pix_binary_,
291  &vertical_x, &vertical_y, music_mask_pix,
292  &v_lines, &h_lines);
294  pixa_debug_.AddPix(pix_binary_, "NoLines");
295  }
296  // Leptonica is used to find a mask of the photo regions in the input.
297  *photo_mask_pix = ImageFind::FindImages(pix_binary_, &pixa_debug_);
299  pixa_debug_.AddPix(pix_binary_, "NoImages");
300  }
301  if (!PSM_COL_FIND_ENABLED(pageseg_mode)) v_lines.clear();
302 
303  // The rest of the algorithm uses the usual connected components.
304  textord_.find_components(pix_binary_, blocks, to_blocks);
305 
306  TO_BLOCK_IT to_block_it(to_blocks);
307  // There must be exactly one input block.
308  // TODO(rays) handle new textline finding with a UNLV zone file.
309  ASSERT_HOST(to_blocks->singleton());
310  TO_BLOCK* to_block = to_block_it.data();
311  TBOX blkbox = to_block->block->bounding_box();
312  ColumnFinder* finder = NULL;
313 
314  if (to_block->line_size >= 2) {
315  finder = new ColumnFinder(static_cast<int>(to_block->line_size),
316  blkbox.botleft(), blkbox.topright(),
317  source_resolution_, textord_use_cjk_fp_model,
318  textord_tabfind_aligned_gap_fraction,
319  &v_lines, &h_lines, vertical_x, vertical_y);
320 
321  finder->SetupAndFilterNoise(pageseg_mode, *photo_mask_pix, to_block);
322 
323  if (equ_detect_) {
324  equ_detect_->LabelSpecialText(to_block);
325  }
326 
327  BLOBNBOX_CLIST osd_blobs;
328  // osd_orientation is the number of 90 degree rotations to make the
329  // characters upright. (See osdetect.h for precise definition.)
330  // We want the text lines horizontal, (vertical text indicates vertical
331  // textlines) which may conflict (eg vertically written CJK).
332  int osd_orientation = 0;
333  bool vertical_text = textord_tabfind_force_vertical_text ||
334  pageseg_mode == PSM_SINGLE_BLOCK_VERT_TEXT;
335  if (!vertical_text && textord_tabfind_vertical_text &&
336  PSM_ORIENTATION_ENABLED(pageseg_mode)) {
337  vertical_text =
338  finder->IsVerticallyAlignedText(textord_tabfind_vertical_text_ratio,
339  to_block, &osd_blobs);
340  }
341  if (PSM_OSD_ENABLED(pageseg_mode) && osd_tess != NULL && osr != NULL) {
342  GenericVector<int> osd_scripts;
343  if (osd_tess != this) {
344  // We are running osd as part of layout analysis, so constrain the
345  // scripts to those allowed by *this.
346  AddAllScriptsConverted(unicharset, osd_tess->unicharset, &osd_scripts);
347  for (int s = 0; s < sub_langs_.size(); ++s) {
348  AddAllScriptsConverted(sub_langs_[s]->unicharset,
349  osd_tess->unicharset, &osd_scripts);
350  }
351  }
352  os_detect_blobs(&osd_scripts, &osd_blobs, osr, osd_tess);
353  if (pageseg_mode == PSM_OSD_ONLY) {
354  delete finder;
355  return NULL;
356  }
357  osd_orientation = osr->best_result.orientation_id;
358  double osd_score = osr->orientations[osd_orientation];
359  double osd_margin = min_orientation_margin * 2;
360  for (int i = 0; i < 4; ++i) {
361  if (i != osd_orientation &&
362  osd_score - osr->orientations[i] < osd_margin) {
363  osd_margin = osd_score - osr->orientations[i];
364  }
365  }
366  int best_script_id = osr->best_result.script_id;
367  const char* best_script_str =
368  osd_tess->unicharset.get_script_from_script_id(best_script_id);
369  bool cjk = best_script_id == osd_tess->unicharset.han_sid() ||
370  best_script_id == osd_tess->unicharset.hiragana_sid() ||
371  best_script_id == osd_tess->unicharset.katakana_sid() ||
372  strcmp("Japanese", best_script_str) == 0 ||
373  strcmp("Korean", best_script_str) == 0 ||
374  strcmp("Hangul", best_script_str) == 0;
375  if (cjk) {
376  finder->set_cjk_script(true);
377  }
378  if (osd_margin < min_orientation_margin) {
379  // The margin is weak.
380  if (!cjk && !vertical_text && osd_orientation == 2) {
381  // upside down latin text is improbable with such a weak margin.
382  tprintf("OSD: Weak margin (%.2f), horiz textlines, not CJK: "
383  "Don't rotate.\n", osd_margin);
384  osd_orientation = 0;
385  } else {
386  tprintf(
387  "OSD: Weak margin (%.2f) for %d blob text block, "
388  "but using orientation anyway: %d\n",
389  osd_margin, osd_blobs.length(), osd_orientation);
390  }
391  }
392  }
393  osd_blobs.shallow_clear();
394  finder->CorrectOrientation(to_block, vertical_text, osd_orientation);
395  }
396 
397  return finder;
398 }
double textord_tabfind_vertical_text_ratio
int orientation_id
Definition: osdetect.h:41
bool PSM_OSD_ENABLED(int pageseg_mode)
Definition: publictypes.h:179
#define tprintf(...)
Definition: tprintf.h:31
int LabelSpecialText(TO_BLOCK *to_block)
static Pix * FindImages(Pix *pix, DebugPixa *pixa_debug)
Definition: imagefind.cpp:66
#define ASSERT_HOST(x)
Definition: errcode.h:84
float orientations[4]
Definition: osdetect.h:74
OSBestResult best_result
Definition: osdetect.h:79
void find_components(Pix *pix, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks)
Definition: tordmain.cpp:205
UNICHARSET unicharset
Definition: ccutil.h:68
const ICOORD & topright() const
Definition: rect.h:100
Definition: rect.h:30
bool PSM_COL_FIND_ENABLED(int pageseg_mode)
Definition: publictypes.h:185
void AddPix(const Pix *pix, const char *caption)
Definition: debugpixa.h:26
const ICOORD & botleft() const
Definition: rect.h:88
int os_detect_blobs(const GenericVector< int > *allowed_scripts, BLOBNBOX_CLIST *blob_list, OSResults *osr, tesseract::Tesseract *tess)
Definition: osdetect.cpp:276
static void FindAndRemoveLines(int resolution, bool debug, Pix *pix, int *vertical_x, int *vertical_y, Pix **pix_music_mask, TabVector_LIST *v_lines, TabVector_LIST *h_lines)
Definition: linefind.cpp:243
int script_id
Definition: osdetect.h:42
Orientation and script detection only.
Definition: publictypes.h:152
double textord_tabfind_aligned_gap_fraction
bool PSM_ORIENTATION_ENABLED(int pageseg_mode)
Definition: publictypes.h:182
integer coordinate
Definition: points.h:30
bool textord_tabfind_force_vertical_text

◆ SetupUniversalFontIds()

void tesseract::Tesseract::SetupUniversalFontIds ( )

Definition at line 436 of file tessedit.cpp.

436  {
437  // Note that we can get away with bitwise copying FontInfo in
438  // all_fonts, as it is a temporary structure and we avoid setting the
439  // delete callback.
440  UnicityTable<FontInfo> all_fonts;
441  all_fonts.set_compare_callback(NewPermanentTessCallback(CompareFontInfo));
442 
443  // Create the universal ID table.
444  CollectFonts(get_fontinfo_table(), &all_fonts);
445  for (int i = 0; i < sub_langs_.size(); ++i) {
446  CollectFonts(sub_langs_[i]->get_fontinfo_table(), &all_fonts);
447  }
448  // Assign ids from the table to each font table.
449  AssignIds(all_fonts, &get_fontinfo_table());
450  for (int i = 0; i < sub_langs_.size(); ++i) {
451  AssignIds(all_fonts, &sub_langs_[i]->get_fontinfo_table());
452  }
453  font_table_size_ = all_fonts.size();
454 }
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
int size() const
Return the size used.
bool CompareFontInfo(const FontInfo &fi1, const FontInfo &fi2)
Definition: fontinfo.cpp:120
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:344
void set_compare_callback(TessResultCallback2< bool, T const &, T const &> *cb)

◆ SetupWordPassN()

void tesseract::Tesseract::SetupWordPassN ( int  pass_n,
WordData word 
)

Definition at line 174 of file control.cpp.

174  {
175  if (pass_n == 1 || !word->word->done) {
176  if (pass_n == 1) {
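177  word->word->SetupForRecognition(unicharset, this, BestPix(),
178  tessedit_ocr_engine_mode, NULL,
179  classify_bln_numeric_mode,
180  textord_use_cjk_fp_model,
181  poly_allow_detailed_fx,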
182  word->row, word->block);
183  } else if (pass_n == 2) {
184  // TODO(rays) Should we do this on pass1 too?
185  word->word->caps_height = 0.0;
186  if (word->word->x_height == 0.0f)
187  word->word->x_height = word->row->x_height();
188  }
189  word->lang_words.truncate(0);
190  for (int s = 0; s <= sub_langs_.size(); ++s) {
191  // The sub_langs_.size() entry is for the master language.
192  Tesseract* lang_t = s < sub_langs_.size() ? sub_langs_[s] : this;
193  WERD_RES* word_res = new WERD_RES;
194  word_res->InitForRetryRecognition(*word->word);
195  word->lang_words.push_back(word_res);
196  // LSTM doesn't get setup for pass2.
197  if (pass_n == 1 || lang_t->tessedit_ocr_engine_mode != OEM_LSTM_ONLY) {
198  word_res->SetupForRecognition(
199  lang_t->unicharset, lang_t, BestPix(),
200  lang_t->tessedit_ocr_engine_mode, NULL,
201  lang_t->classify_bln_numeric_mode,
202  lang_t->textord_use_cjk_fp_model,
203  lang_t->poly_allow_detailed_fx, word->row, word->block);
204  }
205  }
206  }
207 }
void InitForRetryRecognition(const WERD_RES &source)
Definition: pageres.cpp:269
bool classify_bln_numeric_mode
Definition: classify.h:499
Pix * BestPix() const
UNICHARSET unicharset
Definition: ccutil.h:68
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:294

◆ SetupWordScripts()

void tesseract::Tesseract::SetupWordScripts ( BLOCK_LIST *  blocks)

◆ source_resolution()

int tesseract::Tesseract::source_resolution ( ) const
inline

Definition at line 221 of file tesseractclass.h.

221  {
222  return source_resolution_;
223  }

◆ split_and_recog_word()

void tesseract::Tesseract::split_and_recog_word ( WERD_RES word)

Definition at line 144 of file tfacepp.cpp.

144  {
145  // Find the biggest blob gap in the chopped_word.
146  int bestgap = -MAX_INT32;
147  int split_index = 0;
148  for (int b = 1; b < word->chopped_word->NumBlobs(); ++b) {
149  TBOX prev_box = word->chopped_word->blobs[b - 1]->bounding_box();
150  TBOX blob_box = word->chopped_word->blobs[b]->bounding_box();
151  int gap = blob_box.left() - prev_box.right();
152  if (gap > bestgap) {
153  bestgap = gap;
154  split_index = b;
155  }
156  }
157  ASSERT_HOST(split_index > 0);
158 
159  WERD_RES *word2 = NULL;
160  BlamerBundle *orig_bb = NULL;
161  split_word(word, split_index, &word2, &orig_bb);
162 
163  // Recognize the first part of the word.
164  recog_word_recursive(word);
165  // Recognize the second part of the word.
166  recog_word_recursive(word2);
167 
168  join_words(word, word2, orig_bb);
169 }
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:182
#define MAX_INT32
Definition: host.h:62
#define ASSERT_HOST(x)
Definition: errcode.h:84
inT16 left() const
Definition: rect.h:68
int NumBlobs() const
Definition: blobs.h:425
Definition: rect.h:30
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:240
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:110
inT16 right() const
Definition: rect.h:75
TWERD * chopped_word
Definition: pageres.h:201

◆ split_word()

void tesseract::Tesseract::split_word ( WERD_RES word,
int  split_pt,
WERD_RES **  right_piece,
BlamerBundle **  orig_blamer_bundle 
) const

Definition at line 182 of file tfacepp.cpp.

185  {
186  ASSERT_HOST(split_pt >0 && split_pt < word->chopped_word->NumBlobs());
187 
188  // Save a copy of the blamer bundle so we can try to reconstruct it below.
189  BlamerBundle *orig_bb =
190  word->blamer_bundle ? new BlamerBundle(*word->blamer_bundle) : NULL;
191 
192  WERD_RES *word2 = new WERD_RES(*word);
193 
194  // blow away the copied chopped_word, as we want to work with
195  // the blobs from the input chopped_word so seam_arrays can be merged.
196  TWERD *chopped = word->chopped_word;
197  TWERD *chopped2 = new TWERD;
198  chopped2->blobs.reserve(chopped->NumBlobs() - split_pt);
199  for (int i = split_pt; i < chopped->NumBlobs(); ++i) {
200  chopped2->blobs.push_back(chopped->blobs[i]);
201  }
202  chopped->blobs.truncate(split_pt);
203  word->chopped_word = NULL;
204  delete word2->chopped_word;
205  word2->chopped_word = NULL;
206 
207  const UNICHARSET &unicharset = *word->uch_set;
208  word->ClearResults();
209  word2->ClearResults();
210  word->chopped_word = chopped;
211  word2->chopped_word = chopped2;
212  word->SetupBasicsFromChoppedWord(unicharset);
213  word2->SetupBasicsFromChoppedWord(unicharset);
214 
215  // Try to adjust the blamer bundle.
216  if (orig_bb != NULL) {
217  // TODO(rays) Looks like a leak to me.
218  // orig_bb should take, rather than copy.
219  word->blamer_bundle = new BlamerBundle();
220  word2->blamer_bundle = new BlamerBundle();
221  orig_bb->SplitBundle(chopped->blobs.back()->bounding_box().right(),
222  word2->chopped_word->blobs[0]->bounding_box().left(),
223  wordrec_debug_blamer,
224  word->blamer_bundle, word2->blamer_bundle);
225  }
226 
227  *right_piece = word2;
228  *orig_blamer_bundle = orig_bb;
229 }
void reserve(int size)
BlamerBundle * blamer_bundle
Definition: pageres.h:230
int push_back(T object)
void ClearResults()
Definition: pageres.cpp:1142
void truncate(int size)
#define ASSERT_HOST(x)
Definition: errcode.h:84
Definition: blobs.h:395
void SetupBasicsFromChoppedWord(const UNICHARSET &unicharset_in)
Definition: pageres.cpp:335
UNICHARSET unicharset
Definition: ccutil.h:68
int NumBlobs() const
Definition: blobs.h:425
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
T & back() const
inT16 right() const
Definition: rect.h:75
bool wordrec_debug_blamer
Definition: wordrec.h:167
const UNICHARSET * uch_set
Definition: pageres.h:192
TBOX bounding_box() const
Definition: blobs.cpp:482
TWERD * chopped_word
Definition: pageres.h:201

◆ SubAndSuperscriptFix()

bool tesseract::Tesseract::SubAndSuperscriptFix ( WERD_RES word)

Attempt to split off any high (or low) bits at the ends of the word with poor certainty and recognize them separately. If the certainty gets much better and other sanity checks pass, accept.

This superscript fix is meant to be called in the second pass of recognition when we have tried once and already have a preliminary answer for word.

Returns
Whether we modified the given word.

Definition at line 101 of file superscript.cpp.

101  {
102  if (word->tess_failed || word->word->flag(W_REP_CHAR) ||
103  !word->best_choice) {
104  return false;
105  }
106  int num_leading, num_trailing;
107  ScriptPos sp_leading, sp_trailing;
108  float leading_certainty, trailing_certainty;
109  float avg_certainty, unlikely_threshold;
110 
111  // Calculate the number of whole suspicious characters at the edges.
112  GetSubAndSuperscriptCandidates(
113  word, &num_leading, &sp_leading, &leading_certainty,
114  &num_trailing, &sp_trailing, &trailing_certainty,
115  &avg_certainty, &unlikely_threshold);
116 
117  const char *leading_pos = sp_leading == SP_SUBSCRIPT ? "sub" : "super";
118  const char *trailing_pos = sp_trailing == SP_SUBSCRIPT ? "sub" : "super";
119 
120  int num_blobs = word->best_choice->length();
121 
122  // Calculate the remainder (partial characters) at the edges.
123  // This accounts for us having classified the best version of
124  // a word as [speaker?'] when it was instead [speaker.^{21}]
125  // (that is we accidentally thought the 2 was attached to the period).
126  int num_remainder_leading = 0, num_remainder_trailing = 0;
127  if (num_leading + num_trailing < num_blobs && unlikely_threshold < 0.0) {
128  int super_y_bottom =
130  int sub_y_top =
132  int last_word_char = num_blobs - 1 - num_trailing;
133  float last_char_certainty = word->best_choice->certainty(last_word_char);
134  if (word->best_choice->unichar_id(last_word_char) != 0 &&
135  last_char_certainty <= unlikely_threshold) {
136  ScriptPos rpos;
137  YOutlierPieces(word, last_word_char, super_y_bottom, sub_y_top,
138  NULL, NULL, &rpos, &num_remainder_trailing);
139  if (num_trailing > 0 && rpos != sp_trailing) num_remainder_trailing = 0;
140  if (num_remainder_trailing > 0 &&
141  last_char_certainty < trailing_certainty) {
142  trailing_certainty = last_char_certainty;
143  }
144  }
145  bool another_blob_available = (num_remainder_trailing == 0) ||
146  num_leading + num_trailing + 1 < num_blobs;
147  int first_char_certainty = word->best_choice->certainty(num_leading);
148  if (another_blob_available &&
149  word->best_choice->unichar_id(num_leading) != 0 &&
150  first_char_certainty <= unlikely_threshold) {
151  ScriptPos lpos;
152  YOutlierPieces(word, num_leading, super_y_bottom, sub_y_top,
153  &lpos, &num_remainder_leading, NULL, NULL);
154  if (num_leading > 0 && lpos != sp_leading) num_remainder_leading = 0;
155  if (num_remainder_leading > 0 &&
156  first_char_certainty < leading_certainty) {
157  leading_certainty = first_char_certainty;
158  }
159  }
160  }
161 
162  // If nothing to do, bail now.
163  if (num_leading + num_trailing +
164  num_remainder_leading + num_remainder_trailing == 0) {
165  return false;
166  }
167 
168  if (superscript_debug >= 1) {
169  tprintf("Candidate for superscript detection: %s (",
170  word->best_choice->unichar_string().string());
171  if (num_leading || num_remainder_leading) {
172  tprintf("%d.%d %s-leading ", num_leading, num_remainder_leading,
173  leading_pos);
174  }
175  if (num_trailing || num_remainder_trailing) {
176  tprintf("%d.%d %s-trailing ", num_trailing, num_remainder_trailing,
177  trailing_pos);
178  }
179  tprintf(")\n");
180  }
181  if (superscript_debug >= 3) {
182  word->best_choice->print();
183  }
184  if (superscript_debug >= 2) {
185  tprintf(" Certainties -- Average: %.2f Unlikely thresh: %.2f ",
186  avg_certainty, unlikely_threshold);
187  if (num_leading)
188  tprintf("Orig. leading (min): %.2f ", leading_certainty);
189  if (num_trailing)
190  tprintf("Orig. trailing (min): %.2f ", trailing_certainty);
191  tprintf("\n");
192  }
193 
194  // We've now calculated the number of rebuilt blobs we want to carve off.
195  // However, split_word() works from TBLOBs in chopped_word, so we need to
196  // convert to those.
197  int num_chopped_leading =
198  LeadingUnicharsToChopped(word, num_leading) + num_remainder_leading;
199  int num_chopped_trailing =
200  TrailingUnicharsToChopped(word, num_trailing) + num_remainder_trailing;
201 
202  int retry_leading = 0;
203  int retry_trailing = 0;
204  bool is_good = false;
205  WERD_RES *revised = TrySuperscriptSplits(
206  num_chopped_leading, leading_certainty, sp_leading,
207  num_chopped_trailing, trailing_certainty, sp_trailing,
208  word, &is_good, &retry_leading, &retry_trailing);
209  if (is_good) {
210  word->ConsumeWordResults(revised);
211  } else if (retry_leading || retry_trailing) {
212  int retry_chopped_leading =
213  LeadingUnicharsToChopped(revised, retry_leading);
214  int retry_chopped_trailing =
215  TrailingUnicharsToChopped(revised, retry_trailing);
216  WERD_RES *revised2 = TrySuperscriptSplits(
217  retry_chopped_leading, leading_certainty, sp_leading,
218  retry_chopped_trailing, trailing_certainty, sp_trailing,
219  revised, &is_good, &retry_leading, &retry_trailing);
220  if (is_good) {
221  word->ConsumeWordResults(revised2);
222  }
223  delete revised2;
224  }
225  delete revised;
226  return is_good;
227 }
WERD_RES * TrySuperscriptSplits(int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing)
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
void print() const
Definition: ratngs.h:578
BOOL8 tess_failed
Definition: pageres.h:272
WERD_CHOICE * best_choice
Definition: pageres.h:219
int length() const
Definition: ratngs.h:301
const int kBlnXHeight
Definition: normalis.h:28
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
const int kBlnBaselineOffset
Definition: normalis.h:29
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
void YOutlierPieces(WERD_RES *word, int rebuilt_blob_index, int super_y_bottom, int sub_y_top, ScriptPos *leading_pos, int *num_leading_outliers, ScriptPos *trailing_pos, int *num_trailing_outliers)
Definition: superscript.cpp:46
float certainty() const
Definition: ratngs.h:328
void GetSubAndSuperscriptCandidates(const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold)
const STRING & unichar_string() const
Definition: ratngs.h:539
WERD * word
Definition: pageres.h:175
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:757

◆ terrible_word_crunch()

BOOL8 tesseract::Tesseract::terrible_word_crunch ( WERD_RES * word,
GARBAGE_LEVEL  garbage_level 
)

Definition at line 508 of file docqual.cpp.

509  {
510  float rating_per_ch;
511  int adjusted_len;
512  int crunch_mode = 0;
513 
514  if ((word->best_choice->unichar_string().length () == 0) ||
515  (strspn (word->best_choice->unichar_string().string(), " ") ==
516  word->best_choice->unichar_string().unsigned_size ()))
517  crunch_mode = 1;
518  else {
519  adjusted_len = word->reject_map.length ();
520  if (adjusted_len > crunch_rating_max)
521  adjusted_len = crunch_rating_max;
522  rating_per_ch = word->best_choice->rating () / adjusted_len;
523 
524  if (rating_per_ch > crunch_terrible_rating)
525  crunch_mode = 2;
526  else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
527  crunch_mode = 3;
528  else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
529  (garbage_level != G_OK))
530  crunch_mode = 4;
531  else if ((rating_per_ch > crunch_poor_garbage_rate) &&
532  (garbage_level != G_OK))
533  crunch_mode = 5;
534  }
535  if (crunch_mode > 0) {
536  if (crunch_debug > 2) {
537  tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
538  crunch_mode, word->best_choice->unichar_string().string());
539  }
540  return TRUE;
541  }
542  else
543  return FALSE;
544 }
#define TRUE
Definition: capi.h:45
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
inT32 length() const
Definition: strngs.cpp:193
inT32 length() const
Definition: rejctmap.h:235
#define FALSE
Definition: capi.h:46
Definition: docqual.h:28
float certainty() const
Definition: ratngs.h:328
const STRING & unichar_string() const
Definition: ratngs.h:539
uinT32 unsigned_size() const
Definition: strngs.h:71
REJMAP reject_map
Definition: pageres.h:271
float rating() const
Definition: ratngs.h:325

◆ tess_acceptable_word()

bool tesseract::Tesseract::tess_acceptable_word ( WERD_RES * word)

Definition at line 69 of file tessbox.cpp.

69  {
70  return getDict().AcceptableResult(word);
71 }
Dict & getDict()
Definition: classify.h:65
bool AcceptableResult(WERD_RES *word) const
Definition: stopper.cpp:110

◆ tess_add_doc_word()

void tesseract::Tesseract::tess_add_doc_word ( WERD_CHOICE * word_choice)

Definition at line 79 of file tessbox.cpp.

79  {
80  getDict().add_document_word(*word_choice);
81 }
Dict & getDict()
Definition: classify.h:65
void add_document_word(const WERD_CHOICE &best_choice)
Adds a word found on this document to the document specific dictionary.
Definition: dict.cpp:612

◆ tess_segment_pass_n()

void tesseract::Tesseract::tess_segment_pass_n ( int  pass_n,
WERD_RES * word
)

Definition at line 39 of file tessbox.cpp.

39  {
40  int saved_enable_assoc = 0;
41  int saved_chop_enable = 0;
42 
43  if (word->word->flag(W_DONT_CHOP)) {
44  saved_enable_assoc = wordrec_enable_assoc;
45  saved_chop_enable = chop_enable;
46  wordrec_enable_assoc.set_value(0);
47  chop_enable.set_value(0);
48  }
49  if (pass_n == 1)
50  set_pass1();
51  else
52  set_pass2();
53  recog_word(word);
54  if (word->best_choice == NULL)
55  word->SetupFake(*word->uch_set);
56  if (word->word->flag(W_DONT_CHOP)) {
57  wordrec_enable_assoc.set_value(saved_enable_assoc);
58  chop_enable.set_value(saved_chop_enable);
59  }
60 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
bool wordrec_enable_assoc
Definition: wordrec.h:130
void recog_word(WERD_RES *word)
Definition: tfacepp.cpp:46
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
void set_pass1()
Definition: tface.cpp:89
void SetupFake(const UNICHARSET &uch)
Definition: pageres.cpp:344
WERD * word
Definition: pageres.h:175
void set_pass2()
Definition: tface.cpp:101
const UNICHARSET * uch_set
Definition: pageres.h:192

◆ TestNewNormalization()

bool tesseract::Tesseract::TestNewNormalization ( int  original_misfits,
float  baseline_shift,
float  new_x_ht,
WERD_RES * word,
BLOCK * block,
ROW * row
)

Definition at line 1468 of file control.cpp.

1470  {
1471  bool accept_new_x_ht = false;
1472  WERD_RES new_x_ht_word(word->word);
1473  if (word->blamer_bundle != NULL) {
1474  new_x_ht_word.blamer_bundle = new BlamerBundle();
1475  new_x_ht_word.blamer_bundle->CopyTruth(*(word->blamer_bundle));
1476  }
1477  new_x_ht_word.x_height = new_x_ht;
1478  new_x_ht_word.baseline_shift = baseline_shift;
1479  new_x_ht_word.caps_height = 0.0;
1480  new_x_ht_word.SetupForRecognition(
1483  poly_allow_detailed_fx, row, block);
1484  match_word_pass_n(2, &new_x_ht_word, row, block);
1485  if (!new_x_ht_word.tess_failed) {
1486  int new_misfits = CountMisfitTops(&new_x_ht_word);
1487  if (debug_x_ht_level >= 1) {
1488  tprintf("Old misfits=%d with x-height %f, new=%d with x-height %f\n",
1489  original_misfits, word->x_height,
1490  new_misfits, new_x_ht);
1491  tprintf("Old rating= %f, certainty=%f, new=%f, %f\n",
1492  word->best_choice->rating(), word->best_choice->certainty(),
1493  new_x_ht_word.best_choice->rating(),
1494  new_x_ht_word.best_choice->certainty());
1495  }
1496  // The misfits must improve and either the rating or certainty.
1497  accept_new_x_ht = new_misfits < original_misfits &&
1498  (new_x_ht_word.best_choice->certainty() >
1499  word->best_choice->certainty() ||
1500  new_x_ht_word.best_choice->rating() <
1501  word->best_choice->rating());
1502  if (debug_x_ht_level >= 1) {
1503  ReportXhtFixResult(accept_new_x_ht, new_x_ht, word, &new_x_ht_word);
1504  }
1505  }
1506  if (accept_new_x_ht) {
1507  word->ConsumeWordResults(&new_x_ht_word);
1508  return true;
1509  }
1510  return false;
1511 }
bool classify_bln_numeric_mode
Definition: classify.h:499
WERD_CHOICE * best_choice
Definition: pageres.h:219
BlamerBundle * blamer_bundle
Definition: pageres.h:230
#define tprintf(...)
Definition: tprintf.h:31
void ReportXhtFixResult(bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word)
Definition: control.cpp:1413
Pix * BestPix() const
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:69
UNICHARSET unicharset
Definition: ccutil.h:68
float certainty() const
Definition: ratngs.h:328
void match_word_pass_n(int pass_n, WERD_RES *word, ROW *row, BLOCK *block)
Definition: control.cpp:1576
WERD * word
Definition: pageres.h:175
void ConsumeWordResults(WERD_RES *word)
Definition: pageres.cpp:757
float x_height
Definition: pageres.h:295
float rating() const
Definition: ratngs.h:325

◆ textord()

const Textord& tesseract::Tesseract::textord ( ) const
inline

Definition at line 243 of file tesseractclass.h.

243  {
244  return textord_;
245  }

◆ TidyUp()

void tesseract::Tesseract::TidyUp ( PAGE_RES * page_res)
  • Counts up the labelled words and the blobs within.
  • Deletes all unused or emptied words, counting the unused ones.
  • Resets W_BOL and W_EOL flags correctly.
  • Builds the rebuild_word and rebuilds the box_word and the best_choice.

Definition at line 706 of file applybox.cpp.

706  {
707  int ok_blob_count = 0;
708  int bad_blob_count = 0;
709  int ok_word_count = 0;
710  int unlabelled_words = 0;
711  PAGE_RES_IT pr_it(page_res);
712  WERD_RES* word_res;
713  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
714  int ok_in_word = 0;
715  int blob_count = word_res->correct_text.size();
716  WERD_CHOICE* word_choice = new WERD_CHOICE(word_res->uch_set, blob_count);
717  word_choice->set_permuter(TOP_CHOICE_PERM);
718  for (int c = 0; c < blob_count; ++c) {
719  if (word_res->correct_text[c].length() > 0) {
720  ++ok_in_word;
721  }
722  // Since we only need a fake word_res->best_choice, the actual
723  // unichar_ids do not matter. Which is fortunate, since TidyUp()
724  // can be called while training Tesseract, at the stage where
725  // unicharset is not meaningful yet.
726  word_choice->append_unichar_id_space_allocated(
727  INVALID_UNICHAR_ID, word_res->best_state[c], 1.0f, -1.0f);
728  }
729  if (ok_in_word > 0) {
730  ok_blob_count += ok_in_word;
731  bad_blob_count += word_res->correct_text.size() - ok_in_word;
732  word_res->LogNewRawChoice(word_choice);
733  word_res->LogNewCookedChoice(1, false, word_choice);
734  } else {
735  ++unlabelled_words;
736  if (applybox_debug > 0) {
737  tprintf("APPLY_BOXES: Unlabelled word at :");
738  word_res->word->bounding_box().print();
739  }
740  pr_it.DeleteCurrentWord();
741  delete word_choice;
742  }
743  }
744  pr_it.restart_page();
745  for (; (word_res = pr_it.word()) != NULL; pr_it.forward()) {
746  // Denormalize back to a BoxWord.
747  word_res->RebuildBestState();
748  word_res->SetupBoxWord();
749  word_res->word->set_flag(W_BOL, pr_it.prev_row() != pr_it.row());
750  word_res->word->set_flag(W_EOL, pr_it.next_row() != pr_it.row());
751  }
752  if (applybox_debug > 0) {
753  tprintf(" Found %d good blobs.\n", ok_blob_count);
754  if (bad_blob_count > 0) {
755  tprintf(" Leaving %d unlabelled blobs in %d words.\n",
756  bad_blob_count, ok_word_count);
757  }
758  if (unlabelled_words > 0)
759  tprintf(" %d remaining unlabelled words deleted.\n", unlabelled_words);
760  }
761 }
void RebuildBestState()
Definition: pageres.cpp:800
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
Definition: pageres.cpp:612
GenericVector< int > best_state
Definition: pageres.h:255
Definition: werd.h:36
GenericVector< STRING > correct_text
Definition: pageres.h:259
void set_flag(WERD_FLAGS mask, BOOL8 value)
Definition: werd.h:129
#define tprintf(...)
Definition: tprintf.h:31
int size() const
Definition: genericvector.h:72
TBOX bounding_box() const
Definition: werd.cpp:160
Definition: werd.h:35
int length() const
Definition: genericvector.h:85
void SetupBoxWord()
Definition: pageres.cpp:843
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:450
WERD * word
Definition: pageres.h:175
void print() const
Definition: rect.h:270
bool LogNewRawChoice(WERD_CHOICE *word_choice)
Definition: pageres.cpp:596
const UNICHARSET * uch_set
Definition: pageres.h:192
void set_permuter(uinT8 perm)
Definition: ratngs.h:373

◆ tilde_crunch()

void tesseract::Tesseract::tilde_crunch ( PAGE_RES_IT page_res_it)

Definition at line 422 of file docqual.cpp.

422  {
423  WERD_RES *word;
424  GARBAGE_LEVEL garbage_level;
425  PAGE_RES_IT copy_it;
426  BOOL8 prev_potential_marked = FALSE;
427  BOOL8 found_terrible_word = FALSE;
428  BOOL8 ok_dict_word;
429 
430  page_res_it.restart_page();
431  while (page_res_it.word() != NULL) {
432  POLY_BLOCK* pb = page_res_it.block()->block->poly_block();
433  if (pb != NULL && !pb->IsText()) {
434  page_res_it.forward();
435  continue;
436  }
437  word = page_res_it.word();
438 
440  convert_bad_unlv_chs(word);
441 
443  word->merge_tess_fails();
444 
445  if (word->reject_map.accept_count () != 0) {
446  found_terrible_word = FALSE;
447  //Forget earlier potential crunches
448  prev_potential_marked = FALSE;
449  }
450  else {
451  ok_dict_word = safe_dict_word(word);
452  garbage_level = garbage_word (word, ok_dict_word);
453 
454  if ((garbage_level != G_NEVER_CRUNCH) &&
455  (terrible_word_crunch (word, garbage_level))) {
456  if (crunch_debug > 0) {
457  tprintf ("T CRUNCHING: \"%s\"\n",
458  word->best_choice->unichar_string().string());
459  }
461  if (prev_potential_marked) {
462  while (copy_it.word () != word) {
463  if (crunch_debug > 0) {
464  tprintf ("P1 CRUNCHING: \"%s\"\n",
465  copy_it.word()->best_choice->unichar_string().string());
466  }
467  copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
468  copy_it.forward ();
469  }
470  prev_potential_marked = FALSE;
471  }
472  found_terrible_word = TRUE;
473  }
474  else if ((garbage_level != G_NEVER_CRUNCH) &&
475  (potential_word_crunch (word,
476  garbage_level, ok_dict_word))) {
477  if (found_terrible_word) {
478  if (crunch_debug > 0) {
479  tprintf ("P2 CRUNCHING: \"%s\"\n",
480  word->best_choice->unichar_string().string());
481  }
483  }
484  else if (!prev_potential_marked) {
485  copy_it = page_res_it;
486  prev_potential_marked = TRUE;
487  if (crunch_debug > 1) {
488  tprintf ("P3 CRUNCHING: \"%s\"\n",
489  word->best_choice->unichar_string().string());
490  }
491  }
492  }
493  else {
494  found_terrible_word = FALSE;
495  //Forget earlier potential crunches
496  prev_potential_marked = FALSE;
497  if (crunch_debug > 2) {
498  tprintf ("NO CRUNCH: \"%s\"\n",
499  word->best_choice->unichar_string().string());
500  }
501  }
502  }
503  page_res_it.forward ();
504  }
505 }
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:664
#define TRUE
Definition: capi.h:45
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
BLOCK * block
Definition: pageres.h:99
const char * string() const
Definition: strngs.cpp:198
BOOL8 potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word)
Definition: docqual.cpp:546
inT16 accept_count()
Definition: rejctmap.cpp:329
WERD_RES * restart_page()
Definition: pageres.h:683
inT16 safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:607
bool IsText() const
Definition: polyblk.h:52
WERD_RES * forward()
Definition: pageres.h:716
unsigned char BOOL8
Definition: host.h:44
#define FALSE
Definition: capi.h:46
GARBAGE_LEVEL garbage_word(WERD_RES *word, BOOL8 ok_dict_word)
Definition: docqual.cpp:684
BOOL8 terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:508
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294
const STRING & unichar_string() const
Definition: ratngs.h:539
POLY_BLOCK * poly_block() const
Definition: pdblock.h:55
void merge_tess_fails()
Definition: pageres.cpp:1062
bool crunch_early_convert_bad_unlv_chs
WERD_RES * word() const
Definition: pageres.h:736
REJMAP reject_map
Definition: pageres.h:271
GARBAGE_LEVEL
Definition: docqual.h:25
BLOCK_RES * block() const
Definition: pageres.h:742

◆ tilde_delete()

void tesseract::Tesseract::tilde_delete ( PAGE_RES_IT page_res_it)

Definition at line 594 of file docqual.cpp.

594  {
595  WERD_RES *word;
596  PAGE_RES_IT copy_it;
597  BOOL8 deleting_from_bol = FALSE;
598  BOOL8 marked_delete_point = FALSE;
599  inT16 debug_delete_mode;
600  CRUNCH_MODE delete_mode;
601  inT16 x_debug_delete_mode;
602  CRUNCH_MODE x_delete_mode;
603 
604  page_res_it.restart_page();
605  while (page_res_it.word() != NULL) {
606  word = page_res_it.word();
607 
608  delete_mode = word_deletable (word, debug_delete_mode);
609  if (delete_mode != CR_NONE) {
610  if (word->word->flag (W_BOL) || deleting_from_bol) {
611  if (crunch_debug > 0) {
612  tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
613  debug_delete_mode,
614  word->best_choice->unichar_string().string());
615  }
616  word->unlv_crunch_mode = delete_mode;
617  deleting_from_bol = TRUE;
618  } else if (word->word->flag(W_EOL)) {
619  if (marked_delete_point) {
620  while (copy_it.word() != word) {
621  x_delete_mode = word_deletable (copy_it.word (),
622  x_debug_delete_mode);
623  if (crunch_debug > 0) {
624  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
625  x_debug_delete_mode,
626  copy_it.word()->best_choice->unichar_string().string());
627  }
628  copy_it.word ()->unlv_crunch_mode = x_delete_mode;
629  copy_it.forward ();
630  }
631  }
632  if (crunch_debug > 0) {
633  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
634  debug_delete_mode,
635  word->best_choice->unichar_string().string());
636  }
637  word->unlv_crunch_mode = delete_mode;
638  deleting_from_bol = FALSE;
639  marked_delete_point = FALSE;
640  }
641  else {
642  if (!marked_delete_point) {
643  copy_it = page_res_it;
644  marked_delete_point = TRUE;
645  }
646  }
647  }
648  else {
649  deleting_from_bol = FALSE;
650  //Forget earlier potential crunches
651  marked_delete_point = FALSE;
652  }
653  /*
654  The following step has been left till now as the tess fails are used to
655  determine if the word is deletable.
656  */
658  word->merge_tess_fails();
659  page_res_it.forward ();
660  }
661 }
#define TRUE
Definition: capi.h:45
WERD_CHOICE * best_choice
Definition: pageres.h:219
Definition: werd.h:36
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
int16_t inT16
Definition: host.h:36
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
WERD_RES * restart_page()
Definition: pageres.h:683
CRUNCH_MODE word_deletable(WERD_RES *word, inT16 &delete_mode)
Definition: docqual.cpp:899
CRUNCH_MODE
Definition: pageres.h:145
WERD_RES * forward()
Definition: pageres.h:716
unsigned char BOOL8
Definition: host.h:44
#define FALSE
Definition: capi.h:46
Definition: werd.h:35
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294
const STRING & unichar_string() const
Definition: ratngs.h:539
void merge_tess_fails()
Definition: pageres.cpp:1062
WERD * word
Definition: pageres.h:175
WERD_RES * word() const
Definition: pageres.h:736

◆ TrainedXheightFix()

bool tesseract::Tesseract::TrainedXheightFix ( WERD_RES * word,
BLOCK * block,
ROW * row
)

Definition at line 1434 of file control.cpp.

1434  {
1435  int original_misfits = CountMisfitTops(word);
1436  if (original_misfits == 0)
1437  return false;
1438  float baseline_shift = 0.0f;
1439  float new_x_ht = ComputeCompatibleXheight(word, &baseline_shift);
1440  if (baseline_shift != 0.0f) {
1441  // Try the shift on its own first.
1442  if (!TestNewNormalization(original_misfits, baseline_shift, word->x_height,
1443  word, block, row))
1444  return false;
1445  original_misfits = CountMisfitTops(word);
1446  if (original_misfits > 0) {
1447  float new_baseline_shift;
1448  // Now recompute the new x_height.
1449  new_x_ht = ComputeCompatibleXheight(word, &new_baseline_shift);
1450  if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1451  // No test of return value here, as we are definitely making a change
1452  // to the word by shifting the baseline.
1453  TestNewNormalization(original_misfits, baseline_shift, new_x_ht,
1454  word, block, row);
1455  }
1456  }
1457  return true;
1458  } else if (new_x_ht >= kMinRefitXHeightFraction * word->x_height) {
1459  return TestNewNormalization(original_misfits, 0.0f, new_x_ht,
1460  word, block, row);
1461  } else {
1462  return false;
1463  }
1464 }
bool TestNewNormalization(int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row)
Definition: control.cpp:1468
float ComputeCompatibleXheight(WERD_RES *word_res, float *baseline_shift)
Definition: fixxht.cpp:101
const double kMinRefitXHeightFraction
Definition: control.cpp:57
int CountMisfitTops(WERD_RES *word_res)
Definition: fixxht.cpp:69
float x_height
Definition: pageres.h:295

◆ TrainFromBoxes()

void tesseract::Tesseract::TrainFromBoxes ( const GenericVector< TBOX > &  boxes,
const GenericVector< STRING > &  texts,
BLOCK_LIST *  block_list,
DocumentData * training_data
)

Definition at line 76 of file linerec.cpp.

79  {
80  int box_count = boxes.size();
81  // Process all the text lines in this page, as defined by the boxes.
82  int end_box = 0;
83  // Don't let \t, which marks newlines in the box file, get into the line
84  // content, as that makes the line unusable in training.
85  while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
86  for (int start_box = end_box; start_box < box_count; start_box = end_box) {
87  // Find the textline of boxes starting at start and their bounding box.
88  TBOX line_box = boxes[start_box];
89  STRING line_str = texts[start_box];
90  for (end_box = start_box + 1; end_box < box_count && texts[end_box] != "\t";
91  ++end_box) {
92  line_box += boxes[end_box];
93  line_str += texts[end_box];
94  }
95  // Find the most overlapping block.
96  BLOCK* best_block = NULL;
97  int best_overlap = 0;
98  BLOCK_IT b_it(block_list);
99  for (b_it.mark_cycle_pt(); !b_it.cycled_list(); b_it.forward()) {
100  BLOCK* block = b_it.data();
101  if (block->poly_block() != NULL && !block->poly_block()->IsText())
102  continue; // Not a text block.
103  TBOX block_box = block->bounding_box();
104  block_box.rotate(block->re_rotation());
105  if (block_box.major_overlap(line_box)) {
106  TBOX overlap_box = line_box.intersection(block_box);
107  if (overlap_box.area() > best_overlap) {
108  best_overlap = overlap_box.area();
109  best_block = block;
110  }
111  }
112  }
113  ImageData* imagedata = NULL;
114  if (best_block == NULL) {
115  tprintf("No block overlapping textline: %s\n", line_str.string());
116  } else {
117  imagedata = GetLineData(line_box, boxes, texts, start_box, end_box,
118  *best_block);
119  }
120  if (imagedata != NULL)
121  training_data->AddPageToDocument(imagedata);
122  // Don't let \t, which marks newlines in the box file, get into the line
123  // content, as that makes the line unusable in training.
124  while (end_box < texts.size() && texts[end_box] == "\t") ++end_box;
125  }
126 }
TBOX intersection(const TBOX &box) const
Definition: rect.cpp:87
void bounding_box(ICOORD &bottom_left, ICOORD &top_right) const
get box
Definition: pdblock.h:59
inT32 area() const
Definition: rect.h:118
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
int size() const
Definition: genericvector.h:72
bool IsText() const
Definition: polyblk.h:52
Definition: strngs.h:45
bool major_overlap(const TBOX &box) const
Definition: rect.h:358
Definition: rect.h:30
POLY_BLOCK * poly_block() const
Definition: pdblock.h:55
FCOORD re_rotation() const
Definition: ocrblock.h:138
ImageData * GetLineData(const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block)
Definition: linerec.cpp:131
void rotate(const FCOORD &vec)
Definition: rect.h:189
Definition: ocrblock.h:30

◆ TrainLineRecognizer()

void tesseract::Tesseract::TrainLineRecognizer ( const STRING & input_imagename,
const STRING & output_basename,
BLOCK_LIST *  block_list 
)

Definition at line 45 of file linerec.cpp.

47  {
48  STRING lstmf_name = output_basename + ".lstmf";
49  DocumentData images(lstmf_name);
50  if (applybox_page > 0) {
51  // Load existing document for the previous pages.
52  if (!images.LoadDocument(lstmf_name.string(), 0, 0, nullptr)) {
53  tprintf("Failed to read training data from %s!\n", lstmf_name.string());
54  return;
55  }
56  }
57  GenericVector<TBOX> boxes;
58  GenericVector<STRING> texts;
59  // Get the boxes for this page, if there are any.
60  if (!ReadAllBoxes(applybox_page, false, input_imagename, &boxes, &texts, NULL,
61  NULL) ||
62  boxes.empty()) {
63  tprintf("Failed to read boxes from %s\n", input_imagename.string());
64  return;
65  }
66  TrainFromBoxes(boxes, texts, block_list, &images);
67  images.Shuffle();
68  if (!images.SaveDocument(lstmf_name.string(), NULL)) {
69  tprintf("Failed to write training data to %s!\n", lstmf_name.string());
70  }
71 }
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
bool empty() const
Definition: genericvector.h:90
Definition: strngs.h:45
bool ReadAllBoxes(int target_page, bool skip_blanks, const STRING &filename, GenericVector< TBOX > *boxes, GenericVector< STRING > *texts, GenericVector< STRING > *box_texts, GenericVector< int > *pages)
Definition: boxread.cpp:50
void TrainFromBoxes(const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data)
Definition: linerec.cpp:76

◆ TrySuperscriptSplits()

WERD_RES * tesseract::Tesseract::TrySuperscriptSplits ( int  num_chopped_leading,
float  leading_certainty,
ScriptPos  leading_pos,
int  num_chopped_trailing,
float  trailing_certainty,
ScriptPos  trailing_pos,
WERD_RES * word,
bool * is_good,
int * retry_rebuild_leading,
int * retry_rebuild_trailing
)

Try splitting off the given number of (chopped) blobs from the front and back of the given word and recognizing the pieces.

Parameters
[in] num_chopped_leading: how many chopped blobs from the left end of the word to chop off and try recognizing as a superscript (or subscript)
[in] leading_certainty: the (minimum) certainty had by the characters in the original leading section.
[in] leading_pos: "super" or "sub" (for debugging)
[in] num_chopped_trailing: how many chopped blobs from the right end of the word to chop off and try recognizing as a superscript (or subscript)
[in] trailing_certainty: the (minimum) certainty had by the characters in the original trailing section.
[in] trailing_pos: "super" or "sub" (for debugging)
[in] word: the word to try to chop up.
[out] is_good: do we believe our result?
[out] retry_rebuild_leading, retry_rebuild_trailing: If non-zero, and !is_good, then the caller may have luck trying to split the returned word with this number of (rebuilt) leading and trailing blobs / unichars.
Returns
A word which is the result of re-recognizing as asked.

Definition at line 382 of file superscript.cpp.

388  {
389  int num_chopped = word->chopped_word->NumBlobs();
390 
391  *retry_rebuild_leading = *retry_rebuild_trailing = 0;
392 
393  // Chop apart the word into up to three pieces.
394 
395  BlamerBundle *bb0 = NULL;
396  BlamerBundle *bb1 = NULL;
397  WERD_RES *prefix = NULL;
398  WERD_RES *core = NULL;
399  WERD_RES *suffix = NULL;
400  if (num_chopped_leading > 0) {
401  prefix = new WERD_RES(*word);
402  split_word(prefix, num_chopped_leading, &core, &bb0);
403  } else {
404  core = new WERD_RES(*word);
405  }
406 
407  if (num_chopped_trailing > 0) {
408  int split_pt = num_chopped - num_chopped_trailing - num_chopped_leading;
409  split_word(core, split_pt, &suffix, &bb1);
410  }
411 
412  // Recognize the pieces in turn.
413  int saved_cp_multiplier = classify_class_pruner_multiplier;
414  int saved_im_multiplier = classify_integer_matcher_multiplier;
415  if (prefix) {
416  // Turn off Tesseract's y-position penalties for the leading superscript.
419 
420  // Adjust our expectations about the baseline for this prefix.
421  if (superscript_debug >= 3) {
422  tprintf(" recognizing first %d chopped blobs\n", num_chopped_leading);
423  }
424  recog_word_recursive(prefix);
425  if (superscript_debug >= 2) {
426  tprintf(" The leading bits look like %s %s\n",
427  ScriptPosToString(leading_pos),
428  prefix->best_choice->unichar_string().string());
429  }
430 
431  // Restore the normal y-position penalties.
432  classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
433  classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
434  }
435 
436  if (superscript_debug >= 3) {
437  tprintf(" recognizing middle %d chopped blobs\n",
438  num_chopped - num_chopped_leading - num_chopped_trailing);
439  }
440 
441  if (suffix) {
442  // Turn off Tesseract's y-position penalties for the trailing superscript.
445 
446  if (superscript_debug >= 3) {
447  tprintf(" recognizing last %d chopped blobs\n", num_chopped_trailing);
448  }
449  recog_word_recursive(suffix);
450  if (superscript_debug >= 2) {
451  tprintf(" The trailing bits look like %s %s\n",
452  ScriptPosToString(trailing_pos),
453  suffix->best_choice->unichar_string().string());
454  }
455 
456  // Restore the normal y-position penalties.
457  classify_class_pruner_multiplier.set_value(saved_cp_multiplier);
458  classify_integer_matcher_multiplier.set_value(saved_im_multiplier);
459  }
460 
461  // Evaluate whether we think the results are believably better
462  // than what we already had.
463  bool good_prefix = !prefix || BelievableSuperscript(
464  superscript_debug >= 1, *prefix,
465  superscript_bettered_certainty * leading_certainty,
466  retry_rebuild_leading, NULL);
467  bool good_suffix = !suffix || BelievableSuperscript(
468  superscript_debug >= 1, *suffix,
469  superscript_bettered_certainty * trailing_certainty,
470  NULL, retry_rebuild_trailing);
471 
472  *is_good = good_prefix && good_suffix;
473  if (!*is_good && !*retry_rebuild_leading && !*retry_rebuild_trailing) {
474  // None of it is any good. Quit now.
475  delete core;
476  delete prefix;
477  delete suffix;
478  return NULL;
479  }
480  recog_word_recursive(core);
481 
482  // Now paste the results together into core.
483  if (suffix) {
484  suffix->SetAllScriptPositions(trailing_pos);
485  join_words(core, suffix, bb1);
486  }
487  if (prefix) {
488  prefix->SetAllScriptPositions(leading_pos);
489  join_words(prefix, core, bb0);
490  core = prefix;
491  prefix = NULL;
492  }
493 
494  if (superscript_debug >= 1) {
495  tprintf("%s superscript fix: %s\n", *is_good ? "ACCEPT" : "REJECT",
496  core->best_choice->unichar_string().string());
497  }
498  return core;
499 }
void split_word(WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const
Definition: tfacepp.cpp:182
WERD_CHOICE * best_choice
Definition: pageres.h:219
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: pageres.cpp:860
int classify_integer_matcher_multiplier
Definition: classify.h:468
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
int classify_class_pruner_multiplier
Definition: classify.h:464
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:180
int NumBlobs() const
Definition: blobs.h:425
bool BelievableSuperscript(bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const
const STRING & unichar_string() const
Definition: ratngs.h:539
void join_words(WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const
Definition: tfacepp.cpp:240
void recog_word_recursive(WERD_RES *word)
Definition: tfacepp.cpp:110
double superscript_bettered_certainty
TWERD * chopped_word
Definition: pageres.h:201

◆ unrej_good_chs()

void tesseract::Tesseract::unrej_good_chs ( WERD_RES * word,
ROW * row
)

Definition at line 120 of file docqual.cpp.

120  {
121  if (word->bln_boxes == NULL ||
122  word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
123  return;
124 
125  DocQualCallbacks cb(word);
127  *word->rebuild_word,
129 }
void AcceptIfGoodQuality(int index)
Definition: docqual.cpp:49
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:193
TWERD * rebuild_word
Definition: pageres.h:244
bool empty() const
Definition: genericvector.h:90
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
tesseract::BoxWord * bln_boxes
Definition: pageres.h:184

◆ unrej_good_quality_words()

void tesseract::Tesseract::unrej_good_quality_words ( PAGE_RES_IT page_res_it)

Definition at line 165 of file docqual.cpp.

166  {
167  WERD_RES *word;
168  ROW_RES *current_row;
169  BLOCK_RES *current_block;
170  int i;
171 
172  page_res_it.restart_page ();
173  while (page_res_it.word () != NULL) {
174  check_debug_pt (page_res_it.word (), 100);
175  if (bland_unrej) {
176  word = page_res_it.word ();
177  for (i = 0; i < word->reject_map.length (); i++) {
178  if (word->reject_map[i].accept_if_good_quality ())
179  word->reject_map[i].setrej_quality_accept ();
180  }
181  page_res_it.forward ();
182  }
183  else if ((page_res_it.row ()->char_count > 0) &&
184  ((page_res_it.row ()->rej_count /
185  (float) page_res_it.row ()->char_count) <=
187  word = page_res_it.word ();
191  word->best_choice->unichar_string().string(),
193  != AC_UNACCEPTABLE)) {
194  unrej_good_chs(word, page_res_it.row ()->row);
195  }
196  page_res_it.forward ();
197  }
198  else {
199  /* Skip to end of dodgy row */
200  current_row = page_res_it.row ();
201  while ((page_res_it.word () != NULL) &&
202  (page_res_it.row () == current_row))
203  page_res_it.forward ();
204  }
205  check_debug_pt (page_res_it.word (), 110);
206  }
207  page_res_it.restart_page ();
208  page_res_it.page_res->char_count = 0;
209  page_res_it.page_res->rej_count = 0;
210  current_block = NULL;
211  current_row = NULL;
212  while (page_res_it.word () != NULL) {
213  if (current_block != page_res_it.block ()) {
214  current_block = page_res_it.block ();
215  current_block->char_count = 0;
216  current_block->rej_count = 0;
217  }
218  if (current_row != page_res_it.row ()) {
219  current_row = page_res_it.row ();
220  current_row->char_count = 0;
221  current_row->rej_count = 0;
222  current_row->whole_word_rej_count = 0;
223  }
224  page_res_it.rej_stat_word ();
225  page_res_it.forward ();
226  }
227 }
inT32 rej_count
Definition: pageres.h:129
inT32 char_count
Definition: pageres.h:60
Unacceptable word.
Definition: control.h:36
WERD_CHOICE * best_choice
Definition: pageres.h:219
void unrej_good_chs(WERD_RES *word, ROW *row)
Definition: docqual.cpp:120
const STRING & unichar_lengths() const
Definition: ratngs.h:546
inT32 rej_count
Definition: pageres.h:61
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1794
void rej_stat_word()
Definition: pageres.cpp:1675
const char * string() const
Definition: strngs.cpp:198
ROW * row
Definition: pageres.h:127
ROW_RES * row() const
Definition: pageres.h:739
inT32 length() const
Definition: rejctmap.h:235
WERD_RES * restart_page()
Definition: pageres.h:683
inT32 whole_word_rej_count
Definition: pageres.h:130
PAGE_RES * page_res
Definition: pageres.h:661
WERD_RES * forward()
Definition: pageres.h:716
inT32 rej_count
Definition: pageres.h:101
inT32 char_count
Definition: pageres.h:128
BOOL8 quality_recoverable_rejects()
Definition: rejctmap.cpp:352
const STRING & unichar_string() const
Definition: ratngs.h:539
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1690
WERD_RES * word() const
Definition: pageres.h:736
const UNICHARSET * uch_set
Definition: pageres.h:192
inT32 char_count
Definition: pageres.h:100
REJMAP reject_map
Definition: pageres.h:271
BLOCK_RES * block() const
Definition: pageres.h:742

◆ word_adaptable()

BOOL8 tesseract::Tesseract::word_adaptable ( WERD_RES * word,
uinT16  mode 
)

Definition at line 45 of file adaptions.cpp.

47  {
49  tprintf("Running word_adaptable() for %s rating %.4f certainty %.4f\n",
50  word->best_choice == NULL ? "" :
52  word->best_choice->rating(), word->best_choice->certainty());
53  }
54 
55  BOOL8 status = FALSE;
56  BITS16 flags(mode);
57 
58  enum MODES
59  {
60  ADAPTABLE_WERD,
61  ACCEPTABLE_WERD,
62  CHECK_DAWGS,
63  CHECK_SPACES,
64  CHECK_ONE_ELL_CONFLICT,
65  CHECK_AMBIG_WERD
66  };
67 
68  /*
69  0: NO adaption
70  */
71  if (mode == 0) {
72  if (tessedit_adaption_debug) tprintf("adaption disabled\n");
73  return FALSE;
74  }
75 
76  if (flags.bit (ADAPTABLE_WERD)) {
77  status |= word->tess_would_adapt; // result of Classify::AdaptableWord()
78  if (tessedit_adaption_debug && !status) {
79  tprintf("tess_would_adapt bit is false\n");
80  }
81  }
82 
83  if (flags.bit (ACCEPTABLE_WERD)) {
84  status |= word->tess_accepted;
85  if (tessedit_adaption_debug && !status) {
86  tprintf("tess_accepted bit is false\n");
87  }
88  }
89 
90  if (!status) { // If not set then
91  return FALSE; // ignore other checks
92  }
93 
94  if (flags.bit (CHECK_DAWGS) &&
95  (word->best_choice->permuter () != SYSTEM_DAWG_PERM) &&
96  (word->best_choice->permuter () != FREQ_DAWG_PERM) &&
97  (word->best_choice->permuter () != USER_DAWG_PERM) &&
98  (word->best_choice->permuter () != NUMBER_PERM)) {
99  if (tessedit_adaption_debug) tprintf("word not in dawgs\n");
100  return FALSE;
101  }
102 
103  if (flags.bit (CHECK_ONE_ELL_CONFLICT) && one_ell_conflict (word, FALSE)) {
104  if (tessedit_adaption_debug) tprintf("word has ell conflict\n");
105  return FALSE;
106  }
107 
108  if (flags.bit (CHECK_SPACES) &&
109  (strchr(word->best_choice->unichar_string().string(), ' ') != NULL)) {
110  if (tessedit_adaption_debug) tprintf("word contains spaces\n");
111  return FALSE;
112  }
113 
114  if (flags.bit (CHECK_AMBIG_WERD) &&
116  if (tessedit_adaption_debug) tprintf("word is ambiguous\n");
117  return FALSE;
118  }
119 
121  tprintf("returning status %d\n", status);
122  }
123  return status;
124 }
BOOL8 one_ell_conflict(WERD_RES *word_res, BOOL8 update_map)
Definition: reject.cpp:292
Definition: bits16.h:25
WERD_CHOICE * best_choice
Definition: pageres.h:219
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
uinT8 permuter() const
Definition: ratngs.h:344
unsigned char BOOL8
Definition: host.h:44
#define FALSE
Definition: capi.h:46
const char int mode
Definition: ioapi.h:38
bool dangerous_ambig_found() const
Definition: ratngs.h:361
BOOL8 tess_would_adapt
Definition: pageres.h:281
BOOL8 tess_accepted
Definition: pageres.h:280
float certainty() const
Definition: ratngs.h:328
const STRING & unichar_string() const
Definition: ratngs.h:539
float rating() const
Definition: ratngs.h:325

◆ word_blank_and_set_display()

BOOL8 tesseract::Tesseract::word_blank_and_set_display ( PAGE_RES_IT * pr_its)

Definition at line 716 of file pgedit.cpp.

716  {
717  pr_it->word()->word->bounding_box().plot(image_win, ScrollView::BLACK,
719  return word_set_display(pr_it);
720 }
ScrollView * image_win
Definition: pgedit.cpp:107
BOOL8 word_set_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:945

◆ word_bln_display()

BOOL8 tesseract::Tesseract::word_bln_display ( PAGE_RES_IT * pr_it)

word_bln_display()

Normalize word and display in word window

Definition at line 728 of file pgedit.cpp.

728  {
729  WERD_RES* word_res = pr_it->word();
730  if (word_res->chopped_word == NULL) {
731  // Setup word normalization parameters.
732  word_res->SetupForRecognition(unicharset, this, BestPix(),
737  pr_it->row()->row, pr_it->block()->block);
738  }
741  1.0, 0.0f, -1000.0f, 1000.0f);
742  C_BLOB_IT it(word_res->word->cblob_list());
744  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
745  it.data()->plot_normed(word_res->denorm, color, ScrollView::BROWN,
747  color = WERD::NextColor(color);
748  }
750  return TRUE;
751 }
static ScrollView::Color NextColor(ScrollView::Color colour)
Definition: werd.cpp:306
#define TRUE
Definition: capi.h:45
bool classify_bln_numeric_mode
Definition: classify.h:499
void display_bln_lines(ScrollView *window, ScrollView::Color colour, float scale_factor, float y_offset, float minx, float maxx)
Definition: pgedit.cpp:210
BLOCK * block
Definition: pageres.h:99
ROW * row
Definition: pageres.h:127
ScrollView * bln_word_window_handle()
Definition: pgedit.cpp:172
ROW_RES * row() const
Definition: pageres.h:739
void Clear()
Definition: scrollview.cpp:595
static void Update()
Definition: scrollview.cpp:715
Pix * BestPix() const
UNICHARSET unicharset
Definition: ccutil.h:68
DENORM denorm
Definition: pageres.h:190
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
WERD * word
Definition: pageres.h:175
bool SetupForRecognition(const UNICHARSET &unicharset_in, tesseract::Tesseract *tesseract, Pix *pix, int norm_mode, const TBOX *norm_box, bool numeric_mode, bool use_body_size, bool allow_detailed_fx, ROW *row, const BLOCK *block)
Definition: pageres.cpp:294
WERD_RES * word() const
Definition: pageres.h:736
TWERD * chopped_word
Definition: pageres.h:201
BLOCK_RES * block() const
Definition: pageres.h:742

◆ word_blob_quality()

inT16 tesseract::Tesseract::word_blob_quality ( WERD_RES word,
ROW row 
)

Definition at line 65 of file docqual.cpp.

65  {
66  if (word->bln_boxes == NULL ||
67  word->rebuild_word == NULL || word->rebuild_word->blobs.empty())
68  return 0;
69 
70  DocQualCallbacks cb(word);
72  *word->rebuild_word,
74  return cb.match_count;
75 }
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:193
TWERD * rebuild_word
Definition: pageres.h:244
bool empty() const
Definition: genericvector.h:90
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
tesseract::BoxWord * bln_boxes
Definition: pageres.h:184
void CountMatchingBlobs(int index)
Definition: docqual.cpp:39

◆ word_char_quality()

void tesseract::Tesseract::word_char_quality ( WERD_RES word,
ROW row,
inT16 match_count,
inT16 accepted_match_count 
)

Definition at line 97 of file docqual.cpp.

100  {
101  if (word->bln_boxes == NULL || word->rebuild_word == NULL ||
102  word->rebuild_word->blobs.empty()) {
103  *match_count = 0;
104  *accepted_match_count = 0;
105  return;
106  }
107 
108  DocQualCallbacks cb(word);
110  *word->rebuild_word,
112  *match_count = cb.match_count;
113  *accepted_match_count = cb.accepted_match_count;
114 }
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:193
TWERD * rebuild_word
Definition: pageres.h:244
bool empty() const
Definition: genericvector.h:90
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
tesseract::BoxWord * bln_boxes
Definition: pageres.h:184
void CountAcceptedBlobs(int index)
Definition: docqual.cpp:43

◆ word_contains_non_1_digit()

BOOL8 tesseract::Tesseract::word_contains_non_1_digit ( const char *  word,
const char *  word_lengths 
)

Definition at line 509 of file reject.cpp.

510  {
511  inT16 i;
512  inT16 offset;
513 
514  for (i = 0, offset = 0; word[offset] != '\0'; offset += word_lengths[i++]) {
515  if (unicharset.get_isdigit (word + offset, word_lengths[i]) &&
516  (word_lengths[i] != 1 || word[offset] != '1'))
517  return TRUE;
518  }
519  return FALSE;
520 }
#define TRUE
Definition: capi.h:45
voidpf uLong offset
Definition: ioapi.h:42
int16_t inT16
Definition: host.h:36
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:472
#define FALSE
Definition: capi.h:46
UNICHARSET unicharset
Definition: ccutil.h:68

◆ word_deletable()

CRUNCH_MODE tesseract::Tesseract::word_deletable ( WERD_RES word,
inT16 delete_mode 
)

Definition at line 899 of file docqual.cpp.

899  {
900  int word_len = word->reject_map.length ();
901  float rating_per_ch;
902  TBOX box; //BB of word
903 
904  if (word->unlv_crunch_mode == CR_NONE) {
905  delete_mode = 0;
906  return CR_NONE;
907  }
908 
909  if (word_len == 0) {
910  delete_mode = 1;
911  return CR_DELETE;
912  }
913 
914  if (word->rebuild_word != NULL) {
915  // Cube leaves rebuild_word NULL.
916  box = word->rebuild_word->bounding_box();
917  if (box.height () < crunch_del_min_ht * kBlnXHeight) {
918  delete_mode = 4;
919  return CR_DELETE;
920  }
921 
922  if (noise_outlines(word->rebuild_word)) {
923  delete_mode = 5;
924  return CR_DELETE;
925  }
926  }
927 
928  if ((failure_count (word) * 1.5) > word_len) {
929  delete_mode = 2;
930  return CR_LOOSE_SPACE;
931  }
932 
933  if (word->best_choice->certainty () < crunch_del_cert) {
934  delete_mode = 7;
935  return CR_LOOSE_SPACE;
936  }
937 
938  rating_per_ch = word->best_choice->rating () / word_len;
939 
940  if (rating_per_ch > crunch_del_rating) {
941  delete_mode = 8;
942  return CR_LOOSE_SPACE;
943  }
944 
946  delete_mode = 9;
947  return CR_LOOSE_SPACE;
948  }
949 
950  if (box.bottom () >
952  delete_mode = 10;
953  return CR_LOOSE_SPACE;
954  }
955 
956  if (box.height () > crunch_del_max_ht * kBlnXHeight) {
957  delete_mode = 11;
958  return CR_LOOSE_SPACE;
959  }
960 
961  if (box.width () < crunch_del_min_width * kBlnXHeight) {
962  delete_mode = 3;
963  return CR_LOOSE_SPACE;
964  }
965 
966  delete_mode = 0;
967  return CR_NONE;
968 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
const int kBlnXHeight
Definition: normalis.h:28
TWERD * rebuild_word
Definition: pageres.h:244
const int kBlnBaselineOffset
Definition: normalis.h:29
inT32 length() const
Definition: rejctmap.h:235
float certainty() const
Definition: ratngs.h:328
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294
inT16 top() const
Definition: rect.h:54
Definition: rect.h:30
inT16 height() const
Definition: rect.h:104
inT16 width() const
Definition: rect.h:111
BOOL8 noise_outlines(TWERD *word)
Definition: docqual.cpp:982
inT16 bottom() const
Definition: rect.h:61
REJMAP reject_map
Definition: pageres.h:271
inT16 failure_count(WERD_RES *word)
Definition: docqual.cpp:970
TBOX bounding_box() const
Definition: blobs.cpp:879
float rating() const
Definition: ratngs.h:325

◆ word_display()

BOOL8 tesseract::Tesseract::word_display ( PAGE_RES_IT * pr_it)

word_display() Word Processor

Display a word according to its display modes

Definition at line 760 of file pgedit.cpp.

760  {
761  WERD_RES* word_res = pr_it->word();
762  WERD* word = word_res->word;
763  TBOX word_bb; // word bounding box
764  int word_height; // ht of word BB
765  BOOL8 displayed_something = FALSE;
766  float shift; // from bot left
767  C_BLOB_IT c_it; // cblob iterator
768 
769  if (color_mode != CM_RAINBOW && word_res->box_word != NULL) {
770  BoxWord* box_word = word_res->box_word;
771  WERD_CHOICE* best_choice = word_res->best_choice;
772  int length = box_word->length();
773  if (word_res->fontinfo == NULL) return false;
774  const FontInfo& font_info = *word_res->fontinfo;
775  for (int i = 0; i < length; ++i) {
777  switch (color_mode) {
778  case CM_SUBSCRIPT:
779  if (best_choice->BlobPosition(i) == SP_SUBSCRIPT)
780  color = ScrollView::RED;
781  break;
782  case CM_SUPERSCRIPT:
783  if (best_choice->BlobPosition(i) == SP_SUPERSCRIPT)
784  color = ScrollView::RED;
785  break;
786  case CM_ITALIC:
787  if (font_info.is_italic())
788  color = ScrollView::RED;
789  break;
790  case CM_BOLD:
791  if (font_info.is_bold())
792  color = ScrollView::RED;
793  break;
794  case CM_FIXEDPITCH:
795  if (font_info.is_fixed_pitch())
796  color = ScrollView::RED;
797  break;
798  case CM_SERIF:
799  if (font_info.is_serif())
800  color = ScrollView::RED;
801  break;
802  case CM_SMALLCAPS:
803  if (word_res->small_caps)
804  color = ScrollView::RED;
805  break;
806  case CM_DROPCAPS:
807  if (best_choice->BlobPosition(i) == SP_DROPCAP)
808  color = ScrollView::RED;
809  break;
810  // TODO(rays) underline is currently completely unsupported.
811  case CM_UNDERLINE:
812  default:
813  break;
814  }
815  image_win->Pen(color);
816  TBOX box = box_word->BlobBox(i);
817  image_win->Rectangle(box.left(), box.bottom(), box.right(), box.top());
818  }
819  return true;
820  }
821  /*
822  Note the double coercions of(COLOUR)((inT32)editor_image_word_bb_color)
823  etc. are to keep the compiler happy.
824  */
825  // display bounding box
826  if (word->display_flag(DF_BOX)) {
827  word->bounding_box().plot(image_win,
831  editor_image_word_bb_color));
832 
835  image_win->Pen(c);
836  c_it.set_to_list(word->cblob_list());
837  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward())
838  c_it.data()->bounding_box().plot(image_win);
839  displayed_something = TRUE;
840  }
841 
842  // display edge steps
843  if (word->display_flag(DF_EDGE_STEP)) { // edgesteps available
844  word->plot(image_win); // rainbow colors
845  displayed_something = TRUE;
846  }
847 
848  // display poly approx
849  if (word->display_flag(DF_POLYGONAL)) {
850  // need to convert
852  tword->plot(image_win);
853  delete tword;
854  displayed_something = TRUE;
855  }
856 
857  // Display correct text and blamer information.
858  STRING text;
859  STRING blame;
860  if (word->display_flag(DF_TEXT) && word->text() != NULL) {
861  text = word->text();
862  }
863  if (word->display_flag(DF_BLAMER) &&
864  !(word_res->blamer_bundle != NULL &&
866  text = "";
867  const BlamerBundle *blamer_bundle = word_res->blamer_bundle;
868  if (blamer_bundle == NULL) {
869  text += "NULL";
870  } else {
871  text = blamer_bundle->TruthString();
872  }
873  text += " -> ";
874  STRING best_choice_str;
875  if (word_res->best_choice == NULL) {
876  best_choice_str = "NULL";
877  } else {
878  word_res->best_choice->string_and_lengths(&best_choice_str, NULL);
879  }
880  text += best_choice_str;
881  IncorrectResultReason reason = (blamer_bundle == NULL) ?
882  IRR_PAGE_LAYOUT : blamer_bundle->incorrect_result_reason();
883  ASSERT_HOST(reason < IRR_NUM_REASONS)
884  blame += " [";
885  blame += BlamerBundle::IncorrectReasonName(reason);
886  blame += "]";
887  }
888  if (text.length() > 0) {
889  word_bb = word->bounding_box();
891  word_height = word_bb.height();
892  int text_height = 0.50 * word_height;
893  if (text_height > 20) text_height = 20;
894  image_win->TextAttributes("Arial", text_height, false, false, false);
895  shift = (word_height < word_bb.width()) ? 0.25 * word_height : 0.0f;
896  image_win->Text(word_bb.left() + shift,
897  word_bb.bottom() + 0.25 * word_height, text.string());
898  if (blame.length() > 0) {
899  image_win->Text(word_bb.left() + shift,
900  word_bb.bottom() + 0.25 * word_height - text_height,
901  blame.string());
902  }
903 
904  displayed_something = TRUE;
905  }
906 
907  if (!displayed_something) // display BBox anyway
908  word->bounding_box().plot(image_win,
909  (ScrollView::Color)((inT32) editor_image_word_bb_color),
911  editor_image_word_bb_color));
912  return TRUE;
913 }
BOOL8 display_flag(uinT8 flag) const
Definition: werd.h:131
void plot(ScrollView *fd) const
Definition: rect.h:278
#define TRUE
Definition: capi.h:45
int32_t inT32
Definition: host.h:38
void plot(ScrollView *window)
Definition: blobs.cpp:916
WERD_CHOICE * best_choice
Definition: pageres.h:219
int length() const
Definition: ratngs.h:301
BlamerBundle * blamer_bundle
Definition: pageres.h:230
bool is_bold() const
Definition: fontinfo.h:112
Definition: werd.h:55
int editor_image_word_bb_color
Definition: pgedit.cpp:136
const char * string() const
Definition: strngs.cpp:198
Definition: werd.h:50
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:320
inT32 length() const
Definition: strngs.cpp:193
ScrollView * image_win
Definition: pgedit.cpp:107
TBOX bounding_box() const
Definition: werd.cpp:160
tesseract::BoxWord * box_word
Definition: pageres.h:250
#define ASSERT_HOST(x)
Definition: errcode.h:84
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:427
inT16 left() const
Definition: rect.h:68
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:106
Definition: blobs.h:395
void Rectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:606
unsigned char BOOL8
Definition: host.h:44
Definition: strngs.h:45
#define FALSE
Definition: capi.h:46
const FontInfo * fontinfo
Definition: pageres.h:288
const char * text() const
Definition: werd.h:125
inT16 top() const
Definition: rect.h:54
int editor_image_blob_bb_color
Definition: pgedit.cpp:138
Definition: rect.h:30
C_BLOB_LIST * cblob_list()
Definition: werd.h:100
IncorrectResultReason
Definition: blamer.h:37
inT16 height() const
Definition: rect.h:104
bool small_caps
Definition: pageres.h:283
WERD * word
Definition: pageres.h:175
inT16 right() const
Definition: rect.h:75
inT16 width() const
Definition: rect.h:111
void Text(int x, int y, const char *mystring)
Definition: scrollview.cpp:658
bool is_italic() const
Definition: fontinfo.h:111
WERD_RES * word() const
Definition: pageres.h:736
inT16 bottom() const
Definition: rect.h:61
bool is_fixed_pitch() const
Definition: fontinfo.h:113
static const char * IncorrectReasonName(IncorrectResultReason irr)
Definition: blamer.cpp:56
Definition: werd.h:60
Definition: werd.h:51
static TWERD * PolygonalCopy(bool allow_detailed_fx, WERD *src)
Definition: blobs.cpp:793
STRING TruthString() const
Definition: blamer.h:100
void Pen(Color color)
Definition: scrollview.cpp:726
void plot(ScrollView *window, ScrollView::Color colour)
Definition: werd.cpp:297
bool is_serif() const
Definition: fontinfo.h:114
void TextAttributes(const char *font, int pixel_size, bool bold, bool italic, bool underlined)
Definition: scrollview.cpp:641

◆ word_dumper()

BOOL8 tesseract::Tesseract::word_dumper ( PAGE_RES_IT * pr_it)

word_dumper()

Dump members to the debug window

Definition at line 921 of file pgedit.cpp.

921  {
922  if (pr_it->block()->block != NULL) {
923  tprintf("\nBlock data...\n");
924  pr_it->block()->block->print(NULL, FALSE);
925  }
926  tprintf("\nRow data...\n");
927  pr_it->row()->row->print(NULL);
928  tprintf("\nWord data...\n");
929  WERD_RES* word_res = pr_it->word();
930  word_res->word->print();
931  if (word_res->blamer_bundle != NULL && wordrec_debug_blamer &&
933  tprintf("Current blamer debug: %s\n",
934  word_res->blamer_bundle->debug().string());
935  }
936  return TRUE;
937 }
#define TRUE
Definition: capi.h:45
BlamerBundle * blamer_bundle
Definition: pageres.h:230
void print()
Definition: werd.cpp:266
#define tprintf(...)
Definition: tprintf.h:31
BLOCK * block
Definition: pageres.h:99
const char * string() const
Definition: strngs.cpp:198
ROW * row
Definition: pageres.h:127
void print(FILE *fp)
Definition: ocrrow.cpp:167
ROW_RES * row() const
Definition: pageres.h:739
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:106
const STRING & debug() const
Definition: blamer.h:116
#define FALSE
Definition: capi.h:46
void print(FILE *fp, BOOL8 dump)
dump whole table
Definition: ocrblock.cpp:197
WERD * word
Definition: pageres.h:175
WERD_RES * word() const
Definition: pageres.h:736
bool wordrec_debug_blamer
Definition: wordrec.h:167
BLOCK_RES * block() const
Definition: pageres.h:742

◆ word_outline_errs()

inT16 tesseract::Tesseract::word_outline_errs ( WERD_RES * word)

Definition at line 77 of file docqual.cpp.

77  {
78  inT16 i = 0;
79  inT16 err_count = 0;
80 
81  if (word->rebuild_word != NULL) {
82  for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
83  TBLOB* blob = word->rebuild_word->blobs[b];
84  err_count += count_outline_errs(word->best_choice->unichar_string()[i],
85  blob->NumOutlines());
86  i++;
87  }
88  }
89  return err_count;
90 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
TWERD * rebuild_word
Definition: pageres.h:244
int NumOutlines() const
Definition: blobs.cpp:469
int16_t inT16
Definition: host.h:36
int NumBlobs() const
Definition: blobs.h:425
const STRING & unichar_string() const
Definition: ratngs.h:539
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
Definition: blobs.h:261
inT16 count_outline_errs(char c, inT16 outline_count)
Definition: docqual.cpp:131

◆ word_set_display()

BOOL8 tesseract::Tesseract::word_set_display ( PAGE_RES_IT * pr_it)

word_set_display() Word processor

Display word according to current display mode settings

Definition at line 945 of file pgedit.cpp.

945  {
946  WERD* word = pr_it->word()->word;
954  return word_display(pr_it);
955 }
void set_display_flag(uinT8 flag, BOOL8 value)
Definition: werd.h:132
Definition: werd.h:55
Definition: werd.h:50
WERD * word
Definition: pageres.h:175
BOOL8 word_display(PAGE_RES_IT *pr_it)
Definition: pgedit.cpp:760
WERD_RES * word() const
Definition: pageres.h:736
BOOL8 bit(uinT8 bit_num) const
Definition: bits16.h:56
Definition: werd.h:60
Definition: werd.h:51
BITS16 word_display_mode
Definition: pgedit.cpp:122

◆ worst_noise_blob()

inT16 tesseract::Tesseract::worst_noise_blob ( WERD_RES * word_res,
float *  worst_noise_score 
)

Definition at line 680 of file fixspace.cpp.

681  {
682  float noise_score[512];
683  int i;
684  int min_noise_blob; // 1st contender
685  int max_noise_blob; // last contender
686  int non_noise_count;
687  int worst_noise_blob; // Worst blob
688  float small_limit = kBlnXHeight * fixsp_small_outlines_size;
689  float non_noise_limit = kBlnXHeight * 0.8;
690 
691  if (word_res->rebuild_word == NULL)
692  return -1; // Can't handle cube words.
693 
694  // Normalised.
695  int blob_count = word_res->box_word->length();
696  ASSERT_HOST(blob_count <= 512);
697  if (blob_count < 5)
698  return -1; // too short to split
699 
700  /* Get the noise scores for all blobs */
701 
702  #ifndef SECURE_NAMES
703  if (debug_fix_space_level > 5)
704  tprintf("FP fixspace Noise metrics for \"%s\": ",
705  word_res->best_choice->unichar_string().string());
706  #endif
707 
708  for (i = 0; i < blob_count && i < word_res->rebuild_word->NumBlobs(); i++) {
709  TBLOB* blob = word_res->rebuild_word->blobs[i];
710  if (word_res->reject_map[i].accepted())
711  noise_score[i] = non_noise_limit;
712  else
713  noise_score[i] = blob_noise_score(blob);
714 
715  if (debug_fix_space_level > 5)
716  tprintf("%1.1f ", noise_score[i]);
717  }
718  if (debug_fix_space_level > 5)
719  tprintf("\n");
720 
721  /* Now find the worst one which is far enough away from the end of the word */
722 
723  non_noise_count = 0;
724  for (i = 0; i < blob_count && non_noise_count < fixsp_non_noise_limit; i++) {
725  if (noise_score[i] >= non_noise_limit) {
726  non_noise_count++;
727  }
728  }
729  if (non_noise_count < fixsp_non_noise_limit)
730  return -1;
731 
732  min_noise_blob = i;
733 
734  non_noise_count = 0;
735  for (i = blob_count - 1; i >= 0 && non_noise_count < fixsp_non_noise_limit;
736  i--) {
737  if (noise_score[i] >= non_noise_limit) {
738  non_noise_count++;
739  }
740  }
741  if (non_noise_count < fixsp_non_noise_limit)
742  return -1;
743 
744  max_noise_blob = i;
745 
746  if (min_noise_blob > max_noise_blob)
747  return -1;
748 
749  *worst_noise_score = small_limit;
750  worst_noise_blob = -1;
751  for (i = min_noise_blob; i <= max_noise_blob; i++) {
752  if (noise_score[i] < *worst_noise_score) {
753  worst_noise_blob = i;
754  *worst_noise_score = noise_score[i];
755  }
756  }
757  return worst_noise_blob;
758 }
inT16 worst_noise_blob(WERD_RES *word_res, float *worst_noise_score)
Definition: fixspace.cpp:680
WERD_CHOICE * best_choice
Definition: pageres.h:219
const int kBlnXHeight
Definition: normalis.h:28
TWERD * rebuild_word
Definition: pageres.h:244
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
tesseract::BoxWord * box_word
Definition: pageres.h:250
#define ASSERT_HOST(x)
Definition: errcode.h:84
int NumBlobs() const
Definition: blobs.h:425
float blob_noise_score(TBLOB *blob)
Definition: fixspace.cpp:760
const STRING & unichar_string() const
Definition: ratngs.h:539
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
Definition: blobs.h:261
REJMAP reject_map
Definition: pageres.h:271
int length() const
Definition: boxword.h:85

◆ write_results()

void tesseract::Tesseract::write_results ( PAGE_RES_IT page_res_it,
char  newline_type,
BOOL8  force_eol 
)

Definition at line 130 of file output.cpp.

132  { // override tilde crunch?
133  WERD_RES *word = page_res_it.word();
134  const UNICHARSET &uchset = *word->uch_set;
135  int i;
136  BOOL8 need_reject = FALSE;
137  UNICHAR_ID space = uchset.unichar_to_id(" ");
138 
139  if ((word->unlv_crunch_mode != CR_NONE ||
140  word->best_choice->length() == 0) &&
142  if ((word->unlv_crunch_mode != CR_DELETE) &&
143  (!stats_.tilde_crunch_written ||
144  ((word->unlv_crunch_mode == CR_KEEP_SPACE) &&
145  (word->word->space () > 0) &&
146  !word->word->flag (W_FUZZY_NON) &&
147  !word->word->flag (W_FUZZY_SP)))) {
148  if (!word->word->flag (W_BOL) &&
149  (word->word->space () > 0) &&
150  !word->word->flag (W_FUZZY_NON) &&
151  !word->word->flag (W_FUZZY_SP)) {
152  stats_.last_char_was_tilde = false;
153  }
154  need_reject = TRUE;
155  }
156  if ((need_reject && !stats_.last_char_was_tilde) ||
157  (force_eol && stats_.write_results_empty_block)) {
158  /* Write a reject char - mark as rejected unless zero_rejection mode */
159  stats_.last_char_was_tilde = TRUE;
160  stats_.tilde_crunch_written = true;
161  stats_.last_char_was_newline = false;
162  stats_.write_results_empty_block = false;
163  }
164 
165  if ((word->word->flag (W_EOL) && !stats_.last_char_was_newline) || force_eol) {
166  stats_.tilde_crunch_written = false;
167  stats_.last_char_was_newline = true;
168  stats_.last_char_was_tilde = false;
169  }
170 
171  if (force_eol)
172  stats_.write_results_empty_block = true;
173  return;
174  }
175 
176  /* NORMAL PROCESSING of non tilde crunched words */
177 
178  stats_.tilde_crunch_written = false;
179  if (newline_type)
180  stats_.last_char_was_newline = true;
181  else
182  stats_.last_char_was_newline = false;
183  stats_.write_results_empty_block = force_eol; // about to write a real word
184 
185  if (unlv_tilde_crunching &&
186  stats_.last_char_was_tilde &&
187  (word->word->space() == 0) &&
189  (word->best_choice->unichar_id(0) == space)) {
190  /* Prevent adjacent tilde across words - we know that adjacent tildes within
191  words have been removed */
192  word->MergeAdjacentBlobs(0);
193  }
194  if (newline_type ||
196  stats_.last_char_was_tilde = false;
197  else {
198  if (word->reject_map.length () > 0) {
199  if (word->best_choice->unichar_id(word->reject_map.length() - 1) == space)
200  stats_.last_char_was_tilde = true;
201  else
202  stats_.last_char_was_tilde = false;
203  }
204  else if (word->word->space () > 0)
205  stats_.last_char_was_tilde = false;
206  /* else it is unchanged as there are no output chars */
207  }
208 
209  ASSERT_HOST (word->best_choice->length() == word->reject_map.length());
210 
211  set_unlv_suspects(word);
212  check_debug_pt (word, 120);
214  tprintf ("Dict word: \"%s\": %d\n",
215  word->best_choice->debug_string().string(),
216  dict_word(*(word->best_choice)));
217  }
218  if (!word->word->flag(W_REP_CHAR) || !tessedit_write_rep_codes) {
220  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
221  for (i = 0; i < word->best_choice->length(); ++i) {
222  if (word->reject_map[i].rejected())
223  word->reject_map[i].setrej_minimal_rej_accept();
224  }
225  }
227  /* OVERRIDE ALL REJECTION MECHANISMS - ONLY REJECT TESS FAILURES */
228  for (i = 0; i < word->best_choice->length(); ++i) {
229  if ((word->best_choice->unichar_id(i) != space) &&
230  word->reject_map[i].rejected())
231  word->reject_map[i].setrej_minimal_rej_accept();
232  }
233  }
234  }
235 }
#define TRUE
Definition: capi.h:45
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:313
int UNICHAR_ID
Definition: unichar.h:33
WERD_CHOICE * best_choice
Definition: pageres.h:219
int length() const
Definition: ratngs.h:301
Definition: werd.h:36
const STRING debug_string() const
Definition: ratngs.h:503
BOOL8 check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1794
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
void set_unlv_suspects(WERD_RES *word)
Definition: output.cpp:305
void MergeAdjacentBlobs(int index)
Definition: pageres.cpp:969
#define ASSERT_HOST(x)
Definition: errcode.h:84
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
inT32 length() const
Definition: rejctmap.h:235
unsigned char BOOL8
Definition: host.h:44
#define FALSE
Definition: capi.h:46
Definition: werd.h:35
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:294
WERD * word
Definition: pageres.h:175
int dict_word(const WERD_CHOICE &word)
Definition: tface.cpp:128
WERD_RES * word() const
Definition: pageres.h:736
const UNICHARSET * uch_set
Definition: pageres.h:192
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194
REJMAP reject_map
Definition: pageres.h:271
uinT8 space()
Definition: werd.h:104

Member Data Documentation

◆ applybox_debug

int tesseract::Tesseract::applybox_debug = 1

"Debug level"

Definition at line 830 of file tesseractclass.h.

◆ applybox_exposure_pattern

char* tesseract::Tesseract::applybox_exposure_pattern = ".exp"

"Exposure value follows this pattern in the image" " filename. The names of the image files are expected" " to be in the form [lang].[fontname].exp[num].tif"

Definition at line 835 of file tesseractclass.h.

◆ applybox_learn_chars_and_char_frags_mode

bool tesseract::Tesseract::applybox_learn_chars_and_char_frags_mode = false

"Learn both character fragments (as is done in the" " special low exposure mode) as well as unfragmented" " characters."

Definition at line 839 of file tesseractclass.h.

◆ applybox_learn_ngrams_mode

bool tesseract::Tesseract::applybox_learn_ngrams_mode = false

"Each bounding box is assumed to contain ngrams. Only" " learn the ngrams whose outlines overlap horizontally."

Definition at line 842 of file tesseractclass.h.

◆ applybox_page

int tesseract::Tesseract::applybox_page = 0

"Page number to apply boxes from"

Definition at line 831 of file tesseractclass.h.

◆ bestrate_pruning_factor

double tesseract::Tesseract::bestrate_pruning_factor = 2.0

"Multiplying factor of" " current best rate to prune other hypotheses"

Definition at line 1117 of file tesseractclass.h.

◆ bidi_debug

int tesseract::Tesseract::bidi_debug = 0

"Debug level for BiDi"

Definition at line 829 of file tesseractclass.h.

◆ bland_unrej

bool tesseract::Tesseract::bland_unrej = false

"unrej potential with no checks"

Definition at line 943 of file tesseractclass.h.

◆ chs_leading_punct

char* tesseract::Tesseract::chs_leading_punct = "('`\""

"Leading punctuation"

Definition at line 882 of file tesseractclass.h.

◆ chs_trailing_punct1

char* tesseract::Tesseract::chs_trailing_punct1 = ").,;:?!"

"1st Trailing punctuation"

Definition at line 883 of file tesseractclass.h.

◆ chs_trailing_punct2

char* tesseract::Tesseract::chs_trailing_punct2 = ")'`\""

"2nd Trailing punctuation"

Definition at line 884 of file tesseractclass.h.

◆ conflict_set_I_l_1

char* tesseract::Tesseract::conflict_set_I_l_1 = "Il1[]"

"Il1 conflict set"

Definition at line 1059 of file tesseractclass.h.

◆ crunch_accept_ok

bool tesseract::Tesseract::crunch_accept_ok = true

"Use acceptability in okstring"

Definition at line 972 of file tesseractclass.h.

◆ crunch_debug

int tesseract::Tesseract::crunch_debug = 0

"As it says"

Definition at line 981 of file tesseractclass.h.

◆ crunch_del_cert

double tesseract::Tesseract::crunch_del_cert = -10.0

"POTENTIAL crunch cert lt this"

Definition at line 961 of file tesseractclass.h.

◆ crunch_del_high_word

double tesseract::Tesseract::crunch_del_high_word = 1.5

"Del if word gt xht x this above bl"

Definition at line 966 of file tesseractclass.h.

◆ crunch_del_low_word

double tesseract::Tesseract::crunch_del_low_word = 0.5

"Del if word gt xht x this below bl"

Definition at line 967 of file tesseractclass.h.

◆ crunch_del_max_ht

double tesseract::Tesseract::crunch_del_max_ht = 3.0

"Del if word ht gt xht x this"

Definition at line 963 of file tesseractclass.h.

◆ crunch_del_min_ht

double tesseract::Tesseract::crunch_del_min_ht = 0.7

"Del if word ht lt xht x this"

Definition at line 962 of file tesseractclass.h.

◆ crunch_del_min_width

double tesseract::Tesseract::crunch_del_min_width = 3.0

"Del if word width lt xht x this"

Definition at line 964 of file tesseractclass.h.

◆ crunch_del_rating

double tesseract::Tesseract::crunch_del_rating = 60

"POTENTIAL crunch rating lt this"

Definition at line 960 of file tesseractclass.h.

◆ crunch_early_convert_bad_unlv_chs

bool tesseract::Tesseract::crunch_early_convert_bad_unlv_chs = false

"Take out ~^ early?"

Definition at line 951 of file tesseractclass.h.

◆ crunch_early_merge_tess_fails

bool tesseract::Tesseract::crunch_early_merge_tess_fails = true

"Before word crunch?"

Definition at line 950 of file tesseractclass.h.

◆ crunch_include_numerals

bool tesseract::Tesseract::crunch_include_numerals = false

"Fiddle alpha figures"

Definition at line 975 of file tesseractclass.h.

◆ crunch_leave_accept_strings

bool tesseract::Tesseract::crunch_leave_accept_strings = false

"Don't pot crunch sensible strings"

Definition at line 974 of file tesseractclass.h.

◆ crunch_leave_lc_strings

int tesseract::Tesseract::crunch_leave_lc_strings = 4

"Don't crunch words with long lower case strings"

Definition at line 977 of file tesseractclass.h.

◆ crunch_leave_ok_strings

bool tesseract::Tesseract::crunch_leave_ok_strings = true

"Don't touch sensible strings"

Definition at line 971 of file tesseractclass.h.

◆ crunch_leave_uc_strings

int tesseract::Tesseract::crunch_leave_uc_strings = 4

"Don't crunch words with long upper case strings"

Definition at line 979 of file tesseractclass.h.

◆ crunch_long_repetitions

int tesseract::Tesseract::crunch_long_repetitions = 3

"Crunch words with long repetitions"

Definition at line 980 of file tesseractclass.h.

◆ crunch_poor_garbage_cert

double tesseract::Tesseract::crunch_poor_garbage_cert = -9.0

"crunch garbage cert lt this"

Definition at line 955 of file tesseractclass.h.

◆ crunch_poor_garbage_rate

double tesseract::Tesseract::crunch_poor_garbage_rate = 60

"crunch garbage rating lt this"

Definition at line 956 of file tesseractclass.h.

◆ crunch_pot_garbage

bool tesseract::Tesseract::crunch_pot_garbage = true

"POTENTIAL crunch garbage"

Definition at line 959 of file tesseractclass.h.

◆ crunch_pot_indicators

int tesseract::Tesseract::crunch_pot_indicators = 1

"How many potential indicators needed"

Definition at line 970 of file tesseractclass.h.

◆ crunch_pot_poor_cert

double tesseract::Tesseract::crunch_pot_poor_cert = -8.0

"POTENTIAL crunch cert lt this"

Definition at line 958 of file tesseractclass.h.

◆ crunch_pot_poor_rate

double tesseract::Tesseract::crunch_pot_poor_rate = 40

"POTENTIAL crunch rating lt this"

Definition at line 957 of file tesseractclass.h.

◆ crunch_rating_max

int tesseract::Tesseract::crunch_rating_max = 10

"For adj length in rating per ch"

Definition at line 969 of file tesseractclass.h.

◆ crunch_small_outlines_size

double tesseract::Tesseract::crunch_small_outlines_size = 0.6

"Small if lt xht x this"

Definition at line 968 of file tesseractclass.h.

◆ crunch_terrible_garbage

bool tesseract::Tesseract::crunch_terrible_garbage = true

"As it says"

Definition at line 953 of file tesseractclass.h.

◆ crunch_terrible_rating

double tesseract::Tesseract::crunch_terrible_rating = 80.0

"crunch rating lt this"

Definition at line 952 of file tesseractclass.h.

◆ debug_acceptable_wds

bool tesseract::Tesseract::debug_acceptable_wds = false

"Dump word pass/fail chk"

Definition at line 881 of file tesseractclass.h.

◆ debug_fix_space_level

int tesseract::Tesseract::debug_fix_space_level = 0

"Contextual fixspace debug"

Definition at line 987 of file tesseractclass.h.

◆ debug_noise_removal

int tesseract::Tesseract::debug_noise_removal = 0

"Debug reassignment of small outlines"

Definition at line 865 of file tesseractclass.h.

◆ debug_x_ht_level

int tesseract::Tesseract::debug_x_ht_level = 0

"Reestimate debug"

Definition at line 880 of file tesseractclass.h.

◆ docqual_excuse_outline_errs

bool tesseract::Tesseract::docqual_excuse_outline_errs = false

"Allow outline errs in unrejection?"

Definition at line 911 of file tesseractclass.h.

◆ enable_new_segsearch

bool tesseract::Tesseract::enable_new_segsearch = false

"Enable new segmentation search path."

Definition at line 1157 of file tesseractclass.h.

◆ enable_noise_removal

bool tesseract::Tesseract::enable_noise_removal = true

"Remove and conditionally reassign small outlines when they" " confuse layout analysis, determining diacritics vs noise"

Definition at line 864 of file tesseractclass.h.

◆ file_type

char* tesseract::Tesseract::file_type = ".tif"

"Filename extension"

Definition at line 1066 of file tesseractclass.h.

◆ fixsp_done_mode

int tesseract::Tesseract::fixsp_done_mode = 1

"What constitutes done for spacing"

Definition at line 986 of file tesseractclass.h.

◆ fixsp_non_noise_limit

int tesseract::Tesseract::fixsp_non_noise_limit = 1

"How many non-noise blbs either side?"

Definition at line 983 of file tesseractclass.h.

◆ fixsp_small_outlines_size

double tesseract::Tesseract::fixsp_small_outlines_size = 0.28

"Small if lt xht x this"

Definition at line 984 of file tesseractclass.h.

◆ heuristic_max_char_wh_ratio

double tesseract::Tesseract::heuristic_max_char_wh_ratio = 2.0

"max char width-to-height ratio allowed in segmentation"

Definition at line 1155 of file tesseractclass.h.

◆ heuristic_segcost_rating_base

double tesseract::Tesseract::heuristic_segcost_rating_base = 1.25

"base factor for adding segmentation cost into word rating." " It's a multiplying factor, the larger the value above 1, " "the bigger the effect of segmentation cost."

Definition at line 1146 of file tesseractclass.h.

◆ heuristic_weight_rating

double tesseract::Tesseract::heuristic_weight_rating = 1

"weight associated with char rating in combined cost of state"

Definition at line 1148 of file tesseractclass.h.

◆ heuristic_weight_seamcut

double tesseract::Tesseract::heuristic_weight_seamcut = 0

"weight associated with seam cut in combined cost of state"

Definition at line 1153 of file tesseractclass.h.

◆ heuristic_weight_width

double tesseract::Tesseract::heuristic_weight_width = 1000.0

"weight associated with width evidence in combined cost of" " state"

Definition at line 1151 of file tesseractclass.h.

◆ hocr_font_info

bool tesseract::Tesseract::hocr_font_info = false

"Add font info to hocr output"

Definition at line 949 of file tesseractclass.h.

◆ include_page_breaks

bool tesseract::Tesseract::include_page_breaks = false

"Include page separator string in output text after each " "image/page."

Definition at line 1097 of file tesseractclass.h.

◆ interactive_display_mode

bool tesseract::Tesseract::interactive_display_mode = false

"Run interactively?"

Definition at line 1065 of file tesseractclass.h.

◆ language_model_fixed_length_choices_depth

int tesseract::Tesseract::language_model_fixed_length_choices_depth = 3

"Depth of blob choice lists to explore" " when fixed length dawgs are on"

Definition at line 1140 of file tesseractclass.h.

◆ load_fixed_length_dawgs

bool tesseract::Tesseract::load_fixed_length_dawgs = true

"Load fixed length" " dawgs (e.g. for non-space delimited languages)"

Definition at line 1113 of file tesseractclass.h.

◆ lstm_use_matrix

bool tesseract::Tesseract::lstm_use_matrix = 1

"Use ratings matrix/beam search with lstm"

Definition at line 907 of file tesseractclass.h.

◆ min_orientation_margin

double tesseract::Tesseract::min_orientation_margin = 7.0

"Min acceptable orientation margin"

Definition at line 1075 of file tesseractclass.h.

◆ min_sane_x_ht_pixels

int tesseract::Tesseract::min_sane_x_ht_pixels = 8

"Reject any x-ht lt or eq than this"

Definition at line 1060 of file tesseractclass.h.

◆ multilang_debug_level

int tesseract::Tesseract::multilang_debug_level = 0

"Print multilang debug info."

Definition at line 902 of file tesseractclass.h.

◆ ngram_permuter_activated

bool tesseract::Tesseract::ngram_permuter_activated = false

"Activate character-level n-gram-based permuter"

Definition at line 1136 of file tesseractclass.h.

◆ noise_cert_basechar

double tesseract::Tesseract::noise_cert_basechar = -8.0

"Hingepoint for base char certainty"

Definition at line 868 of file tesseractclass.h.

◆ noise_cert_disjoint

double tesseract::Tesseract::noise_cert_disjoint = -2.5

"Hingepoint for disjoint certainty"

Definition at line 871 of file tesseractclass.h.

◆ noise_cert_factor

double tesseract::Tesseract::noise_cert_factor = 0.375

"Scaling on certainty diff from Hingepoint"

Definition at line 877 of file tesseractclass.h.

◆ noise_cert_punc

double tesseract::Tesseract::noise_cert_punc = -2.5

"Threshold for new punc char certainty"

Definition at line 874 of file tesseractclass.h.

◆ noise_maxperblob

int tesseract::Tesseract::noise_maxperblob = 8

"Max diacritics to apply to a blob"

Definition at line 878 of file tesseractclass.h.

◆ noise_maxperword

int tesseract::Tesseract::noise_maxperword = 16

"Max diacritics to apply to a word"

Definition at line 879 of file tesseractclass.h.

◆ numeric_punctuation

char* tesseract::Tesseract::numeric_punctuation = ".,"

"Punct. chs expected WITHIN numbers"

Definition at line 989 of file tesseractclass.h.

◆ ocr_devanagari_split_strategy

int tesseract::Tesseract::ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

"Whether to use the top-line splitting process for Devanagari " "documents while performing ocr."

Definition at line 824 of file tesseractclass.h.

◆ ok_repeated_ch_non_alphanum_wds

char* tesseract::Tesseract::ok_repeated_ch_non_alphanum_wds = "-?*\075"

"Allow NN to unrej"

Definition at line 1058 of file tesseractclass.h.

◆ outlines_2

char* tesseract::Tesseract::outlines_2 = "ij!?%\":;"

"Non standard number of outlines"

Definition at line 909 of file tesseractclass.h.

◆ outlines_odd

char* tesseract::Tesseract::outlines_odd = "%| "

"Non standard number of outlines"

Definition at line 908 of file tesseractclass.h.

◆ page_separator

char* tesseract::Tesseract::page_separator = "\f"

"Page separator (default is form feed control character)"

Definition at line 1099 of file tesseractclass.h.

◆ pageseg_devanagari_split_strategy

int tesseract::Tesseract::pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT

"Whether to use the top-line splitting process for Devanagari " "documents while performing page-segmentation."

Definition at line 820 of file tesseractclass.h.

◆ paragraph_debug_level

int tesseract::Tesseract::paragraph_debug_level = 0

"Print paragraph debug info."

Definition at line 903 of file tesseractclass.h.

◆ paragraph_text_based

bool tesseract::Tesseract::paragraph_text_based = true

"Run paragraph detection on the post-text-recognition " "(more accurate)"

Definition at line 906 of file tesseractclass.h.

◆ permute_chartype_word

bool tesseract::Tesseract::permute_chartype_word = 0

"Turn on character type (property) consistency permuter"

Definition at line 1129 of file tesseractclass.h.

◆ permute_debug

bool tesseract::Tesseract::permute_debug = 0

"char permutation debug"

Definition at line 1115 of file tesseractclass.h.

◆ permute_fixed_length_dawg

bool tesseract::Tesseract::permute_fixed_length_dawg = 0

"Turn on fixed-length phrasebook search permuter"

Definition at line 1127 of file tesseractclass.h.

◆ permute_only_top

bool tesseract::Tesseract::permute_only_top = false

"Run only the top choice permuter"

Definition at line 1137 of file tesseractclass.h.

◆ permute_script_word

bool tesseract::Tesseract::permute_script_word = 0

"Turn on word script consistency permuter"

Definition at line 1119 of file tesseractclass.h.

◆ poly_allow_detailed_fx

bool tesseract::Tesseract::poly_allow_detailed_fx = false

"Allow feature extractors to see the original outline"

Definition at line 1079 of file tesseractclass.h.

◆ preserve_interword_spaces

bool tesseract::Tesseract::preserve_interword_spaces = false

"Preserve multiple interword spaces"

Definition at line 1094 of file tesseractclass.h.

◆ quality_blob_pc

double tesseract::Tesseract::quality_blob_pc = 0.0

"good_quality_doc gte good blobs limit"

Definition at line 886 of file tesseractclass.h.

◆ quality_char_pc

double tesseract::Tesseract::quality_char_pc = 0.95

"good_quality_doc gte good char limit"

Definition at line 889 of file tesseractclass.h.

◆ quality_min_initial_alphas_reqd

int tesseract::Tesseract::quality_min_initial_alphas_reqd = 2

"alphas in a good word"

Definition at line 890 of file tesseractclass.h.

◆ quality_outline_pc

double tesseract::Tesseract::quality_outline_pc = 1.0

"good_quality_doc lte outline error limit"

Definition at line 888 of file tesseractclass.h.

◆ quality_rej_pc

double tesseract::Tesseract::quality_rej_pc = 0.08

"good_quality_doc lte rejection limit"

Definition at line 885 of file tesseractclass.h.

◆ quality_rowrej_pc

double tesseract::Tesseract::quality_rowrej_pc = 1.1

"good_quality_doc gte good char limit"

Definition at line 945 of file tesseractclass.h.

◆ rej_1Il_trust_permuter_type

bool tesseract::Tesseract::rej_1Il_trust_permuter_type = true

"Don't double check"

Definition at line 1049 of file tesseractclass.h.

◆ rej_1Il_use_dict_word

bool tesseract::Tesseract::rej_1Il_use_dict_word = false

"Use dictword test"

Definition at line 1048 of file tesseractclass.h.

◆ rej_alphas_in_number_perm

bool tesseract::Tesseract::rej_alphas_in_number_perm = false

"Extend permuter check"

Definition at line 1054 of file tesseractclass.h.

◆ rej_trust_doc_dawg

bool tesseract::Tesseract::rej_trust_doc_dawg = false

"Use DOC dawg in 1Il conf. detector"

Definition at line 1047 of file tesseractclass.h.

◆ rej_use_good_perm

bool tesseract::Tesseract::rej_use_good_perm = true

"Individual rejection control"

Definition at line 1052 of file tesseractclass.h.

◆ rej_use_sensible_wd

bool tesseract::Tesseract::rej_use_sensible_wd = false

"Extend permuter check"

Definition at line 1053 of file tesseractclass.h.

◆ rej_use_tess_accepted

bool tesseract::Tesseract::rej_use_tess_accepted = true

"Individual rejection control"

Definition at line 1050 of file tesseractclass.h.

◆ rej_use_tess_blanks

bool tesseract::Tesseract::rej_use_tess_blanks = true

"Individual rejection control"

Definition at line 1051 of file tesseractclass.h.

◆ rej_whole_of_mostly_reject_word_fract

double tesseract::Tesseract::rej_whole_of_mostly_reject_word_fract = 0.85

"if >this fract"

Definition at line 1055 of file tesseractclass.h.

◆ segment_debug

int tesseract::Tesseract::segment_debug = 0

"Debug the whole segmentation process"

Definition at line 1114 of file tesseractclass.h.

◆ segment_reward_chartype

double tesseract::Tesseract::segment_reward_chartype = 0.97

"Score multiplier for char type consistency within a word. "

Definition at line 1131 of file tesseractclass.h.

◆ segment_reward_ngram_best_choice

double tesseract::Tesseract::segment_reward_ngram_best_choice = 0.99

"Score multiplier for ngram permuter's best choice" " (only used in the Han script path)."

Definition at line 1134 of file tesseractclass.h.

◆ segment_reward_script

double tesseract::Tesseract::segment_reward_script = 0.95

"Score multiplier for script consistency within a word. " "Being a 'reward' factor, it should be <= 1. " "Smaller value implies bigger reward."

Definition at line 1125 of file tesseractclass.h.

◆ segment_segcost_rating

bool tesseract::Tesseract::segment_segcost_rating = 0

"incorporate segmentation cost in word rating?"

Definition at line 1121 of file tesseractclass.h.

◆ segsearch_max_fixed_pitch_char_wh_ratio

double tesseract::Tesseract::segsearch_max_fixed_pitch_char_wh_ratio = 2.0

"Maximum character width-to-height ratio for" " fixed pitch fonts"

Definition at line 1160 of file tesseractclass.h.

◆ subscript_max_y_top

double tesseract::Tesseract::subscript_max_y_top = 0.5

"Maximum top of a character measured as a multiple of x-height " "above the baseline for us to reconsider whether it's a " "subscript."

Definition at line 1008 of file tesseractclass.h.

◆ superscript_bettered_certainty

double tesseract::Tesseract::superscript_bettered_certainty = 0.97

"What reduction in " "badness do we think sufficient to choose a superscript over " "what we'd thought. For example, a value of 0.6 means we want " "to reduce badness of certainty by 40%"

Definition at line 1000 of file tesseractclass.h.

◆ superscript_debug

int tesseract::Tesseract::superscript_debug = 0

"Debug level for sub & superscript fixer"

Definition at line 993 of file tesseractclass.h.

◆ superscript_min_y_bottom

double tesseract::Tesseract::superscript_min_y_bottom = 0.3

"Minimum bottom of a character measured as a multiple of " "x-height above the baseline for us to reconsider whether it's " "a superscript."

Definition at line 1012 of file tesseractclass.h.

◆ superscript_scaledown_ratio

double tesseract::Tesseract::superscript_scaledown_ratio = 0.4

"A superscript scaled down more than this is unbelievably " "small. For example, 0.3 means we expect the font size to " "be no smaller than 30% of the text line font size."

Definition at line 1004 of file tesseractclass.h.

◆ superscript_worse_certainty

double tesseract::Tesseract::superscript_worse_certainty = 2.0

"How many times worse " "certainty does a superscript position glyph need to be for us " "to try classifying it as a char with a different baseline?"

Definition at line 996 of file tesseractclass.h.

◆ suspect_accept_rating

double tesseract::Tesseract::suspect_accept_rating = -999.9

"Accept good rating limit"

Definition at line 1032 of file tesseractclass.h.

◆ suspect_constrain_1Il

bool tesseract::Tesseract::suspect_constrain_1Il = false

"UNLV keep 1Il chars rejected"

Definition at line 1030 of file tesseractclass.h.

◆ suspect_level

int tesseract::Tesseract::suspect_level = 99

"Suspect marker level"

Definition at line 1026 of file tesseractclass.h.

◆ suspect_rating_per_ch

double tesseract::Tesseract::suspect_rating_per_ch = 999.9

"Don't touch bad rating limit"

Definition at line 1031 of file tesseractclass.h.

◆ suspect_short_words

int tesseract::Tesseract::suspect_short_words = 2

"Don't Suspect dict wds longer than this"

Definition at line 1029 of file tesseractclass.h.

◆ suspect_space_level

int tesseract::Tesseract::suspect_space_level = 100

"Min suspect level for rejecting spaces"

Definition at line 1028 of file tesseractclass.h.

◆ tessedit_adaption_debug

bool tesseract::Tesseract::tessedit_adaption_debug = false

"Generate and print debug information for adaption"

Definition at line 828 of file tesseractclass.h.

◆ tessedit_ambigs_training

bool tesseract::Tesseract::tessedit_ambigs_training = false

"Perform training for ambiguities"

Definition at line 816 of file tesseractclass.h.

◆ tessedit_bigram_debug

int tesseract::Tesseract::tessedit_bigram_debug = 0

"Amount of debug output for bigram " "correction."

Definition at line 861 of file tesseractclass.h.

◆ tessedit_char_blacklist

char* tesseract::Tesseract::tessedit_char_blacklist = ""

"Blacklist of chars not to recognize"

Definition at line 810 of file tesseractclass.h.

◆ tessedit_char_unblacklist

char* tesseract::Tesseract::tessedit_char_unblacklist = ""

"List of chars to override tessedit_char_blacklist"

Definition at line 814 of file tesseractclass.h.

◆ tessedit_char_whitelist

char* tesseract::Tesseract::tessedit_char_whitelist = ""

"Whitelist of chars to recognize"

Definition at line 812 of file tesseractclass.h.

◆ tessedit_consistent_reps

bool tesseract::Tesseract::tessedit_consistent_reps = true

"Force all rep chars the same"

Definition at line 1039 of file tesseractclass.h.

◆ tessedit_create_boxfile

bool tesseract::Tesseract::tessedit_create_boxfile = false

"Output text with boxes"

Definition at line 1061 of file tesseractclass.h.

◆ tessedit_create_hocr

bool tesseract::Tesseract::tessedit_create_hocr = false

"Write .html hOCR output file"

Definition at line 1019 of file tesseractclass.h.

◆ tessedit_create_pdf

bool tesseract::Tesseract::tessedit_create_pdf = false

"Write .pdf output file"

Definition at line 1021 of file tesseractclass.h.

◆ tessedit_create_tsv

bool tesseract::Tesseract::tessedit_create_tsv = false

"Write .tsv output file"

Definition at line 1020 of file tesseractclass.h.

◆ tessedit_create_txt

bool tesseract::Tesseract::tessedit_create_txt = false

"Write .txt output file"

Definition at line 1018 of file tesseractclass.h.

◆ tessedit_debug_block_rejection

bool tesseract::Tesseract::tessedit_debug_block_rejection = false

"Block and Row stats"

Definition at line 855 of file tesseractclass.h.

◆ tessedit_debug_doc_rejection

bool tesseract::Tesseract::tessedit_debug_doc_rejection = false

"Page stats"

Definition at line 940 of file tesseractclass.h.

◆ tessedit_debug_fonts

bool tesseract::Tesseract::tessedit_debug_fonts = false

"Output font info per char"

Definition at line 854 of file tesseractclass.h.

◆ tessedit_debug_quality_metrics

bool tesseract::Tesseract::tessedit_debug_quality_metrics = false

"Output data to debug file"

Definition at line 942 of file tesseractclass.h.

◆ tessedit_display_outwords

bool tesseract::Tesseract::tessedit_display_outwords = false

"Draw output words"

Definition at line 843 of file tesseractclass.h.

◆ tessedit_dont_blkrej_good_wds

bool tesseract::Tesseract::tessedit_dont_blkrej_good_wds = false

"Use word segmentation quality metric"

Definition at line 929 of file tesseractclass.h.

◆ tessedit_dont_rowrej_good_wds

bool tesseract::Tesseract::tessedit_dont_rowrej_good_wds = false

"Use word segmentation quality metric"

Definition at line 931 of file tesseractclass.h.

◆ tessedit_dump_choices

bool tesseract::Tesseract::tessedit_dump_choices = false

"Dump char choices"

Definition at line 844 of file tesseractclass.h.

◆ tessedit_dump_pageseg_images

bool tesseract::Tesseract::tessedit_dump_pageseg_images = false

"Dump intermediate images made during page segmentation"

Definition at line 801 of file tesseractclass.h.

◆ tessedit_enable_bigram_correction

bool tesseract::Tesseract::tessedit_enable_bigram_correction = true

"Enable correction based on the word bigram dictionary."

Definition at line 857 of file tesseractclass.h.

◆ tessedit_enable_dict_correction

bool tesseract::Tesseract::tessedit_enable_dict_correction = false

"Enable single word correction based on the dictionary."

Definition at line 859 of file tesseractclass.h.

◆ tessedit_enable_doc_dict

bool tesseract::Tesseract::tessedit_enable_doc_dict = true

"Add words to the document dictionary"

Definition at line 853 of file tesseractclass.h.

◆ tessedit_fix_fuzzy_spaces

bool tesseract::Tesseract::tessedit_fix_fuzzy_spaces = true

"Try to improve fuzzy spaces"

Definition at line 847 of file tesseractclass.h.

◆ tessedit_fix_hyphens

bool tesseract::Tesseract::tessedit_fix_hyphens = true

"Crunch double hyphens?"

Definition at line 850 of file tesseractclass.h.

◆ tessedit_flip_0O

bool tesseract::Tesseract::tessedit_flip_0O = true

"Contextual 0O O0 flips"

Definition at line 1042 of file tesseractclass.h.

◆ tessedit_good_doc_still_rowrej_wd

double tesseract::Tesseract::tessedit_good_doc_still_rowrej_wd = 1.1

"rej good doc wd if more than this fraction rejected"

Definition at line 937 of file tesseractclass.h.

◆ tessedit_good_quality_unrej

bool tesseract::Tesseract::tessedit_good_quality_unrej = true

"Reduce rejection on good docs"

Definition at line 913 of file tesseractclass.h.

◆ tessedit_image_border

int tesseract::Tesseract::tessedit_image_border = 2

"Rej blbs near image edge limit"

Definition at line 1056 of file tesseractclass.h.

◆ tessedit_init_config_only

bool tesseract::Tesseract::tessedit_init_config_only = false

"Only initialize with the config file. Useful if the instance is " "not going to be used for OCR but say only for layout analysis."

Definition at line 1082 of file tesseractclass.h.

◆ tessedit_load_sublangs

char* tesseract::Tesseract::tessedit_load_sublangs = ""

"List of languages to load with this one"

Definition at line 1069 of file tesseractclass.h.

◆ tessedit_lower_flip_hyphen

double tesseract::Tesseract::tessedit_lower_flip_hyphen = 1.5

"Aspect ratio dot/hyphen test"

Definition at line 1044 of file tesseractclass.h.

◆ tessedit_make_boxes_from_boxes

bool tesseract::Tesseract::tessedit_make_boxes_from_boxes = false

"Generate more boxes from boxed chars"

Definition at line 797 of file tesseractclass.h.

◆ tessedit_matcher_log

bool tesseract::Tesseract::tessedit_matcher_log = false

"Log matcher activity"

Definition at line 896 of file tesseractclass.h.

◆ tessedit_minimal_rej_pass1

bool tesseract::Tesseract::tessedit_minimal_rej_pass1 = false

"Do minimal rejection on pass 1 output"

Definition at line 894 of file tesseractclass.h.

◆ tessedit_minimal_rejection

bool tesseract::Tesseract::tessedit_minimal_rejection = false

"Only reject tess failures"

Definition at line 1033 of file tesseractclass.h.

◆ tessedit_ocr_engine_mode

int tesseract::Tesseract::tessedit_ocr_engine_mode = tesseract::OEM_DEFAULT

"Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults" " to loading and running the most accurate available."

Definition at line 808 of file tesseractclass.h.

◆ tessedit_ok_mode

int tesseract::Tesseract::tessedit_ok_mode = 5

"Acceptance decision algorithm"

Definition at line 1111 of file tesseractclass.h.

◆ tessedit_override_permuter

bool tesseract::Tesseract::tessedit_override_permuter = true

"According to dict_word"

Definition at line 1067 of file tesseractclass.h.

◆ tessedit_page_number

int tesseract::Tesseract::tessedit_page_number = -1

"-1 -> All pages, else specific page to process"

Definition at line 1063 of file tesseractclass.h.

◆ tessedit_pageseg_mode

int tesseract::Tesseract::tessedit_pageseg_mode = PSM_SINGLE_BLOCK

"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," " 5=line, 6=word, 7=char" " (Values from PageSegMode enum in publictypes.h)"

Definition at line 805 of file tesseractclass.h.

◆ tessedit_parallelize

int tesseract::Tesseract::tessedit_parallelize = 0

"Run in parallel where possible"

Definition at line 1092 of file tesseractclass.h.

◆ tessedit_prefer_joined_punct

bool tesseract::Tesseract::tessedit_prefer_joined_punct = false

"Reward punctuation joins"

Definition at line 985 of file tesseractclass.h.

◆ tessedit_preserve_blk_rej_perfect_wds

bool tesseract::Tesseract::tessedit_preserve_blk_rej_perfect_wds = true

"Only rej partially rejected words in block rejection"

Definition at line 925 of file tesseractclass.h.

◆ tessedit_preserve_min_wd_len

int tesseract::Tesseract::tessedit_preserve_min_wd_len = 2

"Only preserve wds longer than this"

Definition at line 933 of file tesseractclass.h.

◆ tessedit_preserve_row_rej_perfect_wds

bool tesseract::Tesseract::tessedit_preserve_row_rej_perfect_wds = true

"Only rej partially rejected words in row rejection"

Definition at line 927 of file tesseractclass.h.

◆ tessedit_redo_xheight

bool tesseract::Tesseract::tessedit_redo_xheight = true

"Check/Correct x-height"

Definition at line 851 of file tesseractclass.h.

◆ tessedit_reject_bad_qual_wds

bool tesseract::Tesseract::tessedit_reject_bad_qual_wds = true

"Reject all bad quality wds"

Definition at line 939 of file tesseractclass.h.

◆ tessedit_reject_block_percent

double tesseract::Tesseract::tessedit_reject_block_percent = 45.00

"%rej allowed before rej whole block"

Definition at line 918 of file tesseractclass.h.

◆ tessedit_reject_doc_percent

double tesseract::Tesseract::tessedit_reject_doc_percent = 65.00

"%rej allowed before rej whole doc"

Definition at line 916 of file tesseractclass.h.

◆ tessedit_reject_mode

int tesseract::Tesseract::tessedit_reject_mode = 0

"Rejection algorithm"

Definition at line 1040 of file tesseractclass.h.

◆ tessedit_reject_row_percent

double tesseract::Tesseract::tessedit_reject_row_percent = 40.00

"%rej allowed before rej whole row"

Definition at line 920 of file tesseractclass.h.

◆ tessedit_rejection_debug

bool tesseract::Tesseract::tessedit_rejection_debug = false

"Adaption debug"

Definition at line 1041 of file tesseractclass.h.

◆ tessedit_resegment_from_boxes

bool tesseract::Tesseract::tessedit_resegment_from_boxes = false

"Take segmentation and labeling from box file"

Definition at line 791 of file tesseractclass.h.

◆ tessedit_resegment_from_line_boxes

bool tesseract::Tesseract::tessedit_resegment_from_line_boxes = false

"Conversion of word/line box file to char box file"

Definition at line 793 of file tesseractclass.h.

◆ tessedit_row_rej_good_docs

bool tesseract::Tesseract::tessedit_row_rej_good_docs = true

"Apply row rejection to good docs"

Definition at line 935 of file tesseractclass.h.

◆ tessedit_tess_adaption_mode

int tesseract::Tesseract::tessedit_tess_adaption_mode = 0x27

"Adaptation decision algorithm for tess"

Definition at line 892 of file tesseractclass.h.

◆ tessedit_test_adaption

bool tesseract::Tesseract::tessedit_test_adaption = false

"Test adaption criteria"

Definition at line 895 of file tesseractclass.h.

◆ tessedit_test_adaption_mode

int tesseract::Tesseract::tessedit_test_adaption_mode = 3

"Adaptation decision algorithm for tess"

Definition at line 898 of file tesseractclass.h.

◆ tessedit_timing_debug

bool tesseract::Tesseract::tessedit_timing_debug = false

"Print timing stats"

Definition at line 845 of file tesseractclass.h.

◆ tessedit_train_from_boxes

bool tesseract::Tesseract::tessedit_train_from_boxes = false

"Generate training data from boxed chars"

Definition at line 795 of file tesseractclass.h.

◆ tessedit_train_line_recognizer

bool tesseract::Tesseract::tessedit_train_line_recognizer = false

"Break input into lines and remap boxes if present"

Definition at line 799 of file tesseractclass.h.

◆ tessedit_unrej_any_wd

bool tesseract::Tesseract::tessedit_unrej_any_wd = false

"Don't bother with word plausibility"

Definition at line 849 of file tesseractclass.h.

◆ tessedit_upper_flip_hyphen

double tesseract::Tesseract::tessedit_upper_flip_hyphen = 1.8

"Aspect ratio dot/hyphen test"

Definition at line 1046 of file tesseractclass.h.

◆ tessedit_use_primary_params_model

bool tesseract::Tesseract::tessedit_use_primary_params_model = false

"In multilingual mode use params model of the primary language"

Definition at line 1071 of file tesseractclass.h.

◆ tessedit_use_reject_spaces

bool tesseract::Tesseract::tessedit_use_reject_spaces = true

"Reject spaces?"

Definition at line 914 of file tesseractclass.h.

◆ tessedit_whole_wd_rej_row_percent

double tesseract::Tesseract::tessedit_whole_wd_rej_row_percent = 70.00

"Number of row rejects in whole word rejects which prevents whole row rejection"

Definition at line 923 of file tesseractclass.h.

◆ tessedit_word_for_word

bool tesseract::Tesseract::tessedit_word_for_word = false

"Make output have exactly one word per WERD"

Definition at line 1036 of file tesseractclass.h.

◆ tessedit_write_block_separators

bool tesseract::Tesseract::tessedit_write_block_separators = false

"Write block separators in output"

Definition at line 1014 of file tesseractclass.h.

◆ tessedit_write_images

bool tesseract::Tesseract::tessedit_write_images = false

"Capture the image from the IPE"

Definition at line 1064 of file tesseractclass.h.

◆ tessedit_write_params_to_file

char* tesseract::Tesseract::tessedit_write_params_to_file = ""

"Write all parameters to the given file."

Definition at line 826 of file tesseractclass.h.

◆ tessedit_write_rep_codes

bool tesseract::Tesseract::tessedit_write_rep_codes = false

"Write repetition char code"

Definition at line 1016 of file tesseractclass.h.

◆ tessedit_write_unlv

bool tesseract::Tesseract::tessedit_write_unlv = false

"Write .unlv output file"

Definition at line 1017 of file tesseractclass.h.

◆ tessedit_zero_kelvin_rejection

bool tesseract::Tesseract::tessedit_zero_kelvin_rejection = false

"Don't reject ANYTHING AT ALL"

Definition at line 1038 of file tesseractclass.h.

◆ tessedit_zero_rejection

bool tesseract::Tesseract::tessedit_zero_rejection = false

"Don't reject ANYTHING"

Definition at line 1034 of file tesseractclass.h.

◆ test_pt

bool tesseract::Tesseract::test_pt = false

"Test for point"

Definition at line 899 of file tesseractclass.h.

◆ test_pt_x

double tesseract::Tesseract::test_pt_x = 99999.99

"xcoord"

Definition at line 900 of file tesseractclass.h.

◆ test_pt_y

double tesseract::Tesseract::test_pt_y = 99999.99

"ycoord"

Definition at line 901 of file tesseractclass.h.

◆ textonly_pdf

bool tesseract::Tesseract::textonly_pdf = false

"Create PDF with only one invisible text layer"

Definition at line 1023 of file tesseractclass.h.

◆ textord_equation_detect

bool tesseract::Tesseract::textord_equation_detect = false

"Turn on equation detector"

Definition at line 1083 of file tesseractclass.h.

◆ textord_tabfind_aligned_gap_fraction

double tesseract::Tesseract::textord_tabfind_aligned_gap_fraction = 0.75

"Fraction of height used as a minimum gap for aligned blobs."

Definition at line 1091 of file tesseractclass.h.

◆ textord_tabfind_force_vertical_text

bool tesseract::Tesseract::textord_tabfind_force_vertical_text = false

"Force using vertical text page mode"

Definition at line 1086 of file tesseractclass.h.

◆ textord_tabfind_show_vlines

bool tesseract::Tesseract::textord_tabfind_show_vlines = false

"Debug line finding"

Definition at line 1076 of file tesseractclass.h.

◆ textord_tabfind_vertical_horizontal_mix

bool tesseract::Tesseract::textord_tabfind_vertical_horizontal_mix = true

"find horizontal lines such as headers in vertical page mode"

Definition at line 1110 of file tesseractclass.h.

◆ textord_tabfind_vertical_text

bool tesseract::Tesseract::textord_tabfind_vertical_text = true

"Enable vertical detection"

Definition at line 1084 of file tesseractclass.h.

◆ textord_tabfind_vertical_text_ratio

double tesseract::Tesseract::textord_tabfind_vertical_text_ratio = 0.5

"Fraction of textlines deemed vertical to use vertical page mode"

Definition at line 1089 of file tesseractclass.h.

◆ textord_use_cjk_fp_model

bool tesseract::Tesseract::textord_use_cjk_fp_model = FALSE

"Use CJK fixed pitch model"

Definition at line 1077 of file tesseractclass.h.

◆ unlv_tilde_crunching

bool tesseract::Tesseract::unlv_tilde_crunching = true

"Mark v.bad words for tilde crunch"

Definition at line 947 of file tesseractclass.h.

◆ unrecognised_char

char* tesseract::Tesseract::unrecognised_char = "|"

"Output char for unidentified blobs"

Definition at line 1025 of file tesseractclass.h.

◆ use_new_state_cost

bool tesseract::Tesseract::use_new_state_cost = FALSE

"use new state cost heuristics for segmentation state evaluation"

Definition at line 1142 of file tesseractclass.h.

◆ x_ht_acceptance_tolerance

int tesseract::Tesseract::x_ht_acceptance_tolerance = 8

"Max allowed deviation of blob top outside of font data"

Definition at line 991 of file tesseractclass.h.

◆ x_ht_min_change

int tesseract::Tesseract::x_ht_min_change = 8

"Min change in xht before actually trying it"

Definition at line 992 of file tesseractclass.h.


The documentation for this class was generated from the following files: