tesseract
4.00.00dev
|
#include <tesseractclass.h>
Public Member Functions | ||||||||||
Tesseract () | ||||||||||
~Tesseract () | ||||||||||
void | Clear () | |||||||||
void | ResetAdaptiveClassifier () | |||||||||
void | ResetDocumentDictionary () | |||||||||
void | SetEquationDetect (EquationDetect *detector) | |||||||||
const FCOORD & | reskew () const | |||||||||
Pix ** | mutable_pix_binary () | |||||||||
Pix * | pix_binary () const | |||||||||
Pix * | pix_grey () const | |||||||||
void | set_pix_grey (Pix *grey_pix) | |||||||||
Pix * | pix_original () const | |||||||||
void | set_pix_original (Pix *original_pix) | |||||||||
Pix * | BestPix () const | |||||||||
void | set_pix_thresholds (Pix *thresholds) | |||||||||
int | source_resolution () const | |||||||||
void | set_source_resolution (int ppi) | |||||||||
int | ImageWidth () const | |||||||||
int | ImageHeight () const | |||||||||
Pix * | scaled_color () const | |||||||||
int | scaled_factor () const | |||||||||
void | SetScaledColor (int factor, Pix *color) | |||||||||
const Textord & | textord () const | |||||||||
Textord * | mutable_textord () | |||||||||
bool | right_to_left () const | |||||||||
int | num_sub_langs () const | |||||||||
Tesseract * | get_sub_lang (int index) const | |||||||||
bool | AnyTessLang () const | |||||||||
bool | AnyLSTMLang () const | |||||||||
void | SetBlackAndWhitelist () | |||||||||
void | PrepareForPageseg () | |||||||||
void | PrepareForTessOCR (BLOCK_LIST *block_list, Tesseract *osd_tess, OSResults *osr) | |||||||||
int | SegmentPage (const STRING *input_file, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr) | |||||||||
void | SetupWordScripts (BLOCK_LIST *blocks) | |||||||||
int | AutoPageSeg (PageSegMode pageseg_mode, BLOCK_LIST *blocks, TO_BLOCK_LIST *to_blocks, BLOBNBOX_LIST *diacritic_blobs, Tesseract *osd_tess, OSResults *osr) | |||||||||
ColumnFinder * | SetupPageSegAndDetectOrientation (PageSegMode pageseg_mode, BLOCK_LIST *blocks, Tesseract *osd_tess, OSResults *osr, TO_BLOCK_LIST *to_blocks, Pix **photo_mask_pix, Pix **music_mask_pix) | |||||||||
void | PrerecAllWordsPar (const GenericVector< WordData > &words) | |||||||||
void | TrainLineRecognizer (const STRING &input_imagename, const STRING &output_basename, BLOCK_LIST *block_list) | |||||||||
void | TrainFromBoxes (const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, BLOCK_LIST *block_list, DocumentData *training_data) | |||||||||
ImageData * | GetLineData (const TBOX &line_box, const GenericVector< TBOX > &boxes, const GenericVector< STRING > &texts, int start_box, int end_box, const BLOCK &block) | |||||||||
ImageData * | GetRectImage (const TBOX &box, const BLOCK &block, int padding, TBOX *revised_box) const | |||||||||
void | LSTMRecognizeWord (const BLOCK &block, ROW *row, WERD_RES *word, PointerVector< WERD_RES > *words) | |||||||||
void | SearchWords (PointerVector< WERD_RES > *words) | |||||||||
bool | ProcessTargetWord (const TBOX &word_box, const TBOX &target_word_box, const char *word_config, int pass) | |||||||||
void | SetupAllWordsPassN (int pass_n, const TBOX *target_word_box, const char *word_config, PAGE_RES *page_res, GenericVector< WordData > *words) | |||||||||
void | SetupWordPassN (int pass_n, WordData *word) | |||||||||
bool | RecogAllWordsPassN (int pass_n, ETEXT_DESC *monitor, PAGE_RES_IT *pr_it, GenericVector< WordData > *words) | |||||||||
bool | recog_all_words (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config, int dopasses) | |||||||||
void | rejection_passes (PAGE_RES *page_res, ETEXT_DESC *monitor, const TBOX *target_word_box, const char *word_config) | |||||||||
void | bigram_correction_pass (PAGE_RES *page_res) | |||||||||
void | blamer_pass (PAGE_RES *page_res) | |||||||||
void | script_pos_pass (PAGE_RES *page_res) | |||||||||
int | RetryWithLanguage (const WordData &word_data, WordRecognizer recognizer, bool debug, WERD_RES **in_word, PointerVector< WERD_RES > *best_words) | |||||||||
bool | ReassignDiacritics (int pass, PAGE_RES_IT *pr_it, bool *make_next_word_fuzzy) | |||||||||
void | AssignDiacriticsToOverlappingBlobs (const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< bool > *overlapped_any_blob, GenericVector< C_BLOB *> *target_blobs) | |||||||||
void | AssignDiacriticsToNewBlobs (const GenericVector< C_OUTLINE *> &outlines, int pass, WERD *real_word, PAGE_RES_IT *pr_it, GenericVector< bool > *word_wanted, GenericVector< C_BLOB *> *target_blobs) | |||||||||
bool | SelectGoodDiacriticOutlines (int pass, float certainty_threshold, PAGE_RES_IT *pr_it, C_BLOB *blob, const GenericVector< C_OUTLINE *> &outlines, int num_outlines, GenericVector< bool > *ok_outlines) | |||||||||
float | ClassifyBlobPlusOutlines (const GenericVector< bool > &ok_outlines, const GenericVector< C_OUTLINE *> &outlines, int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str) | |||||||||
float | ClassifyBlobAsWord (int pass_n, PAGE_RES_IT *pr_it, C_BLOB *blob, STRING *best_str, float *c2) | |||||||||
void | classify_word_and_language (int pass_n, PAGE_RES_IT *pr_it, WordData *word_data) | |||||||||
void | classify_word_pass1 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words) | |||||||||
void | recog_pseudo_word (PAGE_RES *page_res, TBOX &selection_box) | |||||||||
void | fix_rep_char (PAGE_RES_IT *page_res_it) | |||||||||
ACCEPTABLE_WERD_TYPE | acceptable_word_string (const UNICHARSET &char_set, const char *s, const char *lengths) | |||||||||
void | match_word_pass_n (int pass_n, WERD_RES *word, ROW *row, BLOCK *block) | |||||||||
void | classify_word_pass2 (const WordData &word_data, WERD_RES **in_word, PointerVector< WERD_RES > *out_words) | |||||||||
void | ReportXhtFixResult (bool accept_new_word, float new_x_ht, WERD_RES *word, WERD_RES *new_word) | |||||||||
bool | RunOldFixXht (WERD_RES *word, BLOCK *block, ROW *row) | |||||||||
bool | TrainedXheightFix (WERD_RES *word, BLOCK *block, ROW *row) | |||||||||
bool | TestNewNormalization (int original_misfits, float baseline_shift, float new_x_ht, WERD_RES *word, BLOCK *block, ROW *row) | |||||||||
BOOL8 | recog_interactive (PAGE_RES_IT *pr_it) | |||||||||
void | set_word_fonts (WERD_RES *word) | |||||||||
void | font_recognition_pass (PAGE_RES *page_res) | |||||||||
void | dictionary_correction_pass (PAGE_RES *page_res) | |||||||||
BOOL8 | check_debug_pt (WERD_RES *word, int location) | |||||||||
bool | SubAndSuperscriptFix (WERD_RES *word_res) | |||||||||
void | GetSubAndSuperscriptCandidates (const WERD_RES *word, int *num_rebuilt_leading, ScriptPos *leading_pos, float *leading_certainty, int *num_rebuilt_trailing, ScriptPos *trailing_pos, float *trailing_certainty, float *avg_certainty, float *unlikely_threshold) | |||||||||
WERD_RES * | TrySuperscriptSplits (int num_chopped_leading, float leading_certainty, ScriptPos leading_pos, int num_chopped_trailing, float trailing_certainty, ScriptPos trailing_pos, WERD_RES *word, bool *is_good, int *retry_leading, int *retry_trailing) | |||||||||
bool | BelievableSuperscript (bool debug, const WERD_RES &word, float certainty_threshold, int *left_ok, int *right_ok) const | |||||||||
void | output_pass (PAGE_RES_IT &page_res_it, const TBOX *target_word_box) | |||||||||
void | write_results (PAGE_RES_IT &page_res_it, char newline_type, BOOL8 force_eol) | |||||||||
void | set_unlv_suspects (WERD_RES *word) | |||||||||
UNICHAR_ID | get_rep_char (WERD_RES *word) | |||||||||
BOOL8 | acceptable_number_string (const char *s, const char *lengths) | |||||||||
inT16 | count_alphanums (const WERD_CHOICE &word) | |||||||||
inT16 | count_alphas (const WERD_CHOICE &word) | |||||||||
void | read_config_file (const char *filename, SetParamConstraint constraint) | |||||||||
int | init_tesseract (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr) | |||||||||
int | init_tesseract (const char *datapath, const char *language, OcrEngineMode oem) | |||||||||
int | init_tesseract_internal (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr) | |||||||||
void | SetupUniversalFontIds () | |||||||||
int | init_tesseract_lm (const char *arg0, const char *textbase, const char *language, TessdataManager *mgr) | |||||||||
void | recognize_page (STRING &image_name) | |||||||||
void | end_tesseract () | |||||||||
bool | init_tesseract_lang_data (const char *arg0, const char *textbase, const char *language, OcrEngineMode oem, char **configs, int configs_size, const GenericVector< STRING > *vars_vec, const GenericVector< STRING > *vars_values, bool set_only_init_params, TessdataManager *mgr) | |||||||||
void | ParseLanguageString (const char *lang_str, GenericVector< STRING > *to_load, GenericVector< STRING > *not_to_load) | |||||||||
SVMenuNode * | build_menu_new () | |||||||||
void | pgeditor_main (int width, int height, PAGE_RES *page_res) | |||||||||
void | process_image_event (const SVEvent &event) | |||||||||
BOOL8 | process_cmd_win_event (inT32 cmd_event, char *new_value) | |||||||||
void | debug_word (PAGE_RES *page_res, const TBOX &selection_box) | |||||||||
void | do_re_display (BOOL8(tesseract::Tesseract::*word_painter)(PAGE_RES_IT *pr_it)) | |||||||||
BOOL8 | word_display (PAGE_RES_IT *pr_it) | |||||||||
BOOL8 | word_bln_display (PAGE_RES_IT *pr_it) | |||||||||
BOOL8 | word_blank_and_set_display (PAGE_RES_IT *pr_its) | |||||||||
BOOL8 | word_set_display (PAGE_RES_IT *pr_it) | |||||||||
BOOL8 | word_dumper (PAGE_RES_IT *pr_it) | |||||||||
void | blob_feature_display (PAGE_RES *page_res, const TBOX &selection_box) | |||||||||
void | make_reject_map (WERD_RES *word, ROW *row, inT16 pass) | |||||||||
BOOL8 | one_ell_conflict (WERD_RES *word_res, BOOL8 update_map) | |||||||||
inT16 | first_alphanum_index (const char *word, const char *word_lengths) | |||||||||
inT16 | first_alphanum_offset (const char *word, const char *word_lengths) | |||||||||
inT16 | alpha_count (const char *word, const char *word_lengths) | |||||||||
BOOL8 | word_contains_non_1_digit (const char *word, const char *word_lengths) | |||||||||
void | dont_allow_1Il (WERD_RES *word) | |||||||||
inT16 | count_alphanums (WERD_RES *word) | |||||||||
void | flip_0O (WERD_RES *word) | |||||||||
BOOL8 | non_0_digit (const UNICHARSET &ch_set, UNICHAR_ID unichar_id) | |||||||||
BOOL8 | non_O_upper (const UNICHARSET &ch_set, UNICHAR_ID unichar_id) | |||||||||
BOOL8 | repeated_nonalphanum_wd (WERD_RES *word, ROW *row) | |||||||||
void | nn_match_word (WERD_RES *word, ROW *row) | |||||||||
void | nn_recover_rejects (WERD_RES *word, ROW *row) | |||||||||
void | set_done (WERD_RES *word, inT16 pass) | |||||||||
inT16 | safe_dict_word (const WERD_RES *werd_res) | |||||||||
void | flip_hyphens (WERD_RES *word) | |||||||||
void | reject_I_1_L (WERD_RES *word) | |||||||||
void | reject_edge_blobs (WERD_RES *word) | |||||||||
void | reject_mostly_rejects (WERD_RES *word) | |||||||||
BOOL8 | word_adaptable (WERD_RES *word, uinT16 mode) | |||||||||
void | recog_word_recursive (WERD_RES *word) | |||||||||
void | recog_word (WERD_RES *word) | |||||||||
void | split_and_recog_word (WERD_RES *word) | |||||||||
void | split_word (WERD_RES *word, int split_pt, WERD_RES **right_piece, BlamerBundle **orig_blamer_bundle) const | |||||||||
void | join_words (WERD_RES *word, WERD_RES *word2, BlamerBundle *orig_bb) const | |||||||||
void | match_current_words (WERD_RES_LIST &words, ROW *row, BLOCK *block) | |||||||||
inT16 | fp_eval_word_spacing (WERD_RES_LIST &word_res_list) | |||||||||
void | dump_words (WERD_RES_LIST &perm, inT16 score, inT16 mode, BOOL8 improved) | |||||||||
BOOL8 | fixspace_thinks_word_done (WERD_RES *word) | |||||||||
GARBAGE_LEVEL | garbage_word (WERD_RES *word, BOOL8 ok_dict_word) | |||||||||
BOOL8 | potential_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level, BOOL8 ok_dict_word) | |||||||||
void | tilde_crunch (PAGE_RES_IT &page_res_it) | |||||||||
void | unrej_good_quality_words (PAGE_RES_IT &page_res_it) | |||||||||
void | doc_and_block_rejection (PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc) | |||||||||
void | quality_based_rejection (PAGE_RES_IT &page_res_it, BOOL8 good_quality_doc) | |||||||||
void | convert_bad_unlv_chs (WERD_RES *word_res) | |||||||||
void | tilde_delete (PAGE_RES_IT &page_res_it) | |||||||||
inT16 | word_blob_quality (WERD_RES *word, ROW *row) | |||||||||
void | word_char_quality (WERD_RES *word, ROW *row, inT16 *match_count, inT16 *accepted_match_count) | |||||||||
void | unrej_good_chs (WERD_RES *word, ROW *row) | |||||||||
inT16 | count_outline_errs (char c, inT16 outline_count) | |||||||||
inT16 | word_outline_errs (WERD_RES *word) | |||||||||
BOOL8 | terrible_word_crunch (WERD_RES *word, GARBAGE_LEVEL garbage_level) | |||||||||
CRUNCH_MODE | word_deletable (WERD_RES *word, inT16 &delete_mode) | |||||||||
inT16 | failure_count (WERD_RES *word) | |||||||||
BOOL8 | noise_outlines (TWERD *word) | |||||||||
void | tess_segment_pass_n (int pass_n, WERD_RES *word) | |||||||||
PAGE_RES * | ApplyBoxes (const STRING &fname, bool find_segmentation, BLOCK_LIST *block_list) | |||||||||
void | PreenXHeights (BLOCK_LIST *block_list) | |||||||||
PAGE_RES * | SetupApplyBoxes (const GenericVector< TBOX > &boxes, BLOCK_LIST *block_list) | |||||||||
void | MaximallyChopWord (const GenericVector< TBOX > &boxes, BLOCK *block, ROW *row, WERD_RES *word_res) | |||||||||
bool | ResegmentCharBox (PAGE_RES *page_res, const TBOX *prev_box, const TBOX &box, const TBOX &next_box, const char *correct_text) | |||||||||
bool | ResegmentWordBox (BLOCK_LIST *block_list, const TBOX &box, const TBOX &next_box, const char *correct_text) | |||||||||
void | ReSegmentByClassification (PAGE_RES *page_res) | |||||||||
bool | ConvertStringToUnichars (const char *utf8, GenericVector< UNICHAR_ID > *class_ids) | |||||||||
bool | FindSegmentation (const GenericVector< UNICHAR_ID > &target_text, WERD_RES *word_res) | |||||||||
void | SearchForText (const GenericVector< BLOB_CHOICE_LIST *> *choices, int choices_pos, int choices_length, const GenericVector< UNICHAR_ID > &target_text, int text_index, float rating, GenericVector< int > *segmentation, float *best_rating, GenericVector< int > *best_segmentation) | |||||||||
void | TidyUp (PAGE_RES *page_res) | |||||||||
void | ReportFailedBox (int boxfile_lineno, TBOX box, const char *box_ch, const char *err_msg) | |||||||||
void | CorrectClassifyWords (PAGE_RES *page_res) | |||||||||
void | ApplyBoxTraining (const STRING &fontname, PAGE_RES *page_res) | |||||||||
int | CountMisfitTops (WERD_RES *word_res) | |||||||||
float | ComputeCompatibleXheight (WERD_RES *word_res, float *baseline_shift) | |||||||||
FILE * | init_recog_training (const STRING &fname) | |||||||||
void | recog_training_segmented (const STRING &fname, PAGE_RES *page_res, volatile ETEXT_DESC *monitor, FILE *output_file) | |||||||||
void | ambigs_classify_and_output (const char *label, PAGE_RES_IT *pr_it, FILE *output_file) | |||||||||
eval_word_spacing() | ||||||||||
The basic measure is the number of characters in contextually confirmed words (i.e. the word is done). If all words are contextually confirmed, the evaluation is deemed perfect. Some fiddles are done to handle "1"s, as these are VERY frequent causes of fuzzy spaces. The problem with the basic measure is that "561 63" would score the same as "56163", though given our knowledge that the space is fuzzy, and that there is a "1" next to the fuzzy space, we need to ensure that "56163" is preferred. The solution is to NOT COUNT the score of any word which has a digit at one end and a "1Il" as the character on the other side of the space. Conversely, any character next to a "1" within a word is counted as a positive score. Thus "561 63" would score 4 (3 chars in a numeric word plus 1 side of the "1" joined). "56163" would score 7 - all chars in a numeric word + 2 sides of a "1" joined. The joined 1 rule is applied to any word REGARDLESS of contextual confirmation. Thus "PS7a71 3/7a" scores 1 (neither word is contextually confirmed; the only score is from the joined 1). "PS7a713/7a" scores 2. | ||||||||||
BOOL8 | digit_or_numeric_punct (WERD_RES *word, int char_position) | |||||||||
inT16 | eval_word_spacing (WERD_RES_LIST &word_res_list) | |||||||||
fix_sp_fp_word() | ||||||||||
Test the current word to see if it can be split by deleting noise blobs. If so, perform the split. Return with the iterator pointing to the same place if the word is unchanged, or to the last of the replacement words. | ||||||||||
void | fix_noisy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) | |||||||||
void | fix_sp_fp_word (WERD_RES_IT &word_res_it, ROW *row, BLOCK *block) | |||||||||
inT16 | worst_noise_blob (WERD_RES *word_res, float *worst_noise_score) | |||||||||
float | blob_noise_score (TBLOB *blob) | |||||||||
void | break_noisiest_blob_word (WERD_RES_LIST &words) | |||||||||
fix_fuzzy_spaces() | ||||||||||
Walk over the page finding sequences of words joined by fuzzy spaces. Extract them as a sublist, process the sublist to find the optimal arrangement of spaces then replace the sublist in the ROW_RES.
| ||||||||||
void | fix_fuzzy_space_list (WERD_RES_LIST &best_perm, ROW *row, BLOCK *block) | |||||||||
void | fix_fuzzy_spaces (ETEXT_DESC *monitor, inT32 word_count, PAGE_RES *page_res) | |||||||||
process_selected_words() | ||||||||||
Walk the current block list applying the specified word processor function to each word that overlaps the selection_box. | ||||||||||
void | process_selected_words (PAGE_RES *page_res, TBOX &selection_box, BOOL8(tesseract::Tesseract::*word_processor)(PAGE_RES_IT *pr_it)) | |||||||||
tess_add_doc_word | ||||||||||
Add the given word to the document dictionary | ||||||||||
void | tess_add_doc_word (WERD_CHOICE *word_choice) | |||||||||
tess_acceptable_word | ||||||||||
| ||||||||||
bool | tess_acceptable_word (WERD_RES *word) | |||||||||
![]() | ||||||||||
Wordrec () | ||||||||||
virtual | ~Wordrec () | |||||||||
void | SaveAltChoices (const LIST &best_choices, WERD_RES *word) | |||||||||
void | FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) | |||||||||
void | CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle) | |||||||||
void | SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) | |||||||||
void | WordSearch (WERD_RES *word_res) | |||||||||
void | InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) | |||||||||
void | DoSegSearch (WERD_RES *word_res) | |||||||||
SEAM * | attempt_blob_chop (TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, const GenericVector< SEAM *> &seams) | |||||||||
SEAM * | chop_numbered_blob (TWERD *word, inT32 blob_number, bool italic_blob, const GenericVector< SEAM *> &seams) | |||||||||
SEAM * | chop_overlapping_blob (const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number) | |||||||||
void | add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams) | |||||||||
void | choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile) | |||||||||
void | combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue) | |||||||||
SEAM * | pick_good_seam (TBLOB *blob) | |||||||||
void | try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob) | |||||||||
void | try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob) | |||||||||
PRIORITY | grade_split_length (register SPLIT *split) | |||||||||
PRIORITY | grade_sharpness (register SPLIT *split) | |||||||||
bool | near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt) | |||||||||
virtual BLOB_CHOICE_LIST * | classify_piece (const GenericVector< SEAM *> &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle) | |||||||||
void | merge_fragments (MATRIX *ratings, inT16 num_blobs) | |||||||||
void | get_fragment_lists (inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists) | |||||||||
void | merge_and_put_fragment_lists (inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings) | |||||||||
void | fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices) | |||||||||
void | program_editup (const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict) | |||||||||
void | cc_recog (WERD_RES *word) | |||||||||
void | program_editdown (inT32 elasped_time) | |||||||||
void | set_pass1 () | |||||||||
void | set_pass2 () | |||||||||
int | end_recog () | |||||||||
BLOB_CHOICE_LIST * | call_matcher (TBLOB *blob) | |||||||||
int | dict_word (const WERD_CHOICE &word) | |||||||||
BLOB_CHOICE_LIST * | classify_blob (TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle) | |||||||||
PRIORITY | point_priority (EDGEPT *point) | |||||||||
void | add_point_to_list (PointHeap *point_heap, EDGEPT *point) | |||||||||
bool | is_inside_angle (EDGEPT *pt) | |||||||||
int | angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3) | |||||||||
EDGEPT * | pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist) | |||||||||
void | prioritize_points (TESSLINE *outline, PointHeap *points) | |||||||||
void | new_min_point (EDGEPT *local_min, PointHeap *points) | |||||||||
void | new_max_point (EDGEPT *local_max, PointHeap *points) | |||||||||
void | vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points) | |||||||||
SEAM * | improve_one_blob (const GenericVector< BLOB_CHOICE *> &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number) | |||||||||
SEAM * | chop_one_blob (const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE *> &blob_choices, WERD_RES *word_res, int *blob_number) | |||||||||
void | chop_word_main (WERD_RES *word) | |||||||||
void | improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending) | |||||||||
int | select_blob_to_split (const GenericVector< BLOB_CHOICE *> &blob_choices, float rating_ceiling, bool split_next_to_fragment) | |||||||||
int | select_blob_to_split_from_fixpt (DANGERR *fixpt) | |||||||||
![]() | ||||||||||
Classify () | ||||||||||
virtual | ~Classify () | |||||||||
Dict & | getDict () | |||||||||
const ShapeTable * | shape_table () const | |||||||||
void | SetStaticClassifier (ShapeClassifier *static_classifier) | |||||||||
void | AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices) | |||||||||
bool | LargeSpeckle (const TBLOB &blob) | |||||||||
ADAPT_TEMPLATES | NewAdaptedTemplates (bool InitFromUnicharset) | |||||||||
int | GetFontinfoId (ADAPT_CLASS Class, uinT8 ConfigId) | |||||||||
int | PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uinT8 *normalization_factors, const uinT16 *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results) | |||||||||
void | ReadNewCutoffs (TFile *fp, CLASS_CUTOFF_ARRAY Cutoffs) | |||||||||
void | PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates) | |||||||||
void | WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates) | |||||||||
ADAPT_TEMPLATES | ReadAdaptedTemplates (TFile *File) | |||||||||
FLOAT32 | ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, BOOL8 DebugMatch) | |||||||||
void | FreeNormProtos () | |||||||||
NORM_PROTOS * | ReadNormProtos (TFile *fp) | |||||||||
void | ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class) | |||||||||
INT_TEMPLATES | CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset) | |||||||||
void | LearnWord (const char *fontname, WERD_RES *word) | |||||||||
void | LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word) | |||||||||
void | InitAdaptiveClassifier (TessdataManager *mgr) | |||||||||
void | InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates) | |||||||||
void | AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results) | |||||||||
void | MasterMatcher (INT_TEMPLATES templates, inT16 num_features, const INT_FEATURE_STRUCT *features, const uinT8 *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results) | |||||||||
void | ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uinT8 *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results) | |||||||||
double | ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uinT8 *cn_factors) | |||||||||
void | ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices) | |||||||||
void | AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results) | |||||||||
int | GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures) | |||||||||
void | DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results) | |||||||||
PROTO_ID | MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask) | |||||||||
int | MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures) | |||||||||
void | MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob) | |||||||||
void | PrintAdaptiveMatchResults (const ADAPT_RESULTS &results) | |||||||||
void | RemoveExtraPuncs (ADAPT_RESULTS *Results) | |||||||||
void | RemoveBadMatches (ADAPT_RESULTS *Results) | |||||||||
void | SetAdaptiveThreshold (FLOAT32 Threshold) | |||||||||
void | ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features) | |||||||||
STRING | ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const | |||||||||
int | ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const | |||||||||
int | ShapeIDToClassID (int shape_id) const | |||||||||
UNICHAR_ID * | BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results) | |||||||||
int | CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results) | |||||||||
int | CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results) | |||||||||
UNICHAR_ID * | GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass) | |||||||||
void | DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results) | |||||||||
void | AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold, ADAPT_TEMPLATES adaptive_templates) | |||||||||
void | DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class) | |||||||||
bool | AdaptableWord (WERD_RES *word) | |||||||||
void | EndAdaptiveClassifier () | |||||||||
void | SettupPass1 () | |||||||||
void | SettupPass2 () | |||||||||
void | AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices) | |||||||||
void | ClassifyAsNoise (ADAPT_RESULTS *Results) | |||||||||
void | ResetAdaptiveClassifierInternal () | |||||||||
void | SwitchAdaptiveClassifier () | |||||||||
void | StartBackupAdaptiveClassifier () | |||||||||
int | GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uinT8 *pruner_norm_array, uinT8 *char_norm_array) | |||||||||
void | ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uinT8 *char_norm_array, uinT8 *pruner_array) | |||||||||
bool | TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config) | |||||||||
void | UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob) | |||||||||
bool | AdaptiveClassifierIsFull () const | |||||||||
bool | AdaptiveClassifierIsEmpty () const | |||||||||
bool | LooksLikeGarbage (TBLOB *blob) | |||||||||
void | RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox) | |||||||||
void | ClearCharNormArray (uinT8 *char_norm_array) | |||||||||
void | ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uinT8 *char_norm_array) | |||||||||
void | ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures) | |||||||||
INT_TEMPLATES | ReadIntTemplates (TFile *fp) | |||||||||
void | WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset) | |||||||||
CLASS_ID | GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id) | |||||||||
void | ShowMatchDisplay () | |||||||||
UnicityTable< FontInfo > & | get_fontinfo_table () | |||||||||
const UnicityTable< FontInfo > & | get_fontinfo_table () const | |||||||||
UnicityTable< FontSet > & | get_fontset_table () | |||||||||
void | NormalizeOutlines (LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale) | |||||||||
FEATURE_SET | ExtractOutlineFeatures (TBLOB *Blob) | |||||||||
FEATURE_SET | ExtractPicoFeatures (TBLOB *Blob) | |||||||||
FEATURE_SET | ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info) | |||||||||
FEATURE_SET | ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info) | |||||||||
void | LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text) | |||||||||
bool | WriteTRFile (const STRING &filename) | |||||||||
![]() | ||||||||||
CCStruct () | ||||||||||
~CCStruct () | ||||||||||
![]() | ||||||||||
CUtil () | ||||||||||
~CUtil () | ||||||||||
void | read_variables (const char *filename, bool global_only) | |||||||||
![]() | ||||||||||
CCUtil () | ||||||||||
virtual | ~CCUtil () | |||||||||
void | main_setup (const char *argv0, const char *basename) | |||||||||
CCUtil::main_setup - set location of tessdata and name of image. More... | ||||||||||
ParamsVectors * | params () | |||||||||
Additional Inherited Members | |
![]() | |
static void | SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info) |
static void | ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts) |
![]() | |
static const double | kDescenderFraction = 0.25 |
static const double | kXHeightFraction = 0.5 |
static const double | kAscenderFraction = 0.25 |
static const double | kXHeightCapRatio |
![]() | |
bool | SegSearchDone (int num_futile_classifications) |
void | UpdateSegSearchNodes (float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle) |
void | ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle) |
void | ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending) |
void | InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug) |
![]() | |
IntegerMatcher | im_ |
FEATURE_DEFS_STRUCT | feature_defs_ |
ShapeTable * | shape_table_ |
Definition at line 164 of file tesseractclass.h.
tesseract::Tesseract::Tesseract | ( | ) |
Definition at line 54 of file tesseractclass.cpp.
tesseract::Tesseract::~Tesseract | ( | ) |
Definition at line 626 of file tesseractclass.cpp.
BOOL8 tesseract::Tesseract::acceptable_number_string | ( | const char * | s, |
const char * | lengths | ||
) |
Definition at line 419 of file output.cpp.
ACCEPTABLE_WERD_TYPE tesseract::Tesseract::acceptable_word_string | ( | const UNICHARSET & | char_set, |
const char * | s, | ||
const char * | lengths | ||
) |
Definition at line 1690 of file control.cpp.
inT16 tesseract::Tesseract::alpha_count | ( | const char * | word, |
const char * | word_lengths | ||
) |
Definition at line 495 of file reject.cpp.
void tesseract::Tesseract::ambigs_classify_and_output | ( | const char * | label, |
PAGE_RES_IT * | pr_it, | ||
FILE * | output_file | ||
) |
Definition at line 202 of file recogtraining.cpp.
|
inline |
Definition at line 268 of file tesseractclass.h.
|
inline |
Definition at line 260 of file tesseractclass.h.
PAGE_RES * tesseract::Tesseract::ApplyBoxes | ( | const STRING & | fname, |
bool | find_segmentation, | ||
BLOCK_LIST * | block_list | ||
) |
Definition at line 117 of file applybox.cpp.
Calls LearnWord to extract features for labelled blobs within each word. Features are stored in an internal buffer.
Definition at line 796 of file applybox.cpp.
void tesseract::Tesseract::AssignDiacriticsToNewBlobs | ( | const GenericVector< C_OUTLINE *> & | outlines, |
int | pass, | ||
WERD * | real_word, | ||
PAGE_RES_IT * | pr_it, | ||
GenericVector< bool > * | word_wanted, | ||
GenericVector< C_BLOB *> * | target_blobs | ||
) |
Definition at line 1046 of file control.cpp.
void tesseract::Tesseract::AssignDiacriticsToOverlappingBlobs | ( | const GenericVector< C_OUTLINE *> & | outlines, |
int | pass, | ||
WERD * | real_word, | ||
PAGE_RES_IT * | pr_it, | ||
GenericVector< bool > * | word_wanted, | ||
GenericVector< bool > * | overlapped_any_blob, | ||
GenericVector< C_BLOB *> * | target_blobs | ||
) |
Definition at line 993 of file control.cpp.
int tesseract::Tesseract::AutoPageSeg | ( | PageSegMode | pageseg_mode, |
BLOCK_LIST * | blocks, | ||
TO_BLOCK_LIST * | to_blocks, | ||
BLOBNBOX_LIST * | diacritic_blobs, | ||
Tesseract * | osd_tess, | ||
OSResults * | osr | ||
) |
Auto page segmentation. Divide the page image into blocks of uniform text linespacing and images.
Resolution (in ppi) is derived from the input image.
The output goes in the blocks list with corresponding TO_BLOCKs in the to_blocks list.
If !PSM_COL_FIND_ENABLED(pageseg_mode), then no attempt is made to divide the image into columns, but multiple blocks are still made if the text is of non-uniform linespacing.
If diacritic_blobs is non-null, then diacritics/noise blobs, that would confuse layout analysis by causing textline overlap, are placed there, with the expectation that they will be reassigned to words later and noise/diacriticness determined via classification.
If osd (orientation and script detection) is true then that is performed as well. If only_osd is true, then only orientation and script detection is performed. If osd is desired, (osd or only_osd) then osd_tess must be another Tesseract that was initialized especially for osd, and the results will be output into osr (orientation and script result).
Definition at line 204 of file pagesegmain.cpp.
bool tesseract::Tesseract::BelievableSuperscript | ( | bool | debug, |
const WERD_RES & | word, | ||
float | certainty_threshold, | ||
int * | left_ok, | ||
int * | right_ok | ||
) | const |
Return whether this is believable superscript or subscript text.
We insist that:
[in] | debug | If true, spew debug output |
[in] | word | The word whose best_choice we're evaluating |
[in] | certainty_threshold | If any of the characters have less certainty than this, reject. |
[out] | left_ok | How many left-side characters were ok? |
[out] | right_ok | How many right-side characters were ok? |
Definition at line 520 of file superscript.cpp.
|
inline |
Definition at line 216 of file tesseractclass.h.
void tesseract::Tesseract::bigram_correction_pass | ( | PAGE_RES * | page_res | ) |
Definition at line 450 of file control.cpp.
void tesseract::Tesseract::blamer_pass | ( | PAGE_RES * | page_res | ) |
Definition at line 694 of file control.cpp.
Definition at line 959 of file pgedit.cpp.
float tesseract::Tesseract::blob_noise_score | ( | TBLOB * | blob | ) |
Definition at line 760 of file fixspace.cpp.
void tesseract::Tesseract::break_noisiest_blob_word | ( | WERD_RES_LIST & | words | ) |
break_noisiest_blob_word() Find the word with the blob which looks like the worst noise. Break the word into two, deleting the noise blob.
Definition at line 615 of file fixspace.cpp.
SVMenuNode * tesseract::Tesseract::build_menu_new | ( | ) |
Definition at line 257 of file pgedit.cpp.
Definition at line 1794 of file control.cpp.
void tesseract::Tesseract::classify_word_and_language | ( | int | pass_n, |
PAGE_RES_IT * | pr_it, | ||
WordData * | word_data | ||
) |
Definition at line 1285 of file control.cpp.
void tesseract::Tesseract::classify_word_pass1 | ( | const WordData & | word_data, |
WERD_RES ** | in_word, | ||
PointerVector< WERD_RES > * | out_words | ||
) |
classify_word_pass1
Baseline normalize the word and pass it to Tess.
Definition at line 1362 of file control.cpp.
void tesseract::Tesseract::classify_word_pass2 | ( | const WordData & | word_data, |
WERD_RES ** | in_word, | ||
PointerVector< WERD_RES > * | out_words | ||
) |
classify_word_pass2
Control what to do with the word in pass 2
Definition at line 1519 of file control.cpp.
float tesseract::Tesseract::ClassifyBlobAsWord | ( | int | pass_n, |
PAGE_RES_IT * | pr_it, | ||
C_BLOB * | blob, | ||
STRING * | best_str, | ||
float * | c2 | ||
) |
Definition at line 1249 of file control.cpp.
float tesseract::Tesseract::ClassifyBlobPlusOutlines | ( | const GenericVector< bool > & | ok_outlines, |
const GenericVector< C_OUTLINE *> & | outlines, | ||
int | pass_n, | ||
PAGE_RES_IT * | pr_it, | ||
C_BLOB * | blob, | ||
STRING * | best_str | ||
) |
Definition at line 1207 of file control.cpp.
void tesseract::Tesseract::Clear | ( | ) |
Definition at line 637 of file tesseractclass.cpp.
float tesseract::Tesseract::ComputeCompatibleXheight | ( | WERD_RES * | word_res, |
float * | baseline_shift | ||
) |
Definition at line 101 of file fixxht.cpp.
void tesseract::Tesseract::convert_bad_unlv_chs | ( | WERD_RES * | word_res | ) |
Definition at line 664 of file docqual.cpp.
bool tesseract::Tesseract::ConvertStringToUnichars | ( | const char * | utf8, |
GenericVector< UNICHAR_ID > * | class_ids | ||
) |
Converts the space-delimited string of utf8 text to a vector of UNICHAR_ID.
Definition at line 535 of file applybox.cpp.
void tesseract::Tesseract::CorrectClassifyWords | ( | PAGE_RES * | page_res | ) |
Creates a fake best_choice entry in each WERD_RES with the correct text.
Definition at line 772 of file applybox.cpp.
inT16 tesseract::Tesseract::count_alphanums | ( | const WERD_CHOICE & | word | ) |
Definition at line 408 of file output.cpp.
Definition at line 558 of file reject.cpp.
inT16 tesseract::Tesseract::count_alphas | ( | const WERD_CHOICE & | word | ) |
Definition at line 398 of file output.cpp.
Definition at line 131 of file docqual.cpp.
Definition at line 69 of file fixxht.cpp.
debug_word
Process the whole image, but load word_config_ for the selected word(s).
Definition at line 640 of file pgedit.cpp.
void tesseract::Tesseract::dictionary_correction_pass | ( | PAGE_RES * | page_res | ) |
Definition at line 2042 of file control.cpp.
Definition at line 343 of file fixspace.cpp.
void tesseract::Tesseract::do_re_display | ( | BOOL8(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it) | word_painter | ) |
Redisplay page
Definition at line 308 of file pgedit.cpp.
void tesseract::Tesseract::doc_and_block_rejection | ( | PAGE_RES_IT & | page_res_it, |
BOOL8 | good_quality_doc | ||
) |
Definition at line 237 of file docqual.cpp.
void tesseract::Tesseract::dont_allow_1Il | ( | WERD_RES * | word | ) |
Definition at line 526 of file reject.cpp.
void tesseract::Tesseract::dump_words | ( | WERD_RES_LIST & | perm, |
inT16 | score, | ||
inT16 | mode, | ||
BOOL8 | improved | ||
) |
Definition at line 449 of file fixspace.cpp.
void tesseract::Tesseract::end_tesseract | ( | ) |
Definition at line 468 of file tessedit.cpp.
inT16 tesseract::Tesseract::eval_word_spacing | ( | WERD_RES_LIST & | word_res_list | ) |
Definition at line 239 of file fixspace.cpp.
Definition at line 970 of file docqual.cpp.
bool tesseract::Tesseract::FindSegmentation | ( | const GenericVector< UNICHAR_ID > & | target_text, |
WERD_RES * | word_res | ||
) |
Resegments the word to achieve the target_text from the classifier. Returns false if the re-segmentation fails. Uses brute-force combination of up to kMaxGroupSize adjacent blobs, and applies a full search on the classifier results to find the best classified segmentation. As a compromise to obtain better recall, 1-1 ambiguity substitutions ARE used.
Definition at line 559 of file applybox.cpp.
inT16 tesseract::Tesseract::first_alphanum_index | ( | const char * | word, |
const char * | word_lengths | ||
) |
Definition at line 469 of file reject.cpp.
inT16 tesseract::Tesseract::first_alphanum_offset | ( | const char * | word, |
const char * | word_lengths | ||
) |
Definition at line 482 of file reject.cpp.
void tesseract::Tesseract::fix_fuzzy_space_list | ( | WERD_RES_LIST & | best_perm, |
ROW * | row, | ||
BLOCK * | block | ||
) |
Definition at line 145 of file fixspace.cpp.
void tesseract::Tesseract::fix_fuzzy_spaces | ( | ETEXT_DESC * | monitor, |
inT32 | word_count, | ||
PAGE_RES * | page_res | ||
) |
Definition at line 48 of file fixspace.cpp.
void tesseract::Tesseract::fix_noisy_space_list | ( | WERD_RES_LIST & | best_perm, |
ROW * | row, | ||
BLOCK * | block | ||
) |
Definition at line 569 of file fixspace.cpp.
void tesseract::Tesseract::fix_rep_char | ( | PAGE_RES_IT * | page_res_it | ) |
fix_rep_char() The word is a repeated char. (Leader.) Find the repeated character. Create the appropriate single-word or multi-word sequence according to the size of spaces in between blobs, and correct the classifications where some of the characters disagree with the majority.
Definition at line 1651 of file control.cpp.
Definition at line 535 of file fixspace.cpp.
Definition at line 503 of file fixspace.cpp.
void tesseract::Tesseract::flip_0O | ( | WERD_RES * | word | ) |
Definition at line 673 of file reject.cpp.
void tesseract::Tesseract::flip_hyphens | ( | WERD_RES * | word | ) |
Definition at line 616 of file reject.cpp.
void tesseract::Tesseract::font_recognition_pass | ( | PAGE_RES * | page_res | ) |
font_recognition_pass
Smooth the fonts for the document.
Definition at line 1985 of file control.cpp.
inT16 tesseract::Tesseract::fp_eval_word_spacing | ( | WERD_RES_LIST & | word_res_list | ) |
Definition at line 830 of file fixspace.cpp.
GARBAGE_LEVEL tesseract::Tesseract::garbage_word | ( | WERD_RES * | word, |
BOOL8 | ok_dict_word | ||
) |
Definition at line 684 of file docqual.cpp.
UNICHAR_ID tesseract::Tesseract::get_rep_char | ( | WERD_RES * | word | ) |
Definition at line 283 of file output.cpp.
Definition at line 256 of file tesseractclass.h.
ImageData * tesseract::Tesseract::GetLineData | ( | const TBOX & | line_box, |
const GenericVector< TBOX > & | boxes, | ||
const GenericVector< STRING > & | texts, | ||
int | start_box, | ||
int | end_box, | ||
const BLOCK & | block | ||
) |
Definition at line 131 of file linerec.cpp.
ImageData * tesseract::Tesseract::GetRectImage | ( | const TBOX & | box, |
const BLOCK & | block, | ||
int | padding, | ||
TBOX * | revised_box | ||
) | const |
Definition at line 165 of file linerec.cpp.
void tesseract::Tesseract::GetSubAndSuperscriptCandidates | ( | const WERD_RES * | word, |
int * | num_rebuilt_leading, | ||
ScriptPos * | leading_pos, | ||
float * | leading_certainty, | ||
int * | num_rebuilt_trailing, | ||
ScriptPos * | trailing_pos, | ||
float * | trailing_certainty, | ||
float * | avg_certainty, | ||
float * | unlikely_threshold | ||
) |
Determine how many characters (rebuilt blobs) on each end of a given word might plausibly be superscripts so SubAndSuperscriptFix can try to re-recognize them. Even if we find no whole blobs at either end, we will set *unlikely_threshold to a certainty that might be used to select "bad enough" outlier characters. If *unlikely_threshold is set to 0, though, there's really no hope.
[in] | word | The word to examine. |
[out] | num_rebuilt_leading | the number of rebuilt blobs at the start of the word which are all up or down and seem badly classified. |
[out] | leading_pos | "super" or "sub" (for debugging) |
[out] | leading_certainty | the worst certainty in the leading blobs. |
[out] | num_rebuilt_trailing | the number of rebuilt blobs at the end of the word which are all up or down and seem badly classified. |
[out] | trailing_pos | "super" or "sub" (for debugging) |
[out] | trailing_certainty | the worst certainty in the trailing blobs. |
[out] | avg_certainty | the average certainty of "normal" blobs in the word. |
[out] | unlikely_threshold | the threshold (on certainty) we used to select "bad enough" outlier characters. |
Definition at line 253 of file superscript.cpp.
|
inline |
Definition at line 230 of file tesseractclass.h.
|
inline |
Definition at line 227 of file tesseractclass.h.
FILE * tesseract::Tesseract::init_recog_training | ( | const STRING & | fname | ) |
Definition at line 36 of file recogtraining.cpp.
int tesseract::Tesseract::init_tesseract | ( | const char * | arg0, |
const char * | textbase, | ||
const char * | language, | ||
OcrEngineMode | oem, | ||
char ** | configs, | ||
int | configs_size, | ||
const GenericVector< STRING > * | vars_vec, | ||
const GenericVector< STRING > * | vars_values, | ||
bool | set_only_init_params, | ||
TessdataManager * | mgr | ||
) |
Definition at line 295 of file tessedit.cpp.
|
inline |
Definition at line 504 of file tesseractclass.h.
int tesseract::Tesseract::init_tesseract_internal | ( | const char * | arg0, |
const char * | textbase, | ||
const char * | language, | ||
OcrEngineMode | oem, | ||
char ** | configs, | ||
int | configs_size, | ||
const GenericVector< STRING > * | vars_vec, | ||
const GenericVector< STRING > * | vars_values, | ||
bool | set_only_init_params, | ||
TessdataManager * | mgr | ||
) |
Definition at line 393 of file tessedit.cpp.
bool tesseract::Tesseract::init_tesseract_lang_data | ( | const char * | arg0, |
const char * | textbase, | ||
const char * | language, | ||
OcrEngineMode | oem, | ||
char ** | configs, | ||
int | configs_size, | ||
const GenericVector< STRING > * | vars_vec, | ||
const GenericVector< STRING > * | vars_values, | ||
bool | set_only_init_params, | ||
TessdataManager * | mgr | ||
) |
Definition at line 91 of file tessedit.cpp.
int tesseract::Tesseract::init_tesseract_lm | ( | const char * | arg0, |
const char * | textbase, | ||
const char * | language, | ||
TessdataManager * | mgr | ||
) |
Definition at line 457 of file tessedit.cpp.
void tesseract::Tesseract::join_words | ( | WERD_RES * | word, |
WERD_RES * | word2, | ||
BlamerBundle * | orig_bb | ||
) | const |
Definition at line 240 of file tfacepp.cpp.
void tesseract::Tesseract::LSTMRecognizeWord | ( | const BLOCK & | block, |
ROW * | row, | ||
WERD_RES * | word, | ||
PointerVector< WERD_RES > * | words | ||
) |
Definition at line 224 of file linerec.cpp.
Definition at line 196 of file fixspace.cpp.
void tesseract::Tesseract::match_word_pass_n | ( | int | pass_n, |
WERD_RES * | word, | ||
ROW * | row, | ||
BLOCK * | block | ||
) |
match_word_pass_n
Baseline normalize the word and pass it to Tess.
Definition at line 1576 of file control.cpp.
void tesseract::Tesseract::MaximallyChopWord | ( | const GenericVector< TBOX > & | boxes, |
BLOCK * | block, | ||
ROW * | row, | ||
WERD_RES * | word_res | ||
) |
Tests the chopper by exhaustively running chop_one_blob. The word_res will contain filled chopped_word, seam_array, denorm, box_word and best_state for the maximally chopped word.
Definition at line 253 of file applybox.cpp.
|
inline |
Definition at line 185 of file tesseractclass.h.
|
inline |
Definition at line 246 of file tesseractclass.h.
Definition at line 982 of file docqual.cpp.
BOOL8 tesseract::Tesseract::non_0_digit | ( | const UNICHARSET & | ch_set, |
UNICHAR_ID | unichar_id | ||
) |
Definition at line 789 of file reject.cpp.
BOOL8 tesseract::Tesseract::non_O_upper | ( | const UNICHARSET & | ch_set, |
UNICHAR_ID | unichar_id | ||
) |
Definition at line 785 of file reject.cpp.
|
inline |
Definition at line 253 of file tesseractclass.h.
Definition at line 292 of file reject.cpp.
void tesseract::Tesseract::output_pass | ( | PAGE_RES_IT & | page_res_it, |
const TBOX * | target_word_box | ||
) |
Definition at line 68 of file output.cpp.
void tesseract::Tesseract::ParseLanguageString | ( | const char * | lang_str, |
GenericVector< STRING > * | to_load, | ||
GenericVector< STRING > * | not_to_load | ||
) |
Definition at line 261 of file tessedit.cpp.
Top level editor operation: Set up a new window and a corresponding event handler
Definition at line 337 of file pgedit.cpp.
|
inline |
Definition at line 189 of file tesseractclass.h.
|
inline |
Definition at line 192 of file tesseractclass.h.
|
inline |
Definition at line 199 of file tesseractclass.h.
BOOL8 tesseract::Tesseract::potential_word_crunch | ( | WERD_RES * | word, |
GARBAGE_LEVEL | garbage_level, | ||
BOOL8 | ok_dict_word | ||
) |
Definition at line 546 of file docqual.cpp.
void tesseract::Tesseract::PreenXHeights | ( | BLOCK_LIST * | block_list | ) |
Any row xheight that is significantly different from the median is set to the median.
Definition at line 193 of file applybox.cpp.
void tesseract::Tesseract::PrepareForPageseg | ( | ) |
Definition at line 688 of file tesseractclass.cpp.
void tesseract::Tesseract::PrepareForTessOCR | ( | BLOCK_LIST * | block_list, |
Tesseract * | osd_tess, | ||
OSResults * | osr | ||
) |
Definition at line 719 of file tesseractclass.cpp.
void tesseract::Tesseract::PrerecAllWordsPar | ( | const GenericVector< WordData > & | words | ) |
Definition at line 39 of file par_control.cpp.
Definition at line 397 of file pgedit.cpp.
void tesseract::Tesseract::process_image_event | ( | const SVEvent & | event | ) |
User has done something in the image window - mouse down or up. Work out what it is and do something with it. If DOWN - just remember where it was. If UP - for each word in the selected area do the operation defined by the current mode.
Definition at line 564 of file pgedit.cpp.
void tesseract::Tesseract::process_selected_words | ( | PAGE_RES * | page_res, |
TBOX & | selection_box, | ||
BOOL8(tesseract::Tesseract::*)(PAGE_RES_IT *pr_it) | word_processor | ||
) |
Definition at line 30 of file pagewalk.cpp.
bool tesseract::Tesseract::ProcessTargetWord | ( | const TBOX & | word_box, |
const TBOX & | target_word_box, | ||
const char * | word_config, | ||
int | pass | ||
) |
Definition at line 121 of file control.cpp.
void tesseract::Tesseract::quality_based_rejection | ( | PAGE_RES_IT & | page_res_it, |
BOOL8 | good_quality_doc | ||
) |
Definition at line 143 of file docqual.cpp.
void tesseract::Tesseract::read_config_file | ( | const char * | filename, |
SetParamConstraint | constraint | ||
) |
Definition at line 60 of file tessedit.cpp.
bool tesseract::Tesseract::ReassignDiacritics | ( | int | pass, |
PAGE_RES_IT * | pr_it, | ||
bool * | make_next_word_fuzzy | ||
) |
Definition at line 927 of file control.cpp.
bool tesseract::Tesseract::recog_all_words | ( | PAGE_RES * | page_res, |
ETEXT_DESC * | monitor, | ||
const TBOX * | target_word_box, | ||
const char * | word_config, | ||
int | dopasses | ||
) |
Walk the page_res, recognizing all the words. If monitor is not null, it is used as a progress monitor/timeout/cancel. If dopasses is 0, all recognition passes are run, 1 just pass 1, 2 passes 2 and higher. If target_word_box is not null, special things are done to words that overlap the target_word_box: if word_config is not null, the word config file is read for just the target word(s), otherwise, on pass 2 and beyond ONLY the target words are processed (Jetsoft modification.) Returns false if we cancelled prematurely.
page_res | page structure |
monitor | progress monitor |
word_config | word_config file |
target_word_box | specifies just to extract a rectangle |
dopasses | 0 - all, 1 just pass 1, 2 passes 2 and higher |
Definition at line 300 of file control.cpp.
BOOL8 tesseract::Tesseract::recog_interactive | ( | PAGE_RES_IT * | pr_it | ) |
Recognize a single word in interactive mode.
pr_it | the page results iterator |
Definition at line 82 of file control.cpp.
Definition at line 67 of file control.cpp.
void tesseract::Tesseract::recog_training_segmented | ( | const STRING & | fname, |
PAGE_RES * | page_res, | ||
volatile ETEXT_DESC * | monitor, | ||
FILE * | output_file | ||
) |
Definition at line 79 of file recogtraining.cpp.
void tesseract::Tesseract::recog_word | ( | WERD_RES * | word | ) |
Definition at line 46 of file tfacepp.cpp.
void tesseract::Tesseract::recog_word_recursive | ( | WERD_RES * | word | ) |
Definition at line 110 of file tfacepp.cpp.
bool tesseract::Tesseract::RecogAllWordsPassN | ( | int | pass_n, |
ETEXT_DESC * | monitor, | ||
PAGE_RES_IT * | pr_it, | ||
GenericVector< WordData > * | words | ||
) |
Definition at line 210 of file control.cpp.
void tesseract::Tesseract::recognize_page | ( | STRING & | image_name | ) |
void tesseract::Tesseract::reject_edge_blobs | ( | WERD_RES * | word | ) |
Definition at line 263 of file reject.cpp.
void tesseract::Tesseract::reject_I_1_L | ( | WERD_RES * | word | ) |
Definition at line 191 of file reject.cpp.
void tesseract::Tesseract::reject_mostly_rejects | ( | WERD_RES * | word | ) |
Definition at line 573 of file reject.cpp.
void tesseract::Tesseract::rejection_passes | ( | PAGE_RES * | page_res, |
ETEXT_DESC * | monitor, | ||
const TBOX * | target_word_box, | ||
const char * | word_config | ||
) |
Definition at line 598 of file control.cpp.
Definition at line 582 of file reject.cpp.
void tesseract::Tesseract::ReportFailedBox | ( | int | boxfile_lineno, |
TBOX | box, | ||
const char * | box_ch, | ||
const char * | err_msg | ||
) |
void tesseract::Tesseract::ReportXhtFixResult | ( | bool | accept_new_word, |
float | new_x_ht, | ||
WERD_RES * | word, | ||
WERD_RES * | new_word | ||
) |
Definition at line 1413 of file control.cpp.
void tesseract::Tesseract::ReSegmentByClassification | ( | PAGE_RES * | page_res | ) |
Resegments the words by running the classifier in an attempt to find the correct segmentation that produces the required string.
Definition at line 509 of file applybox.cpp.
bool tesseract::Tesseract::ResegmentCharBox | ( | PAGE_RES * | page_res, |
const TBOX * | prev_box, | ||
const TBOX & | box, | ||
const TBOX & | next_box, | ||
const char * | correct_text | ||
) |
Gather consecutive blobs that match the given box into the best_state and corresponding correct_text.
Fights over which box owns which blobs are settled by pre-chopping and applying the blobs to box or next_box with the least non-overlap.
This means that occasionally, blobs may be incorrectly segmented if the chopper fails to find a suitable chop point.
Definition at line 340 of file applybox.cpp.
bool tesseract::Tesseract::ResegmentWordBox | ( | BLOCK_LIST * | block_list, |
const TBOX & | box, | ||
const TBOX & | next_box, | ||
const char * | correct_text | ||
) |
Consume all source blobs that strongly overlap the given box, putting them into a new word, with the correct_text label. Fights over which box owns which blobs are settled by applying the blobs to box or next_box with the least non-overlap.
Definition at line 438 of file applybox.cpp.
void tesseract::Tesseract::ResetAdaptiveClassifier | ( | ) |
Definition at line 658 of file tesseractclass.cpp.
void tesseract::Tesseract::ResetDocumentDictionary | ( | ) |
Definition at line 666 of file tesseractclass.cpp.
|
inline |
Definition at line 181 of file tesseractclass.h.
int tesseract::Tesseract::RetryWithLanguage | ( | const WordData & | word_data, |
WordRecognizer | recognizer, | ||
bool | debug, | ||
WERD_RES ** | in_word, | ||
PointerVector< WERD_RES > * | best_words | ||
) |
Definition at line 888 of file control.cpp.
|
inline |
Definition at line 250 of file tesseractclass.h.
Definition at line 607 of file reject.cpp.
|
inline |
Definition at line 233 of file tesseractclass.h.
|
inline |
Definition at line 236 of file tesseractclass.h.
void tesseract::Tesseract::script_pos_pass | ( | PAGE_RES * | page_res | ) |
Definition at line 718 of file control.cpp.
void tesseract::Tesseract::SearchForText | ( | const GenericVector< BLOB_CHOICE_LIST *> * | choices, |
int | choices_pos, | ||
int | choices_length, | ||
const GenericVector< UNICHAR_ID > & | target_text, | ||
int | text_index, | ||
float | rating, | ||
GenericVector< int > * | segmentation, | ||
float * | best_rating, | ||
GenericVector< int > * | best_segmentation | ||
) |
Recursive helper to find a match to the target_text (from text_index position) in the choices (from choices_pos position).
choices | is an array of GenericVectors, of length choices_length, with each element representing a starting position in the word, and the GenericVector holding classification results for a sequence of consecutive blobs, with index 0 being a single blob, index 1 being 2 blobs etc. |
choices_pos | |
choices_length | |
target_text | |
text_index | |
rating | |
segmentation | |
best_rating | |
best_segmentation |
Definition at line 629 of file applybox.cpp.
void tesseract::Tesseract::SearchWords | ( | PointerVector< WERD_RES > * | words | ) |
Definition at line 253 of file linerec.cpp.
int tesseract::Tesseract::SegmentPage | ( | const STRING * | input_file, |
BLOCK_LIST * | blocks, | ||
Tesseract * | osd_tess, | ||
OSResults * | osr | ||
) |
Segment the page according to the current value of tessedit_pageseg_mode. pix_binary_ is used as the source image and should not be NULL. On return the blocks list owns all the constructed page layout.
Definition at line 103 of file pagesegmain.cpp.
bool tesseract::Tesseract::SelectGoodDiacriticOutlines | ( | int | pass, |
float | certainty_threshold, | ||
PAGE_RES_IT * | pr_it, | ||
C_BLOB * | blob, | ||
const GenericVector< C_OUTLINE *> & | outlines, | ||
int | num_outlines, | ||
GenericVector< bool > * | ok_outlines | ||
) |
Definition at line 1122 of file control.cpp.
|
inline |
Definition at line 195 of file tesseractclass.h.
|
inline |
Definition at line 201 of file tesseractclass.h.
|
inline |
Definition at line 217 of file tesseractclass.h.
|
inline |
Definition at line 224 of file tesseractclass.h.
void tesseract::Tesseract::set_unlv_suspects | ( | WERD_RES * | word | ) |
Definition at line 305 of file output.cpp.
void tesseract::Tesseract::set_word_fonts | ( | WERD_RES * | word | ) |
set_word_fonts
Get the fonts for the word.
Definition at line 1907 of file control.cpp.
void tesseract::Tesseract::SetBlackAndWhitelist | ( | ) |
Definition at line 673 of file tesseractclass.cpp.
void tesseract::Tesseract::SetEquationDetect | ( | EquationDetect * | detector | ) |
Definition at line 652 of file tesseractclass.cpp.
|
inline |
Definition at line 239 of file tesseractclass.h.
void tesseract::Tesseract::SetupAllWordsPassN | ( | int | pass_n, |
const TBOX * | target_word_box, | ||
const char * | word_config, | ||
PAGE_RES * | page_res, | ||
GenericVector< WordData > * | words | ||
) |
If tesseract is to be run, sets the words up ready for it.
Definition at line 151 of file control.cpp.
PAGE_RES * tesseract::Tesseract::SetupApplyBoxes | ( | const GenericVector< TBOX > & | boxes, |
BLOCK_LIST * | block_list | ||
) |
Builds a PAGE_RES from the block_list in the way required for ApplyBoxes: All fuzzy spaces are removed, and all the words are maximally chopped.
Definition at line 217 of file applybox.cpp.
ColumnFinder * tesseract::Tesseract::SetupPageSegAndDetectOrientation | ( | PageSegMode | pageseg_mode, |
BLOCK_LIST * | blocks, | ||
Tesseract * | osd_tess, | ||
OSResults * | osr, | ||
TO_BLOCK_LIST * | to_blocks, | ||
Pix ** | photo_mask_pix, | ||
Pix ** | music_mask_pix | ||
) |
Sets up auto page segmentation, determines the orientation, and corrects it. Somewhat arbitrary chunk of functionality, factored out of AutoPageSeg to facilitate testing. photo_mask_pix is a pointer to a NULL pointer that will be filled on return with the leptonica photo mask, which must be pixDestroyed by the caller. to_blocks is an empty list that will be filled with (usually a single) block that is used during layout analysis. This ugly API is required because of the possibility of a unlv zone file. TODO(rays) clean this up. See AutoPageSeg for other arguments. The returned ColumnFinder must be deleted after use.
Definition at line 274 of file pagesegmain.cpp.
void tesseract::Tesseract::SetupUniversalFontIds | ( | ) |
Definition at line 436 of file tessedit.cpp.
Definition at line 174 of file control.cpp.
void tesseract::Tesseract::SetupWordScripts | ( | BLOCK_LIST * | blocks | ) |
|
inline |
Definition at line 221 of file tesseractclass.h.
void tesseract::Tesseract::split_and_recog_word | ( | WERD_RES * | word | ) |
Definition at line 144 of file tfacepp.cpp.
void tesseract::Tesseract::split_word | ( | WERD_RES * | word, |
int | split_pt, | ||
WERD_RES ** | right_piece, | ||
BlamerBundle ** | orig_blamer_bundle | ||
) | const |
Definition at line 182 of file tfacepp.cpp.
bool tesseract::Tesseract::SubAndSuperscriptFix | ( | WERD_RES * | word | ) |
Attempt to split off any high (or low) bits at the ends of the word with poor certainty and recognize them separately. If the certainty gets much better and other sanity checks pass, accept.
This superscript fix is meant to be called in the second pass of recognition when we have tried once and already have a preliminary answer for word.
Definition at line 101 of file superscript.cpp.
BOOL8 tesseract::Tesseract::terrible_word_crunch | ( | WERD_RES * | word, |
GARBAGE_LEVEL | garbage_level | ||
) |
Definition at line 508 of file docqual.cpp.
bool tesseract::Tesseract::tess_acceptable_word | ( | WERD_RES * | word | ) |
Definition at line 69 of file tessbox.cpp.
void tesseract::Tesseract::tess_add_doc_word | ( | WERD_CHOICE * | word_choice | ) |
Definition at line 79 of file tessbox.cpp.
Definition at line 39 of file tessbox.cpp.
bool tesseract::Tesseract::TestNewNormalization | ( | int | original_misfits, |
float | baseline_shift, | ||
float | new_x_ht, | ||
WERD_RES * | word, | ||
BLOCK * | block, | ||
ROW * | row | ||
) |
Definition at line 1468 of file control.cpp.
|
inline |
Definition at line 243 of file tesseractclass.h.
void tesseract::Tesseract::TidyUp | ( | PAGE_RES * | page_res | ) |
Definition at line 706 of file applybox.cpp.
void tesseract::Tesseract::tilde_crunch | ( | PAGE_RES_IT & | page_res_it | ) |
Definition at line 422 of file docqual.cpp.
void tesseract::Tesseract::tilde_delete | ( | PAGE_RES_IT & | page_res_it | ) |
Definition at line 594 of file docqual.cpp.
Definition at line 1434 of file control.cpp.
void tesseract::Tesseract::TrainFromBoxes | ( | const GenericVector< TBOX > & | boxes, |
const GenericVector< STRING > & | texts, | ||
BLOCK_LIST * | block_list, | ||
DocumentData * | training_data | ||
) |
Definition at line 76 of file linerec.cpp.
void tesseract::Tesseract::TrainLineRecognizer | ( | const STRING & | input_imagename, |
const STRING & | output_basename, | ||
BLOCK_LIST * | block_list | ||
) |
Definition at line 45 of file linerec.cpp.
WERD_RES * tesseract::Tesseract::TrySuperscriptSplits | ( | int | num_chopped_leading, |
float | leading_certainty, | ||
ScriptPos | leading_pos, | ||
int | num_chopped_trailing, | ||
float | trailing_certainty, | ||
ScriptPos | trailing_pos, | ||
WERD_RES * | word, | ||
bool * | is_good, | ||
int * | retry_rebuild_leading, | ||
int * | retry_rebuild_trailing | ||
) |
Try splitting off the given number of (chopped) blobs from the front and back of the given word and recognizing the pieces.
[in] | num_chopped_leading | how many chopped blobs from the left end of the word to chop off and try recognizing as a superscript (or subscript) |
[in] | leading_certainty | the (minimum) certainty had by the characters in the original leading section. |
[in] | leading_pos | "super" or "sub" (for debugging) |
[in] | num_chopped_trailing | how many chopped blobs from the right end of the word to chop off and try recognizing as a superscript (or subscript) |
[in] | trailing_certainty | the (minimum) certainty had by the characters in the original trailing section. |
[in] | trailing_pos | "super" or "sub" (for debugging) |
[in] | word | the word to try to chop up. |
[out] | is_good | do we believe our result? |
[out] | retry_rebuild_leading,retry_rebuild_trailing | If non-zero, and !is_good, then the caller may have luck trying to split the returned word with this number of (rebuilt) leading and trailing blobs / unichars. |
Definition at line 382 of file superscript.cpp.
Definition at line 120 of file docqual.cpp.
void tesseract::Tesseract::unrej_good_quality_words | ( | PAGE_RES_IT & | page_res_it | ) |
Definition at line 165 of file docqual.cpp.
Definition at line 45 of file adaptions.cpp.
BOOL8 tesseract::Tesseract::word_blank_and_set_display | ( | PAGE_RES_IT * | pr_its | ) |
Definition at line 716 of file pgedit.cpp.
BOOL8 tesseract::Tesseract::word_bln_display | ( | PAGE_RES_IT * | pr_it | ) |
Normalize word and display in word window
Definition at line 728 of file pgedit.cpp.
Definition at line 65 of file docqual.cpp.
void tesseract::Tesseract::word_char_quality | ( | WERD_RES * | word, |
ROW * | row, | ||
inT16 * | match_count, | ||
inT16 * | accepted_match_count | ||
) |
Definition at line 97 of file docqual.cpp.
BOOL8 tesseract::Tesseract::word_contains_non_1_digit | ( | const char * | word, |
const char * | word_lengths | ||
) |
Definition at line 509 of file reject.cpp.
CRUNCH_MODE tesseract::Tesseract::word_deletable | ( | WERD_RES * | word, |
inT16 & | delete_mode | ||
) |
Definition at line 899 of file docqual.cpp.
BOOL8 tesseract::Tesseract::word_display | ( | PAGE_RES_IT * | pr_it | ) |
word_display() Word Processor
Display a word according to its display modes
Definition at line 760 of file pgedit.cpp.
BOOL8 tesseract::Tesseract::word_dumper | ( | PAGE_RES_IT * | pr_it | ) |
Dump members to the debug window
Definition at line 921 of file pgedit.cpp.
Definition at line 77 of file docqual.cpp.
BOOL8 tesseract::Tesseract::word_set_display | ( | PAGE_RES_IT * | pr_it | ) |
word_set_display() Word processor
Display word according to current display mode settings
Definition at line 945 of file pgedit.cpp.
Definition at line 680 of file fixspace.cpp.
void tesseract::Tesseract::write_results | ( | PAGE_RES_IT & | page_res_it, |
char | newline_type, | ||
BOOL8 | force_eol | ||
) |
Definition at line 130 of file output.cpp.
int tesseract::Tesseract::applybox_debug = 1 |
"Debug level"
Definition at line 830 of file tesseractclass.h.
char* tesseract::Tesseract::applybox_exposure_pattern = ".exp" |
"Exposure value follows this pattern in the image" " filename. The name of the image files are expected" " to be in the form [lang].[fontname].exp[num].tif"
Definition at line 835 of file tesseractclass.h.
bool tesseract::Tesseract::applybox_learn_chars_and_char_frags_mode = false |
"Learn both character fragments (as is done in the" " special low exposure mode) as well as unfragmented" " characters."
Definition at line 839 of file tesseractclass.h.
bool tesseract::Tesseract::applybox_learn_ngrams_mode = false |
"Each bounding box is assumed to contain ngrams. Only" " learn the ngrams whose outlines overlap horizontally."
Definition at line 842 of file tesseractclass.h.
int tesseract::Tesseract::applybox_page = 0 |
"Page number to apply boxes from"
Definition at line 831 of file tesseractclass.h.
double tesseract::Tesseract::bestrate_pruning_factor = 2.0 |
"Multiplying factor of" " current best rate to prune other hypotheses"
Definition at line 1117 of file tesseractclass.h.
int tesseract::Tesseract::bidi_debug = 0 |
"Debug level for BiDi"
Definition at line 829 of file tesseractclass.h.
bool tesseract::Tesseract::bland_unrej = false |
"unrej potential with no checks"
Definition at line 943 of file tesseractclass.h.
char* tesseract::Tesseract::chs_leading_punct = "('`\"" |
"Leading punctuation"
Definition at line 882 of file tesseractclass.h.
char* tesseract::Tesseract::chs_trailing_punct1 = ").,;:?!" |
"1st Trailing punctuation"
Definition at line 883 of file tesseractclass.h.
char* tesseract::Tesseract::chs_trailing_punct2 = ")'`\"" |
"2nd Trailing punctuation"
Definition at line 884 of file tesseractclass.h.
char* tesseract::Tesseract::conflict_set_I_l_1 = "Il1[]" |
"Il1 conflict set"
Definition at line 1059 of file tesseractclass.h.
bool tesseract::Tesseract::crunch_accept_ok = true |
"Use acceptability in okstring"
Definition at line 972 of file tesseractclass.h.
int tesseract::Tesseract::crunch_debug = 0 |
"As it says"
Definition at line 981 of file tesseractclass.h.
double tesseract::Tesseract::crunch_del_cert = -10.0 |
"POTENTIAL crunch cert lt this"
Definition at line 961 of file tesseractclass.h.
double tesseract::Tesseract::crunch_del_high_word = 1.5 |
"Del if word gt xht x this above bl"
Definition at line 966 of file tesseractclass.h.
double tesseract::Tesseract::crunch_del_low_word = 0.5 |
"Del if word gt xht x this below bl"
Definition at line 967 of file tesseractclass.h.
double tesseract::Tesseract::crunch_del_max_ht = 3.0 |
"Del if word ht gt xht x this"
Definition at line 963 of file tesseractclass.h.
double tesseract::Tesseract::crunch_del_min_ht = 0.7 |
"Del if word ht lt xht x this"
Definition at line 962 of file tesseractclass.h.
double tesseract::Tesseract::crunch_del_min_width = 3.0 |
"Del if word width lt xht x this"
Definition at line 964 of file tesseractclass.h.
double tesseract::Tesseract::crunch_del_rating = 60 |
"POTENTIAL crunch rating lt this"
Definition at line 960 of file tesseractclass.h.
bool tesseract::Tesseract::crunch_early_convert_bad_unlv_chs = false |
"Take out ~^ early?"
Definition at line 951 of file tesseractclass.h.
bool tesseract::Tesseract::crunch_early_merge_tess_fails = true |
"Before word crunch?"
Definition at line 950 of file tesseractclass.h.
bool tesseract::Tesseract::crunch_include_numerals = false |
"Fiddle alpha figures"
Definition at line 975 of file tesseractclass.h.
bool tesseract::Tesseract::crunch_leave_accept_strings = false |
"Don't pot crunch sensible strings"
Definition at line 974 of file tesseractclass.h.
int tesseract::Tesseract::crunch_leave_lc_strings = 4 |
"Don't crunch words with long lower case strings"
Definition at line 977 of file tesseractclass.h.
bool tesseract::Tesseract::crunch_leave_ok_strings = true |
"Don't touch sensible strings"
Definition at line 971 of file tesseractclass.h.
int tesseract::Tesseract::crunch_leave_uc_strings = 4 |
"Don't crunch words with long upper case strings"
Definition at line 979 of file tesseractclass.h.
int tesseract::Tesseract::crunch_long_repetitions = 3 |
"Crunch words with long repetitions"
Definition at line 980 of file tesseractclass.h.
double tesseract::Tesseract::crunch_poor_garbage_cert = -9.0 |
"crunch garbage cert lt this"
Definition at line 955 of file tesseractclass.h.
double tesseract::Tesseract::crunch_poor_garbage_rate = 60 |
"crunch garbage rating lt this"
Definition at line 956 of file tesseractclass.h.
bool tesseract::Tesseract::crunch_pot_garbage = true |
"POTENTIAL crunch garbage"
Definition at line 959 of file tesseractclass.h.
int tesseract::Tesseract::crunch_pot_indicators = 1 |
"How many potential indicators needed"
Definition at line 970 of file tesseractclass.h.
double tesseract::Tesseract::crunch_pot_poor_cert = -8.0 |
"POTENTIAL crunch cert lt this"
Definition at line 958 of file tesseractclass.h.
double tesseract::Tesseract::crunch_pot_poor_rate = 40 |
"POTENTIAL crunch rating lt this"
Definition at line 957 of file tesseractclass.h.
int tesseract::Tesseract::crunch_rating_max = 10 |
"For adj length in rating per ch"
Definition at line 969 of file tesseractclass.h.
double tesseract::Tesseract::crunch_small_outlines_size = 0.6 |
"Small if lt xht x this"
Definition at line 968 of file tesseractclass.h.
bool tesseract::Tesseract::crunch_terrible_garbage = true |
"As it says"
Definition at line 953 of file tesseractclass.h.
double tesseract::Tesseract::crunch_terrible_rating = 80.0 |
"crunch rating lt this"
Definition at line 952 of file tesseractclass.h.
bool tesseract::Tesseract::debug_acceptable_wds = false |
"Dump word pass/fail chk"
Definition at line 881 of file tesseractclass.h.
int tesseract::Tesseract::debug_fix_space_level = 0 |
"Contextual fixspace debug"
Definition at line 987 of file tesseractclass.h.
int tesseract::Tesseract::debug_noise_removal = 0 |
"Debug reassignment of small outlines"
Definition at line 865 of file tesseractclass.h.
int tesseract::Tesseract::debug_x_ht_level = 0 |
"Reestimate debug"
Definition at line 880 of file tesseractclass.h.
bool tesseract::Tesseract::docqual_excuse_outline_errs = false |
"Allow outline errs in unrejection?"
Definition at line 911 of file tesseractclass.h.
bool tesseract::Tesseract::enable_new_segsearch = false |
"Enable new segmentation search path."
Definition at line 1157 of file tesseractclass.h.
bool tesseract::Tesseract::enable_noise_removal = true |
"Remove and conditionally reassign small outlines when they" " confuse layout analysis, determining diacritics vs noise"
Definition at line 864 of file tesseractclass.h.
char* tesseract::Tesseract::file_type = ".tif" |
"Filename extension"
Definition at line 1066 of file tesseractclass.h.
int tesseract::Tesseract::fixsp_done_mode = 1 |
"What constitutes done for spacing"
Definition at line 986 of file tesseractclass.h.
int tesseract::Tesseract::fixsp_non_noise_limit = 1 |
"How many non-noise blbs either side?"
Definition at line 983 of file tesseractclass.h.
double tesseract::Tesseract::fixsp_small_outlines_size = 0.28 |
"Small if lt xht x this"
Definition at line 984 of file tesseractclass.h.
double tesseract::Tesseract::heuristic_max_char_wh_ratio = 2.0 |
"max char width-to-height ratio allowed in segmentation"
Definition at line 1155 of file tesseractclass.h.
double tesseract::Tesseract::heuristic_segcost_rating_base = 1.25 |
"base factor for adding segmentation cost into word rating. " "It's a multiplying factor, the larger the value above 1, " "the bigger the effect of segmentation cost."
Definition at line 1146 of file tesseractclass.h.
double tesseract::Tesseract::heuristic_weight_rating = 1 |
"weight associated with char rating in combined cost of state"
Definition at line 1148 of file tesseractclass.h.
double tesseract::Tesseract::heuristic_weight_seamcut = 0 |
"weight associated with seam cut in combined cost of state"
Definition at line 1153 of file tesseractclass.h.
double tesseract::Tesseract::heuristic_weight_width = 1000.0 |
"weight associated with width evidence in combined cost of" " state"
Definition at line 1151 of file tesseractclass.h.
bool tesseract::Tesseract::hocr_font_info = false |
"Add font info to hocr output"
Definition at line 949 of file tesseractclass.h.
bool tesseract::Tesseract::include_page_breaks = false |
"Include page separator string in output text after each " "image/page."
Definition at line 1097 of file tesseractclass.h.
bool tesseract::Tesseract::interactive_display_mode = false |
"Run interactively?"
Definition at line 1065 of file tesseractclass.h.
int tesseract::Tesseract::language_model_fixed_length_choices_depth = 3 |
"Depth of blob choice lists to explore" " when fixed length dawgs are on"
Definition at line 1140 of file tesseractclass.h.
bool tesseract::Tesseract::load_fixed_length_dawgs = true |
"Load fixed length" " dawgs (e.g. for non-space delimited languages)"
Definition at line 1113 of file tesseractclass.h.
bool tesseract::Tesseract::lstm_use_matrix = 1 |
"Use ratings matrix/beam search with lstm"
Definition at line 907 of file tesseractclass.h.
double tesseract::Tesseract::min_orientation_margin = 7.0 |
"Min acceptable orientation margin"
Definition at line 1075 of file tesseractclass.h.
int tesseract::Tesseract::min_sane_x_ht_pixels = 8 |
"Reject any x-ht lt or eq than this"
Definition at line 1060 of file tesseractclass.h.
int tesseract::Tesseract::multilang_debug_level = 0 |
"Print multilang debug info."
Definition at line 902 of file tesseractclass.h.
bool tesseract::Tesseract::ngram_permuter_activated = false |
"Activate character-level n-gram-based permuter"
Definition at line 1136 of file tesseractclass.h.
double tesseract::Tesseract::noise_cert_basechar = -8.0 |
"Hingepoint for base char certainty"
Definition at line 868 of file tesseractclass.h.
double tesseract::Tesseract::noise_cert_disjoint = -2.5 |
"Hingepoint for disjoint certainty"
Definition at line 871 of file tesseractclass.h.
double tesseract::Tesseract::noise_cert_factor = 0.375 |
"Scaling on certainty diff from Hingepoint"
Definition at line 877 of file tesseractclass.h.
double tesseract::Tesseract::noise_cert_punc = -2.5 |
"Threshold for new punc char certainty"
Definition at line 874 of file tesseractclass.h.
int tesseract::Tesseract::noise_maxperblob = 8 |
"Max diacritics to apply to a blob"
Definition at line 878 of file tesseractclass.h.
int tesseract::Tesseract::noise_maxperword = 16 |
"Max diacritics to apply to a word"
Definition at line 879 of file tesseractclass.h.
char* tesseract::Tesseract::numeric_punctuation = ".," |
"Punct. chs expected WITHIN numbers"
Definition at line 989 of file tesseractclass.h.
int tesseract::Tesseract::ocr_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT |
"Whether to use the top-line splitting process for Devanagari " "documents while performing ocr."
Definition at line 824 of file tesseractclass.h.
char* tesseract::Tesseract::ok_repeated_ch_non_alphanum_wds = "-?*\075" |
"Allow NN to unrej"
Definition at line 1058 of file tesseractclass.h.
char* tesseract::Tesseract::outlines_2 = "ij!?%\":;" |
"Non standard number of outlines"
Definition at line 909 of file tesseractclass.h.
char* tesseract::Tesseract::outlines_odd = "%| " |
"Non standard number of outlines"
Definition at line 908 of file tesseractclass.h.
char* tesseract::Tesseract::page_separator = "\f" |
"Page separator (default is form feed control character)"
Definition at line 1099 of file tesseractclass.h.
int tesseract::Tesseract::pageseg_devanagari_split_strategy = tesseract::ShiroRekhaSplitter::NO_SPLIT |
"Whether to use the top-line splitting process for Devanagari " "documents while performing page-segmentation."
Definition at line 820 of file tesseractclass.h.
int tesseract::Tesseract::paragraph_debug_level = 0 |
"Print paragraph debug info."
Definition at line 903 of file tesseractclass.h.
bool tesseract::Tesseract::paragraph_text_based = true |
"Run paragraph detection on the post-text-recognition " "(more accurate)"
Definition at line 906 of file tesseractclass.h.
bool tesseract::Tesseract::permute_chartype_word = 0 |
"Turn on character type (property) consistency permuter"
Definition at line 1129 of file tesseractclass.h.
bool tesseract::Tesseract::permute_debug = 0 |
"char permutation debug"
Definition at line 1115 of file tesseractclass.h.
bool tesseract::Tesseract::permute_fixed_length_dawg = 0 |
"Turn on fixed-length phrasebook search permuter"
Definition at line 1127 of file tesseractclass.h.
bool tesseract::Tesseract::permute_only_top = false |
"Run only the top choice permuter"
Definition at line 1137 of file tesseractclass.h.
bool tesseract::Tesseract::permute_script_word = 0 |
"Turn on word script consistency permuter"
Definition at line 1119 of file tesseractclass.h.
bool tesseract::Tesseract::poly_allow_detailed_fx = false |
"Allow feature extractors to see the original outline"
Definition at line 1079 of file tesseractclass.h.
bool tesseract::Tesseract::preserve_interword_spaces = false |
"Preserve multiple interword spaces"
Definition at line 1094 of file tesseractclass.h.
double tesseract::Tesseract::quality_blob_pc = 0.0 |
"good_quality_doc gte good blobs limit"
Definition at line 886 of file tesseractclass.h.
double tesseract::Tesseract::quality_char_pc = 0.95 |
"good_quality_doc gte good char limit"
Definition at line 889 of file tesseractclass.h.
int tesseract::Tesseract::quality_min_initial_alphas_reqd = 2 |
"alphas in a good word"
Definition at line 890 of file tesseractclass.h.
double tesseract::Tesseract::quality_outline_pc = 1.0 |
"good_quality_doc lte outline error limit"
Definition at line 888 of file tesseractclass.h.
double tesseract::Tesseract::quality_rej_pc = 0.08 |
"good_quality_doc lte rejection limit"
Definition at line 885 of file tesseractclass.h.
double tesseract::Tesseract::quality_rowrej_pc = 1.1 |
"good_quality_doc gte good char limit"
Definition at line 945 of file tesseractclass.h.
bool tesseract::Tesseract::rej_1Il_trust_permuter_type = true |
"Don't double check"
Definition at line 1049 of file tesseractclass.h.
bool tesseract::Tesseract::rej_1Il_use_dict_word = false |
"Use dictword test"
Definition at line 1048 of file tesseractclass.h.
bool tesseract::Tesseract::rej_alphas_in_number_perm = false |
"Extend permuter check"
Definition at line 1054 of file tesseractclass.h.
bool tesseract::Tesseract::rej_trust_doc_dawg = false |
"Use DOC dawg in 11l conf. detector"
Definition at line 1047 of file tesseractclass.h.
bool tesseract::Tesseract::rej_use_good_perm = true |
"Individual rejection control"
Definition at line 1052 of file tesseractclass.h.
bool tesseract::Tesseract::rej_use_sensible_wd = false |
"Extend permuter check"
Definition at line 1053 of file tesseractclass.h.
bool tesseract::Tesseract::rej_use_tess_accepted = true |
"Individual rejection control"
Definition at line 1050 of file tesseractclass.h.
bool tesseract::Tesseract::rej_use_tess_blanks = true |
"Individual rejection control"
Definition at line 1051 of file tesseractclass.h.
double tesseract::Tesseract::rej_whole_of_mostly_reject_word_fract = 0.85 |
"if >this fract"
Definition at line 1055 of file tesseractclass.h.
int tesseract::Tesseract::segment_debug = 0 |
"Debug the whole segmentation process"
Definition at line 1114 of file tesseractclass.h.
double tesseract::Tesseract::segment_reward_chartype = 0.97 |
"Score multiplier for char type consistency within a word. "
Definition at line 1131 of file tesseractclass.h.
double tesseract::Tesseract::segment_reward_ngram_best_choice = 0.99 |
"Score multiplier for ngram permuter's best choice" " (only used in the Han script path)."
Definition at line 1134 of file tesseractclass.h.
double tesseract::Tesseract::segment_reward_script = 0.95 |
"Score multiplier for script consistency within a word. " "Being a 'reward' factor, it should be <= 1. " "Smaller value implies bigger reward."
Definition at line 1125 of file tesseractclass.h.
bool tesseract::Tesseract::segment_segcost_rating = 0 |
"incorporate segmentation cost in word rating?"
Definition at line 1121 of file tesseractclass.h.
double tesseract::Tesseract::segsearch_max_fixed_pitch_char_wh_ratio = 2.0 |
"Maximum character width-to-height ratio for" " fixed pitch fonts"
Definition at line 1160 of file tesseractclass.h.
double tesseract::Tesseract::subscript_max_y_top = 0.5 |
"Maximum top of a character measured as a multiple of x-height " "above the baseline for us to reconsider whether it's a " "subscript."
Definition at line 1008 of file tesseractclass.h.
double tesseract::Tesseract::superscript_bettered_certainty = 0.97 |
"What reduction in " "badness do we think sufficient to choose a superscript over " "what we'd thought. For example, a value of 0.6 means we want " "to reduce badness of certainty by 40%"
Definition at line 1000 of file tesseractclass.h.
int tesseract::Tesseract::superscript_debug = 0 |
"Debug level for sub & superscript fixer"
Definition at line 993 of file tesseractclass.h.
double tesseract::Tesseract::superscript_min_y_bottom = 0.3 |
"Minimum bottom of a character measured as a multiple of " "x-height above the baseline for us to reconsider whether it's " "a superscript."
Definition at line 1012 of file tesseractclass.h.
double tesseract::Tesseract::superscript_scaledown_ratio = 0.4 |
"A superscript scaled down more than this is unbelievably " "small. For example, 0.3 means we expect the font size to " "be no smaller than 30% of the text line font size."
Definition at line 1004 of file tesseractclass.h.
double tesseract::Tesseract::superscript_worse_certainty = 2.0 |
"How many times worse " "certainty does a superscript position glyph need to be for us " "to try classifying it as a char with a different baseline?"
Definition at line 996 of file tesseractclass.h.
double tesseract::Tesseract::suspect_accept_rating = -999.9 |
"Accept good rating limit"
Definition at line 1032 of file tesseractclass.h.
bool tesseract::Tesseract::suspect_constrain_1Il = false |
"UNLV keep 1Il chars rejected"
Definition at line 1030 of file tesseractclass.h.
int tesseract::Tesseract::suspect_level = 99 |
"Suspect marker level"
Definition at line 1026 of file tesseractclass.h.
double tesseract::Tesseract::suspect_rating_per_ch = 999.9 |
"Don't touch bad rating limit"
Definition at line 1031 of file tesseractclass.h.
int tesseract::Tesseract::suspect_short_words = 2 |
"Don't Suspect dict wds longer than this"
Definition at line 1029 of file tesseractclass.h.
int tesseract::Tesseract::suspect_space_level = 100 |
"Min suspect level for rejecting spaces"
Definition at line 1028 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_adaption_debug = false |
"Generate and print debug information for adaption"
Definition at line 828 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_ambigs_training = false |
"Perform training for ambiguities"
Definition at line 816 of file tesseractclass.h.
int tesseract::Tesseract::tessedit_bigram_debug = 0 |
"Amount of debug output for bigram " "correction."
Definition at line 861 of file tesseractclass.h.
char* tesseract::Tesseract::tessedit_char_blacklist = "" |
"Blacklist of chars not to recognize"
Definition at line 810 of file tesseractclass.h.
char* tesseract::Tesseract::tessedit_char_unblacklist = "" |
"List of chars to override tessedit_char_blacklist"
Definition at line 814 of file tesseractclass.h.
char* tesseract::Tesseract::tessedit_char_whitelist = "" |
"Whitelist of chars to recognize"
Definition at line 812 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_consistent_reps = true |
"Force all rep chars the same"
Definition at line 1039 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_create_boxfile = false |
"Output text with boxes"
Definition at line 1061 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_create_hocr = false |
"Write .html hOCR output file"
Definition at line 1019 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_create_pdf = false |
"Write .pdf output file"
Definition at line 1021 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_create_tsv = false |
"Write .tsv output file"
Definition at line 1020 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_create_txt = false |
"Write .txt output file"
Definition at line 1018 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_debug_block_rejection = false |
"Block and Row stats"
Definition at line 855 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_debug_doc_rejection = false |
"Page stats"
Definition at line 940 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_debug_fonts = false |
"Output font info per char"
Definition at line 854 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_debug_quality_metrics = false |
"Output data to debug file"
Definition at line 942 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_display_outwords = false |
"Draw output words"
Definition at line 843 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_dont_blkrej_good_wds = false |
"Use word segmentation quality metric"
Definition at line 929 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_dont_rowrej_good_wds = false |
"Use word segmentation quality metric"
Definition at line 931 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_dump_choices = false |
"Dump char choices"
Definition at line 844 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_dump_pageseg_images = false |
"Dump intermediate images made during page segmentation"
Definition at line 801 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_enable_bigram_correction = true |
"Enable correction based on the word bigram dictionary."
Definition at line 857 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_enable_dict_correction = false |
"Enable single word correction based on the dictionary."
Definition at line 859 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_enable_doc_dict = true |
"Add words to the document dictionary"
Definition at line 853 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_fix_fuzzy_spaces = true |
"Try to improve fuzzy spaces"
Definition at line 847 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_fix_hyphens = true |
"Crunch double hyphens?"
Definition at line 850 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_flip_0O = true |
"Contextual 0O O0 flips"
Definition at line 1042 of file tesseractclass.h.
double tesseract::Tesseract::tessedit_good_doc_still_rowrej_wd = 1.1 |
"rej good doc wd if more than this fraction rejected"
Definition at line 937 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_good_quality_unrej = true |
"Reduce rejection on good docs"
Definition at line 913 of file tesseractclass.h.
int tesseract::Tesseract::tessedit_image_border = 2 |
"Rej blbs near image edge limit"
Definition at line 1056 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_init_config_only = false |
"Only initialize with the config file. Useful if the instance is " "not going to be used for OCR but say only for layout analysis."
Definition at line 1082 of file tesseractclass.h.
char* tesseract::Tesseract::tessedit_load_sublangs = "" |
"List of languages to load with this one"
Definition at line 1069 of file tesseractclass.h.
double tesseract::Tesseract::tessedit_lower_flip_hyphen = 1.5 |
"Aspect ratio dot/hyphen test"
Definition at line 1044 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_make_boxes_from_boxes = false |
"Generate more boxes from boxed chars"
Definition at line 797 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_matcher_log = false |
"Log matcher activity"
Definition at line 896 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_minimal_rej_pass1 = false |
"Do minimal rejection on pass 1 output"
Definition at line 894 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_minimal_rejection = false |
"Only reject tess failures"
Definition at line 1033 of file tesseractclass.h.
int tesseract::Tesseract::tessedit_ocr_engine_mode = tesseract::OEM_DEFAULT |
"Which OCR engine(s) to run (Tesseract, LSTM, both). Defaults" " to loading and running the most accurate available."
Definition at line 808 of file tesseractclass.h.
int tesseract::Tesseract::tessedit_ok_mode = 5 |
"Acceptance decision algorithm"
Definition at line 1111 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_override_permuter = true |
"According to dict_word"
Definition at line 1067 of file tesseractclass.h.
int tesseract::Tesseract::tessedit_page_number = -1 |
"-1 -> All pages, else specific page to process"
Definition at line 1063 of file tesseractclass.h.
int tesseract::Tesseract::tessedit_pageseg_mode = PSM_SINGLE_BLOCK |
"Page seg mode: 0=osd only, 1=auto+osd, 2=auto, 3=col, 4=block," " 5=line, 6=word, 7=char" " (Values from PageSegMode enum in publictypes.h)"
Definition at line 805 of file tesseractclass.h.
int tesseract::Tesseract::tessedit_parallelize = 0 |
"Run in parallel where possible"
Definition at line 1092 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_prefer_joined_punct = false |
"Reward punctuation joins"
Definition at line 985 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_preserve_blk_rej_perfect_wds = true |
"Only rej partially rejected words in block rejection"
Definition at line 925 of file tesseractclass.h.
int tesseract::Tesseract::tessedit_preserve_min_wd_len = 2 |
"Only preserve wds longer than this"
Definition at line 933 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_preserve_row_rej_perfect_wds = true |
"Only rej partially rejected words in row rejection"
Definition at line 927 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_redo_xheight = true |
"Check/Correct x-height"
Definition at line 851 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_reject_bad_qual_wds = true |
"Reject all bad quality wds"
Definition at line 939 of file tesseractclass.h.
double tesseract::Tesseract::tessedit_reject_block_percent = 45.00 |
"%rej allowed before rej whole block"
Definition at line 918 of file tesseractclass.h.
double tesseract::Tesseract::tessedit_reject_doc_percent = 65.00 |
"%rej allowed before rej whole doc"
Definition at line 916 of file tesseractclass.h.
int tesseract::Tesseract::tessedit_reject_mode = 0 |
"Rejection algorithm"
Definition at line 1040 of file tesseractclass.h.
double tesseract::Tesseract::tessedit_reject_row_percent = 40.00 |
"%rej allowed before rej whole row"
Definition at line 920 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_rejection_debug = false |
"Adaption debug"
Definition at line 1041 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_resegment_from_boxes = false |
"Take segmentation and labeling from box file"
Definition at line 791 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_resegment_from_line_boxes = false |
"Conversion of word/line box file to char box file"
Definition at line 793 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_row_rej_good_docs = true |
"Apply row rejection to good docs"
Definition at line 935 of file tesseractclass.h.
int tesseract::Tesseract::tessedit_tess_adaption_mode = 0x27 |
"Adaptation decision algorithm for tess"
Definition at line 892 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_test_adaption = false |
"Test adaption criteria"
Definition at line 895 of file tesseractclass.h.
int tesseract::Tesseract::tessedit_test_adaption_mode = 3 |
"Adaptation decision algorithm for tess"
Definition at line 898 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_timing_debug = false |
"Print timing stats"
Definition at line 845 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_train_from_boxes = false |
"Generate training data from boxed chars"
Definition at line 795 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_train_line_recognizer = false |
"Break input into lines and remap boxes if present"
Definition at line 799 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_unrej_any_wd = false |
"Don't bother with word plausibility"
Definition at line 849 of file tesseractclass.h.
double tesseract::Tesseract::tessedit_upper_flip_hyphen = 1.8 |
"Aspect ratio dot/hyphen test"
Definition at line 1046 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_use_primary_params_model = false |
"In multilingual mode use params model of the primary language"
Definition at line 1071 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_use_reject_spaces = true |
"Reject spaces?"
Definition at line 914 of file tesseractclass.h.
double tesseract::Tesseract::tessedit_whole_wd_rej_row_percent = 70.00 |
"Number of row rejects in whole word rejects which prevents whole row rejection"
Definition at line 923 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_word_for_word = false |
"Make output have exactly one word per WERD"
Definition at line 1036 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_write_block_separators = false |
"Write block separators in output"
Definition at line 1014 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_write_images = false |
"Capture the image from the IPE"
Definition at line 1064 of file tesseractclass.h.
char* tesseract::Tesseract::tessedit_write_params_to_file = "" |
"Write all parameters to the given file."
Definition at line 826 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_write_rep_codes = false |
"Write repetition char code"
Definition at line 1016 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_write_unlv = false |
"Write .unlv output file"
Definition at line 1017 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_zero_kelvin_rejection = false |
"Don't reject ANYTHING AT ALL"
Definition at line 1038 of file tesseractclass.h.
bool tesseract::Tesseract::tessedit_zero_rejection = false |
"Don't reject ANYTHING"
Definition at line 1034 of file tesseractclass.h.
bool tesseract::Tesseract::test_pt = false |
"Test for point"
Definition at line 899 of file tesseractclass.h.
double tesseract::Tesseract::test_pt_x = 99999.99 |
"xcoord"
Definition at line 900 of file tesseractclass.h.
double tesseract::Tesseract::test_pt_y = 99999.99 |
"ycoord"
Definition at line 901 of file tesseractclass.h.
bool tesseract::Tesseract::textonly_pdf = false |
"Create PDF with only one invisible text layer"
Definition at line 1023 of file tesseractclass.h.
bool tesseract::Tesseract::textord_equation_detect = false |
"Turn on equation detector"
Definition at line 1083 of file tesseractclass.h.
double tesseract::Tesseract::textord_tabfind_aligned_gap_fraction = 0.75 |
"Fraction of height used as a minimum gap for aligned blobs."
Definition at line 1091 of file tesseractclass.h.
bool tesseract::Tesseract::textord_tabfind_force_vertical_text = false |
"Force using vertical text page mode"
Definition at line 1086 of file tesseractclass.h.
bool tesseract::Tesseract::textord_tabfind_show_vlines = false |
"Debug line finding"
Definition at line 1076 of file tesseractclass.h.
bool tesseract::Tesseract::textord_tabfind_vertical_horizontal_mix = true |
"find horizontal lines such as headers in vertical page mode"
Definition at line 1110 of file tesseractclass.h.
bool tesseract::Tesseract::textord_tabfind_vertical_text = true |
"Enable vertical detection"
Definition at line 1084 of file tesseractclass.h.
double tesseract::Tesseract::textord_tabfind_vertical_text_ratio = 0.5 |
"Fraction of textlines deemed vertical to use vertical page mode"
Definition at line 1089 of file tesseractclass.h.
bool tesseract::Tesseract::textord_use_cjk_fp_model = FALSE |
"Use CJK fixed pitch model"
Definition at line 1077 of file tesseractclass.h.
bool tesseract::Tesseract::unlv_tilde_crunching = true |
"Mark v.bad words for tilde crunch"
Definition at line 947 of file tesseractclass.h.
char* tesseract::Tesseract::unrecognised_char = "|" |
"Output char for unidentified blobs"
Definition at line 1025 of file tesseractclass.h.
bool tesseract::Tesseract::use_new_state_cost = FALSE |
"use new state cost heuristics for segmentation state evaluation"
Definition at line 1142 of file tesseractclass.h.
int tesseract::Tesseract::x_ht_acceptance_tolerance = 8 |
"Max allowed deviation of blob top outside of font data"
Definition at line 991 of file tesseractclass.h.
int tesseract::Tesseract::x_ht_min_change = 8 |
"Min change in xht before actually trying it"
Definition at line 992 of file tesseractclass.h.