tesseract  4.00.00dev
tesseract::Wordrec Class Reference

#include <wordrec.h>

Inheritance diagram for tesseract::Wordrec:
tesseract::Classify tesseract::CCStruct tesseract::CUtil tesseract::CCUtil tesseract::Tesseract

Public Member Functions

 Wordrec ()
 
virtual ~Wordrec ()
 
void SaveAltChoices (const LIST &best_choices, WERD_RES *word)
 
void FillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void CallFillLattice (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
void SegSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void WordSearch (WERD_RES *word_res)
 
void InitialSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void DoSegSearch (WERD_RES *word_res)
 
SEAM * attempt_blob_chop (TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)
 
SEAM * chop_numbered_blob (TWERD *word, inT32 blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)
 
SEAM * chop_overlapping_blob (const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
 
void add_seam_to_queue (float new_priority, SEAM *new_seam, SeamQueue *seams)
 
void choose_best_seam (SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
 
void combine_seam (const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
 
SEAM * pick_good_seam (TBLOB *blob)
 
void try_point_pairs (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
void try_vertical_splits (EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
 
PRIORITY grade_split_length (register SPLIT *split)
 
PRIORITY grade_sharpness (register SPLIT *split)
 
bool near_point (EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
 
virtual BLOB_CHOICE_LIST * classify_piece (const GenericVector< SEAM *> &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
 
void merge_fragments (MATRIX *ratings, inT16 num_blobs)
 
void get_fragment_lists (inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
 
void merge_and_put_fragment_lists (inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
 
void fill_filtered_fragment_list (BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
 
program_editup

Initialize all the things in the program that need to be initialized. init_permute determines whether to initialize the permute functions and Dawg models.

void program_editup (const char *textbase, TessdataManager *init_classifier, TessdataManager *init_dict)
 
cc_recog

Recognize a word.

void cc_recog (WERD_RES *word)
 
program_editdown

This function holds any necessary post processing for the Wise Owl program.

void program_editdown (inT32 elasped_time)
 
set_pass1

Get ready to do some pass 1 stuff.

void set_pass1 ()
 
set_pass2

Get ready to do some pass 2 stuff.

void set_pass2 ()
 
end_recog

Cleanup and exit the recog program.

int end_recog ()
 
call_matcher

Called from Tess with a blob in tess form. The blob may need rotating to the correct orientation for classification.

BLOB_CHOICE_LIST * call_matcher (TBLOB *blob)
 
dict_word()

Test the dictionaries, returning NO_PERM (0) if not found, or one of the PermuterType values if found, according to the dictionary.

int dict_word (const WERD_CHOICE &word)
 
classify_blob

Classify this blob if it is not already recorded in the match table. Attempt to recognize this blob as a character. The recognition rating for this blob will be stored as a part of the blob. This value will also be returned to the caller.

Parameters
blob: Current blob
string: The string to display in ScrollView
color: The colour to use when displayed with ScrollView
BLOB_CHOICE_LIST * classify_blob (TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
 
point_priority

Assign a priority to an edge point that might be used as part of a split. The argument should be of type EDGEPT.

PRIORITY point_priority (EDGEPT *point)
 
add_point_to_list

Add an edge point to a POINT_GROUP containing a list of other points.

void add_point_to_list (PointHeap *point_heap, EDGEPT *point)
 
bool is_inside_angle (EDGEPT *pt)
 
angle_change

Return the change in angle (degrees) of the line segments between points one and two, and two and three.

int angle_change (EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
 
pick_close_point

Choose the edge point that is closest to the critical point. This point may not be exactly vertical from the critical point.

EDGEPT * pick_close_point (EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
 
prioritize_points

Find a list of edge points from the outer outline of this blob. For each of these points assign a priority. Sort these points using a heap structure so that they can be visited in order.

void prioritize_points (TESSLINE *outline, PointHeap *points)
 
new_min_point

Found a new minimum point; try to decide whether to save it or not. Return the new value for the local minimum. If a point is saved then the local minimum is reset to NULL.

void new_min_point (EDGEPT *local_min, PointHeap *points)
 
new_max_point

Found a new maximum point; try to decide whether to save it or not. Return the new value for the local maximum. If a point is saved then the local maximum is reset to NULL.

void new_max_point (EDGEPT *local_max, PointHeap *points)
 
vertical_projection_point

For one point on the outline, find the corresponding point on the other side of the outline that is a likely projection for a split point. This is done by iterating through the edge points until the X value of the point being looked at is greater than the X value of the split point. Ensure that the point being returned is not right next to the split point. Return the edge point in *best_point as a result, and any points that were newly created are also saved on the new_points list.

void vertical_projection_point (EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
 
improve_one_blob

Finds the best place to chop, based on the worst blob, fixpt, or next to a fragment, according to the input. Returns the SEAM corresponding to the chop point, if any is found, and the index in the ratings_matrix of the chopped blob. Note that blob_choices is just a copy of the pointers in the leading diagonal of the ratings MATRIX. Although the blob is chopped, the returned SEAM is yet to be inserted into word->seam_array and the resulting blobs are unclassified, so this function can be used by ApplyBox as well as during recognition.

SEAM * improve_one_blob (const GenericVector< BLOB_CHOICE *> &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
 
chop_one_blob

Start with the current one-blob word and its classification. Find the worst blob and try to divide it up to improve the ratings. Used for testing chopper.

SEAM * chop_one_blob (const GenericVector< TBOX > &boxes, const GenericVector< BLOB_CHOICE *> &blob_choices, WERD_RES *word_res, int *blob_number)
 
chop_word_main

Classify the blobs in this word and permute the results. Find the worst blob in the word and chop it up. Continue this process until a good answer has been found or all the blobs have been chopped up enough. The results are returned in the WERD_RES.

void chop_word_main (WERD_RES *word)
 
improve_by_chopping

Repeatedly chops the worst blob, classifying the new blobs, fixing up all the data, and incrementally running the segmentation search until a good word is found, or no more chops can be found.

void improve_by_chopping (float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
 
int select_blob_to_split (const GenericVector< BLOB_CHOICE *> &blob_choices, float rating_ceiling, bool split_next_to_fragment)
 
int select_blob_to_split_from_fixpt (DANGERR *fixpt)
 
- Public Member Functions inherited from tesseract::Classify
 Classify ()
 
virtual ~Classify ()
 
Dict & getDict ()
 
const ShapeTable * shape_table () const
 
void SetStaticClassifier (ShapeClassifier *static_classifier)
 
void AddLargeSpeckleTo (int blob_length, BLOB_CHOICE_LIST *choices)
 
bool LargeSpeckle (const TBLOB &blob)
 
ADAPT_TEMPLATES NewAdaptedTemplates (bool InitFromUnicharset)
 
int GetFontinfoId (ADAPT_CLASS Class, uinT8 ConfigId)
 
int PruneClasses (const INT_TEMPLATES_STRUCT *int_templates, int num_features, int keep_this, const INT_FEATURE_STRUCT *features, const uinT8 *normalization_factors, const uinT16 *expected_num_features, GenericVector< CP_RESULT_STRUCT > *results)
 
void ReadNewCutoffs (TFile *fp, CLASS_CUTOFF_ARRAY Cutoffs)
 
void PrintAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
void WriteAdaptedTemplates (FILE *File, ADAPT_TEMPLATES Templates)
 
ADAPT_TEMPLATES ReadAdaptedTemplates (TFile *File)
 
FLOAT32 ComputeNormMatch (CLASS_ID ClassId, const FEATURE_STRUCT &feature, BOOL8 DebugMatch)
 
void FreeNormProtos ()
 
NORM_PROTOS * ReadNormProtos (TFile *fp)
 
void ConvertProto (PROTO Proto, int ProtoId, INT_CLASS Class)
 
INT_TEMPLATES CreateIntTemplates (CLASSES FloatProtos, const UNICHARSET &target_unicharset)
 
void LearnWord (const char *fontname, WERD_RES *word)
 
void LearnPieces (const char *fontname, int start, int length, float threshold, CharSegmentationType segmentation, const char *correct_text, WERD_RES *word)
 
void InitAdaptiveClassifier (TessdataManager *mgr)
 
void InitAdaptedClass (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, ADAPT_CLASS Class, ADAPT_TEMPLATES Templates)
 
void AmbigClassifier (const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, const TBLOB *blob, INT_TEMPLATES templates, ADAPT_CLASS *classes, UNICHAR_ID *ambiguities, ADAPT_RESULTS *results)
 
void MasterMatcher (INT_TEMPLATES templates, inT16 num_features, const INT_FEATURE_STRUCT *features, const uinT8 *norm_factors, ADAPT_CLASS *classes, int debug, int matcher_multiplier, const TBOX &blob_box, const GenericVector< CP_RESULT_STRUCT > &results, ADAPT_RESULTS *final_results)
 
void ExpandShapesAndApplyCorrections (ADAPT_CLASS *classes, bool debug, int class_id, int bottom, int top, float cp_rating, int blob_length, int matcher_multiplier, const uinT8 *cn_factors, UnicharRating *int_result, ADAPT_RESULTS *final_results)
 
double ComputeCorrectedRating (bool debug, int unichar_id, double cp_rating, double im_rating, int feature_misses, int bottom, int top, int blob_length, int matcher_multiplier, const uinT8 *cn_factors)
 
void ConvertMatchesToChoices (const DENORM &denorm, const TBOX &box, ADAPT_RESULTS *Results, BLOB_CHOICE_LIST *Choices)
 
void AddNewResult (const UnicharRating &new_result, ADAPT_RESULTS *results)
 
int GetAdaptiveFeatures (TBLOB *Blob, INT_FEATURE_ARRAY IntFeatures, FEATURE_SET *FloatFeatures)
 
void DebugAdaptiveClassifier (TBLOB *Blob, ADAPT_RESULTS *Results)
 
PROTO_ID MakeNewTempProtos (FEATURE_SET Features, int NumBadFeat, FEATURE_ID BadFeat[], INT_CLASS IClass, ADAPT_CLASS Class, BIT_VECTOR TempProtoMask)
 
int MakeNewTemporaryConfig (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int FontinfoId, int NumFeatures, INT_FEATURE_ARRAY Features, FEATURE_SET FloatFeatures)
 
void MakePermanent (ADAPT_TEMPLATES Templates, CLASS_ID ClassId, int ConfigId, TBLOB *Blob)
 
void PrintAdaptiveMatchResults (const ADAPT_RESULTS &results)
 
void RemoveExtraPuncs (ADAPT_RESULTS *Results)
 
void RemoveBadMatches (ADAPT_RESULTS *Results)
 
void SetAdaptiveThreshold (FLOAT32 Threshold)
 
void ShowBestMatchFor (int shape_id, const INT_FEATURE_STRUCT *features, int num_features)
 
STRING ClassIDToDebugStr (const INT_TEMPLATES_STRUCT *templates, int class_id, int config_id) const
 
int ClassAndConfigIDToFontOrShapeID (int class_id, int int_result_config) const
 
int ShapeIDToClassID (int shape_id) const
 
UNICHAR_ID * BaselineClassifier (TBLOB *Blob, const GenericVector< INT_FEATURE_STRUCT > &int_features, const INT_FX_RESULT_STRUCT &fx_info, ADAPT_TEMPLATES Templates, ADAPT_RESULTS *Results)
 
int CharNormClassifier (TBLOB *blob, const TrainingSample &sample, ADAPT_RESULTS *adapt_results)
 
int CharNormTrainingSample (bool pruner_only, int keep_this, const TrainingSample &sample, GenericVector< UnicharRating > *results)
 
UNICHAR_ID * GetAmbiguities (TBLOB *Blob, CLASS_ID CorrectClass)
 
void DoAdaptiveMatch (TBLOB *Blob, ADAPT_RESULTS *Results)
 
void AdaptToChar (TBLOB *Blob, CLASS_ID ClassId, int FontinfoId, FLOAT32 Threshold, ADAPT_TEMPLATES adaptive_templates)
 
void DisplayAdaptedChar (TBLOB *blob, INT_CLASS_STRUCT *int_class)
 
bool AdaptableWord (WERD_RES *word)
 
void EndAdaptiveClassifier ()
 
void SettupPass1 ()
 
void SettupPass2 ()
 
void AdaptiveClassifier (TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
 
void ClassifyAsNoise (ADAPT_RESULTS *Results)
 
void ResetAdaptiveClassifierInternal ()
 
void SwitchAdaptiveClassifier ()
 
void StartBackupAdaptiveClassifier ()
 
int GetCharNormFeature (const INT_FX_RESULT_STRUCT &fx_info, INT_TEMPLATES templates, uinT8 *pruner_norm_array, uinT8 *char_norm_array)
 
void ComputeCharNormArrays (FEATURE_STRUCT *norm_feature, INT_TEMPLATES_STRUCT *templates, uinT8 *char_norm_array, uinT8 *pruner_array)
 
bool TempConfigReliable (CLASS_ID class_id, const TEMP_CONFIG &config)
 
void UpdateAmbigsGroup (CLASS_ID class_id, TBLOB *Blob)
 
bool AdaptiveClassifierIsFull () const
 
bool AdaptiveClassifierIsEmpty () const
 
bool LooksLikeGarbage (TBLOB *blob)
 
void RefreshDebugWindow (ScrollView **win, const char *msg, int y_offset, const TBOX &wbox)
 
void ClearCharNormArray (uinT8 *char_norm_array)
 
void ComputeIntCharNormArray (const FEATURE_STRUCT &norm_feature, uinT8 *char_norm_array)
 
void ComputeIntFeatures (FEATURE_SET Features, INT_FEATURE_ARRAY IntFeatures)
 
INT_TEMPLATES ReadIntTemplates (TFile *fp)
 
void WriteIntTemplates (FILE *File, INT_TEMPLATES Templates, const UNICHARSET &target_unicharset)
 
CLASS_ID GetClassToDebug (const char *Prompt, bool *adaptive_on, bool *pretrained_on, int *shape_id)
 
void ShowMatchDisplay ()
 
UnicityTable< FontInfo > & get_fontinfo_table ()
 
const UnicityTable< FontInfo > & get_fontinfo_table () const
 
UnicityTable< FontSet > & get_fontset_table ()
 
void NormalizeOutlines (LIST Outlines, FLOAT32 *XScale, FLOAT32 *YScale)
 
FEATURE_SET ExtractOutlineFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractPicoFeatures (TBLOB *Blob)
 
FEATURE_SET ExtractIntCNFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
FEATURE_SET ExtractIntGeoFeatures (const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
 
void LearnBlob (const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
 
bool WriteTRFile (const STRING &filename)
 
- Public Member Functions inherited from tesseract::CCStruct
 CCStruct ()
 
 ~CCStruct ()
 
- Public Member Functions inherited from tesseract::CUtil
 CUtil ()
 
 ~CUtil ()
 
void read_variables (const char *filename, bool global_only)
 
- Public Member Functions inherited from tesseract::CCUtil
 CCUtil ()
 
virtual ~CCUtil ()
 
void main_setup (const char *argv0, const char *basename)
 CCUtil::main_setup - set location of tessdata and name of image. More...
 
ParamsVectors * params ()
 

Public Attributes

bool merge_fragments_in_matrix = TRUE
 
bool wordrec_no_block = FALSE
 
bool wordrec_enable_assoc = TRUE
 
bool force_word_assoc = FALSE
 
double wordrec_worst_state = 1
 
bool fragments_guide_chopper = FALSE
 
int repair_unchopped_blobs = 1
 
double tessedit_certainty_threshold = -2.25
 
int chop_debug = 0
 
bool chop_enable = 1
 
bool chop_vertical_creep = 0
 
int chop_split_length = 10000
 
int chop_same_distance = 2
 
int chop_min_outline_points = 6
 
int chop_seam_pile_size = 150
 
bool chop_new_seam_pile = 1
 
int chop_inside_angle = -50
 
int chop_min_outline_area = 2000
 
double chop_split_dist_knob = 0.5
 
double chop_overlap_knob = 0.9
 
double chop_center_knob = 0.15
 
int chop_centered_maxwidth = 90
 
double chop_sharpness_knob = 0.06
 
double chop_width_change_knob = 5.0
 
double chop_ok_split = 100.0
 
double chop_good_split = 50.0
 
int chop_x_y_weight = 3
 
int segment_adjust_debug = 0
 
bool assume_fixed_pitch_char_segment = FALSE
 
int wordrec_debug_level = 0
 
int wordrec_max_join_chunks = 4
 
bool wordrec_skip_no_truth_words = false
 
bool wordrec_debug_blamer = false
 
bool wordrec_run_blamer = false
 
int segsearch_debug_level = 0
 
int segsearch_max_pain_points = 2000
 
int segsearch_max_futile_classifications = 10
 
double segsearch_max_char_wh_ratio = 2.0
 
bool save_alt_choices = true
 
LanguageModel * language_model_
 
PRIORITY pass2_ok_split
 
WERD_CHOICE * prev_word_best_choice_
 
GenericVector< int > blame_reasons_
 
void(Wordrec::* fill_lattice_ )(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
 
- Public Attributes inherited from tesseract::Classify
bool allow_blob_division = true
 
bool prioritize_division = FALSE
 
int tessedit_single_match = FALSE
 
bool classify_enable_learning = true
 
int classify_debug_level = 0
 
int classify_norm_method = character
 
double classify_char_norm_range = 0.2
 
double classify_min_norm_scale_x = 0.0
 
double classify_max_norm_scale_x = 0.325
 
double classify_min_norm_scale_y = 0.0
 
double classify_max_norm_scale_y = 0.325
 
double classify_max_rating_ratio = 1.5
 
double classify_max_certainty_margin = 5.5
 
bool tess_cn_matching = 0
 
bool tess_bn_matching = 0
 
bool classify_enable_adaptive_matcher = 1
 
bool classify_use_pre_adapted_templates = 0
 
bool classify_save_adapted_templates = 0
 
bool classify_enable_adaptive_debugger = 0
 
bool classify_nonlinear_norm = 0
 
int matcher_debug_level = 0
 
int matcher_debug_flags = 0
 
int classify_learning_debug_level = 0
 
double matcher_good_threshold = 0.125
 
double matcher_reliable_adaptive_result = 0.0
 
double matcher_perfect_threshold = 0.02
 
double matcher_bad_match_pad = 0.15
 
double matcher_rating_margin = 0.1
 
double matcher_avg_noise_size = 12.0
 
int matcher_permanent_classes_min = 1
 
int matcher_min_examples_for_prototyping = 3
 
int matcher_sufficient_examples_for_prototyping = 5
 
double matcher_clustering_max_angle_delta = 0.015
 
double classify_misfit_junk_penalty = 0.0
 
double rating_scale = 1.5
 
double certainty_scale = 20.0
 
double tessedit_class_miss_scale = 0.00390625
 
double classify_adapted_pruning_factor = 2.5
 
double classify_adapted_pruning_threshold = -1.0
 
int classify_adapt_proto_threshold = 230
 
int classify_adapt_feature_threshold = 230
 
bool disable_character_fragments = TRUE
 
double classify_character_fragments_garbage_certainty_threshold = -3.0
 
bool classify_debug_character_fragments = FALSE
 
bool matcher_debug_separate_windows = FALSE
 
char * classify_learn_debug_str = ""
 
int classify_class_pruner_threshold = 229
 
int classify_class_pruner_multiplier = 15
 
int classify_cp_cutoff_strength = 7
 
int classify_integer_matcher_multiplier = 10
 
INT_TEMPLATES PreTrainedTemplates
 
ADAPT_TEMPLATES AdaptedTemplates
 
ADAPT_TEMPLATES BackupAdaptedTemplates
 
BIT_VECTOR AllProtosOn
 
BIT_VECTOR AllConfigsOn
 
BIT_VECTOR AllConfigsOff
 
BIT_VECTOR TempProtoMask
 
bool EnableLearning
 
NORM_PROTOS * NormProtos
 
UnicityTable< FontInfo > fontinfo_table_
 
UnicityTable< FontSet > fontset_table_
 
int il1_adaption_test = 0
 
bool classify_bln_numeric_mode = 0
 
double speckle_large_max_size = 0.30
 
double speckle_rating_penalty = 10.0
 
- Public Attributes inherited from tesseract::CCUtil
STRING datadir
 
STRING imagebasename
 
STRING lang
 
STRING language_data_path_prefix
 
UNICHARSET unicharset
 
UnicharAmbigs unichar_ambigs
 
STRING imagefile
 
STRING directory
 
char * m_data_sub_dir = "tessdata/"
 
int ambigs_debug_level = 0
 
bool use_definite_ambigs_for_classifier = 0
 
bool use_ambigs_for_adaption = 0
 

Protected Member Functions

bool SegSearchDone (int num_futile_classifications)
 
void UpdateSegSearchNodes (float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
 
void ProcessSegSearchPainPoint (float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
 
void ResetNGramSearch (WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
 
void InitBlamerForSegSearch (WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
 

Additional Inherited Members

- Static Public Member Functions inherited from tesseract::Classify
static void SetupBLCNDenorms (const TBLOB &blob, bool nonlinear_norm, DENORM *bl_denorm, DENORM *cn_denorm, INT_FX_RESULT_STRUCT *fx_info)
 
static void ExtractFeatures (const TBLOB &blob, bool nonlinear_norm, GenericVector< INT_FEATURE_STRUCT > *bl_features, GenericVector< INT_FEATURE_STRUCT > *cn_features, INT_FX_RESULT_STRUCT *results, GenericVector< int > *outline_cn_counts)
 
- Static Public Attributes inherited from tesseract::CCStruct
static const double kDescenderFraction = 0.25
 
static const double kXHeightFraction = 0.5
 
static const double kAscenderFraction = 0.25
 
static const double kXHeightCapRatio
 
- Protected Attributes inherited from tesseract::Classify
IntegerMatcher im_
 
FEATURE_DEFS_STRUCT feature_defs_
 
ShapeTable * shape_table_
 

Detailed Description

Definition at line 123 of file wordrec.h.

Constructor & Destructor Documentation

◆ Wordrec()

tesseract::Wordrec::Wordrec ( )

Definition at line 26 of file wordrec.cpp.

26  :
27  // control parameters
29  "Merge the fragments in the ratings matrix and delete them"
30  " after merging", params()),
31  BOOL_MEMBER(wordrec_no_block, FALSE, "Don't output block information",
32  params()),
33  BOOL_MEMBER(wordrec_enable_assoc, TRUE, "Associator Enable",
34  params()),
36  "force associator to run regardless of what enable_assoc is."
37  "This is used for CJK where component grouping is necessary.",
38  CCUtil::params()),
39  double_MEMBER(wordrec_worst_state, 1.0, "Worst segmentation state",
40  params()),
42  "Use information from fragments to guide chopping process",
43  params()),
44  INT_MEMBER(repair_unchopped_blobs, 1, "Fix blobs that aren't chopped",
45  params()),
46  double_MEMBER(tessedit_certainty_threshold, -2.25, "Good blob limit",
47  params()),
48  INT_MEMBER(chop_debug, 0, "Chop debug",
49  params()),
50  BOOL_MEMBER(chop_enable, 1, "Chop enable",
51  params()),
52  BOOL_MEMBER(chop_vertical_creep, 0, "Vertical creep",
53  params()),
54  INT_MEMBER(chop_split_length, 10000, "Split Length",
55  params()),
56  INT_MEMBER(chop_same_distance, 2, "Same distance",
57  params()),
58  INT_MEMBER(chop_min_outline_points, 6, "Min Number of Points on Outline",
59  params()),
60  INT_MEMBER(chop_seam_pile_size, 150, "Max number of seams in seam_pile",
61  params()),
62  BOOL_MEMBER(chop_new_seam_pile, 1, "Use new seam_pile", params()),
63  INT_MEMBER(chop_inside_angle, -50, "Min Inside Angle Bend",
64  params()),
65  INT_MEMBER(chop_min_outline_area, 2000, "Min Outline Area",
66  params()),
67  double_MEMBER(chop_split_dist_knob, 0.5, "Split length adjustment",
68  params()),
69  double_MEMBER(chop_overlap_knob, 0.9, "Split overlap adjustment",
70  params()),
71  double_MEMBER(chop_center_knob, 0.15, "Split center adjustment",
72  params()),
73  INT_MEMBER(chop_centered_maxwidth, 90, "Width of (smaller) chopped blobs "
74  "above which we don't care that a chop is not near the center.",
75  params()),
76  double_MEMBER(chop_sharpness_knob, 0.06, "Split sharpness adjustment",
77  params()),
78  double_MEMBER(chop_width_change_knob, 5.0, "Width change adjustment",
79  params()),
80  double_MEMBER(chop_ok_split, 100.0, "OK split limit",
81  params()),
82  double_MEMBER(chop_good_split, 50.0, "Good split limit",
83  params()),
84  INT_MEMBER(chop_x_y_weight, 3, "X / Y length weight",
85  params()),
86  INT_MEMBER(segment_adjust_debug, 0, "Segmentation adjustment debug",
87  params()),
89  "include fixed-pitch heuristics in char segmentation",
90  params()),
92  "Debug level for wordrec", params()),
94  "Max number of broken pieces to associate", params()),
96  "Only run OCR for words that had truth recorded in BlamerBundle",
97  params()),
99  "Print blamer debug messages", params()),
101  "Try to set the blame for errors", params()),
103  "SegSearch debug level", params()),
105  "Maximum number of pain points stored in the queue",
106  params()),
108  "Maximum number of pain point classifications per chunk that"
109  "did not result in finding a better word choice.",
110  params()),
112  "Maximum character width-to-height ratio", params()),
114  "Save alternative paths found during chopping"
115  " and segmentation search",
116  params()) {
117  prev_word_best_choice_ = NULL;
118  language_model_ = new LanguageModel(&get_fontinfo_table(),
119  &(getDict()));
120  fill_lattice_ = NULL;
121 }
int wordrec_debug_level
Definition: wordrec.h:162
bool wordrec_skip_no_truth_words
Definition: wordrec.h:166
#define TRUE
Definition: capi.h:45
bool wordrec_no_block
Definition: wordrec.h:129
ParamsVectors * params()
Definition: ccutil.h:62
int segsearch_max_futile_classifications
Definition: wordrec.h:173
Dict & getDict()
Definition: classify.h:65
#define double_MEMBER(name, val, comment, vec)
Definition: params.h:309
int wordrec_max_join_chunks
Definition: wordrec.h:164
double chop_center_knob
Definition: wordrec.h:151
bool wordrec_enable_assoc
Definition: wordrec.h:130
double chop_ok_split
Definition: wordrec.h:156
int chop_same_distance
Definition: wordrec.h:143
int repair_unchopped_blobs
Definition: wordrec.h:137
int chop_min_outline_points
Definition: wordrec.h:144
UnicityTable< FontInfo > & get_fontinfo_table()
Definition: classify.h:344
int chop_min_outline_area
Definition: wordrec.h:148
double chop_overlap_knob
Definition: wordrec.h:150
double chop_sharpness_knob
Definition: wordrec.h:154
bool save_alt_choices
Definition: wordrec.h:178
int segment_adjust_debug
Definition: wordrec.h:159
double segsearch_max_char_wh_ratio
Definition: wordrec.h:175
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:161
int chop_seam_pile_size
Definition: wordrec.h:145
int chop_centered_maxwidth
Definition: wordrec.h:153
#define FALSE
Definition: capi.h:46
double chop_split_dist_knob
Definition: wordrec.h:149
int segsearch_max_pain_points
Definition: wordrec.h:171
double tessedit_certainty_threshold
Definition: wordrec.h:138
bool fragments_guide_chopper
Definition: wordrec.h:136
bool force_word_assoc
Definition: wordrec.h:133
bool merge_fragments_in_matrix
Definition: wordrec.h:128
void(Wordrec::* fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:419
double chop_good_split
Definition: wordrec.h:157
int segsearch_debug_level
Definition: wordrec.h:169
double chop_width_change_knob
Definition: wordrec.h:155
bool chop_vertical_creep
Definition: wordrec.h:141
bool wordrec_run_blamer
Definition: wordrec.h:168
LanguageModel * language_model_
Definition: wordrec.h:410
bool wordrec_debug_blamer
Definition: wordrec.h:167
#define BOOL_MEMBER(name, val, comment, vec)
Definition: params.h:303
#define INT_MEMBER(name, val, comment, vec)
Definition: params.h:300
double wordrec_worst_state
Definition: wordrec.h:134
bool chop_new_seam_pile
Definition: wordrec.h:146
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:415

◆ ~Wordrec()

tesseract::Wordrec::~Wordrec ( )
virtual

Definition at line 123 of file wordrec.cpp.

123  {
124  delete language_model_;
125 }
LanguageModel * language_model_
Definition: wordrec.h:410

Member Function Documentation

◆ add_point_to_list()

void tesseract::Wordrec::add_point_to_list ( PointHeap * point_heap,
EDGEPT * point
)

Definition at line 64 of file chop.cpp.

64  {
65  if (point_heap->size() < MAX_NUM_POINTS - 2) {
66  PointPair pair(point_priority(point), point);
67  point_heap->Push(&pair);
68  }
69 
70 #ifndef GRAPHICS_DISABLED
71  if (chop_debug > 2)
72  mark_outline(point);
73 #endif
74 }
void mark_outline(EDGEPT *edgept)
Definition: plotedges.cpp:95
void Push(Pair *entry)
Definition: genericheap.h:95
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:54
#define MAX_NUM_POINTS
Definition: chop.h:39

◆ add_seam_to_queue()

void tesseract::Wordrec::add_seam_to_queue ( float  new_priority,
SEAM * new_seam,
SeamQueue * seams
)

Definition at line 63 of file findseam.cpp.

64  {
65  if (new_seam == NULL) return;
66  if (chop_debug) {
67  tprintf("Pushing new seam with priority %g :", new_priority);
68  new_seam->Print("seam: ");
69  }
70  if (seams->size() >= MAX_NUM_SEAMS) {
71  SeamPair old_pair(0, NULL);
72  if (seams->PopWorst(&old_pair) && old_pair.key() <= new_priority) {
73  if (chop_debug) {
74  tprintf("Old seam staying with priority %g\n", old_pair.key());
75  }
76  delete new_seam;
77  seams->Push(&old_pair);
78  return;
79  } else if (chop_debug) {
80  tprintf("New seam with priority %g beats old worst seam with %g\n",
81  new_priority, old_pair.key());
82  }
83  }
84  SeamPair new_pair(new_priority, new_seam);
85  seams->Push(&new_pair);
86 }
void Push(Pair *entry)
Definition: genericheap.h:95
#define tprintf(...)
Definition: tprintf.h:31
#define MAX_NUM_SEAMS
Definition: findseam.cpp:45
void Print(const char *label) const
Definition: seam.cpp:160
bool PopWorst(Pair *entry)
Definition: genericheap.h:140

◆ angle_change()

int tesseract::Wordrec::angle_change ( EDGEPT * point1,
EDGEPT * point2,
EDGEPT * point3
)

Definition at line 88 of file chop.cpp.

88  {
89  VECTOR vector1;
90  VECTOR vector2;
91 
92  int angle;
93  float length;
94 
95  /* Compute angle */
96  vector1.x = point2->pos.x - point1->pos.x;
97  vector1.y = point2->pos.y - point1->pos.y;
98  vector2.x = point3->pos.x - point2->pos.x;
99  vector2.y = point3->pos.y - point2->pos.y;
100  /* Use cross product */
101  length = (float)sqrt((float)LENGTH(vector1) * LENGTH(vector2));
102  if ((int) length == 0)
103  return (0);
104  angle = static_cast<int>(floor(asin(CROSS (vector1, vector2) /
105  length) / PI * 180.0 + 0.5));
106 
107  /* Use dot product */
108  if (SCALAR (vector1, vector2) < 0)
109  angle = 180 - angle;
110  /* Adjust angle */
111  if (angle > 180)
112  angle -= 360;
113  if (angle <= -180)
114  angle += 360;
115  return (angle);
116 }
TPOINT pos
Definition: blobs.h:163
#define CROSS(a, b)
Definition: vecfuncs.h:52
#define SCALAR(a, b)
Definition: vecfuncs.h:61
#define LENGTH(a)
Definition: vecfuncs.h:70
#define PI
Definition: const.h:19
inT16 x
Definition: blobs.h:71
inT16 y
Definition: blobs.h:72
Definition: blobs.h:50

◆ attempt_blob_chop()

SEAM * tesseract::Wordrec::attempt_blob_chop ( TWERD * word,
TBLOB * blob,
inT32  blob_number,
bool  italic_blob,
const GenericVector< SEAM *> &  seams 
)

Definition at line 169 of file chopper.cpp.

171  {
174  TBLOB *other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
175  // Insert it into the word.
176  word->blobs.insert(other_blob, blob_number + 1);
177 
178  SEAM *seam = NULL;
179  if (prioritize_division) {
180  TPOINT location;
181  if (divisible_blob(blob, italic_blob, &location)) {
182  seam = new SEAM(0.0f, location);
183  }
184  }
185  if (seam == NULL)
186  seam = pick_good_seam(blob);
187  if (chop_debug) {
188  if (seam != NULL)
189  seam->Print("Good seam picked=");
190  else
191  tprintf("\n** no seam picked *** \n");
192  }
193  if (seam) {
194  seam->ApplySeam(italic_blob, blob, other_blob);
195  }
196 
197  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
198  seams, seam);
199  if (seam == NULL) {
203  // If the blob can simply be divided into outlines, then do that.
204  TPOINT location;
205  if (divisible_blob(blob, italic_blob, &location)) {
206  other_blob = TBLOB::ShallowCopy(*blob); /* Make new blob */
207  word->blobs.insert(other_blob, blob_number + 1);
208  seam = new SEAM(0.0f, location);
209  seam->ApplySeam(italic_blob, blob, other_blob);
210  seam = CheckSeam(chop_debug, blob_number, word, blob, other_blob,
211  seams, seam);
212  }
213  }
214  }
215  if (seam != NULL) {
216  // Make sure this seam doesn't get chopped again.
217  seam->Finalize();
218  }
219  return seam;
220 }
void restore_outline_tree(TESSLINE *srcline)
Definition: chopper.cpp:122
void Finalize()
Definition: seam.h:116
bool allow_blob_division
Definition: classify.h:381
TESSLINE * outlines
Definition: blobs.h:377
void preserve_outline_tree(TESSLINE *srcline)
Definition: chopper.cpp:82
#define tprintf(...)
Definition: tprintf.h:31
int repair_unchopped_blobs
Definition: wordrec.h:137
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location)
Definition: blobs.cpp:932
void insert(T t, int index)
void Print(const char *label) const
Definition: seam.cpp:160
Definition: seam.h:44
void ApplySeam(bool italic_blob, TBLOB *blob, TBLOB *other_blob) const
Definition: seam.cpp:124
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
Definition: blobs.h:261
Definition: blobs.h:50
static TBLOB * ShallowCopy(const TBLOB &src)
Definition: blobs.cpp:352
SEAM * pick_good_seam(TBLOB *blob)
Definition: findseam.cpp:215
bool prioritize_division
Definition: classify.h:386

◆ call_matcher()

BLOB_CHOICE_LIST * tesseract::Wordrec::call_matcher ( TBLOB blob)

Definition at line 138 of file tface.cpp.

138  {
139  // Rotate the blob for classification if necessary.
140  TBLOB* rotated_blob = tessblob->ClassifyNormalizeIfNeeded();
141  if (rotated_blob == NULL) {
142  rotated_blob = tessblob;
143  }
144  BLOB_CHOICE_LIST *ratings = new BLOB_CHOICE_LIST(); // matcher result
145  AdaptiveClassifier(rotated_blob, ratings);
146  if (rotated_blob != tessblob) {
147  delete rotated_blob;
148  }
149  return ratings;
150 }
TBLOB * ClassifyNormalizeIfNeeded() const
Definition: blobs.cpp:363
Definition: blobs.h:261
void AdaptiveClassifier(TBLOB *Blob, BLOB_CHOICE_LIST *Choices)
Definition: adaptmatch.cpp:185

◆ CallFillLattice()

void tesseract::Wordrec::CallFillLattice ( const MATRIX ratings,
const WERD_CHOICE_LIST &  best_choices,
const UNICHARSET unicharset,
BlamerBundle blamer_bundle 
)
inline

Definition at line 195 of file wordrec.h.

198  {
199  (this->*fill_lattice_)(ratings, best_choices, unicharset, blamer_bundle);
200  }
void(Wordrec::* fill_lattice_)(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:419

◆ cc_recog()

void tesseract::Wordrec::cc_recog ( WERD_RES word)

Definition at line 113 of file tface.cpp.

113  {
115  chop_word_main(word);
116  word->DebugWordChoices(getDict().stopper_debug_level >= 1,
117  getDict().word_to_debug.string());
118  ASSERT_HOST(word->StatesAllValid());
119 }
Dict & getDict()
Definition: classify.h:65
Definition: werd.h:36
bool StatesAllValid()
Definition: pageres.cpp:450
void chop_word_main(WERD_RES *word)
Definition: chopper.cpp:393
#define ASSERT_HOST(x)
Definition: errcode.h:84
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:32
WERD * word
Definition: pageres.h:175
void DebugWordChoices(bool debug, const char *word_to_debug)
Definition: pageres.cpp:472

◆ choose_best_seam()

void tesseract::Wordrec::choose_best_seam ( SeamQueue * seam_queue,
const SPLIT * split,
PRIORITY  priority,
SEAM **  seam_result,
TBLOB * blob,
SeamPile * seam_pile 
)

Definition at line 102 of file findseam.cpp.

104  {
105  SEAM *seam;
106  char str[80];
107  float my_priority;
108  /* Add seam of split */
109  my_priority = priority;
110  if (split != NULL) {
111  TPOINT split_point = split->point1->pos;
112  split_point += split->point2->pos;
113  split_point /= 2;
114  seam = new SEAM(my_priority, split_point, *split);
115  if (chop_debug > 1) seam->Print("Partial priority ");
116  add_seam_to_queue(my_priority, seam, seam_queue);
117 
118  if (my_priority > chop_good_split)
119  return;
120  }
121 
122  TBOX bbox = blob->bounding_box();
123  /* Queue loop */
124  while (!seam_queue->empty()) {
125  SeamPair seam_pair;
126  seam_queue->Pop(&seam_pair);
127  seam = seam_pair.extract_data();
128  /* Set full priority */
129  my_priority = seam->FullPriority(bbox.left(), bbox.right(),
132  if (chop_debug) {
133  sprintf (str, "Full my_priority %0.0f, ", my_priority);
134  seam->Print(str);
135  }
136 
137  if ((*seam_result == NULL || (*seam_result)->priority() > my_priority) &&
138  my_priority < chop_ok_split) {
139  /* No crossing */
140  if (seam->IsHealthy(*blob, chop_min_outline_points,
142  delete *seam_result;
143  *seam_result = new SEAM(*seam);
144  (*seam_result)->set_priority(my_priority);
145  } else {
146  delete seam;
147  seam = NULL;
148  my_priority = BAD_PRIORITY;
149  }
150  }
151 
152  if (my_priority < chop_good_split) {
153  if (seam)
154  delete seam;
155  return; /* Made good answer */
156  }
157 
158  if (seam) {
159  /* Combine with others */
160  if (seam_pile->size() < chop_seam_pile_size) {
161  combine_seam(*seam_pile, seam, seam_queue);
162  SeamDecPair pair(seam_pair.key(), seam);
163  seam_pile->Push(&pair);
164  } else if (chop_new_seam_pile &&
165  seam_pile->size() == chop_seam_pile_size &&
166  seam_pile->PeekTop().key() > seam_pair.key()) {
167  combine_seam(*seam_pile, seam, seam_queue);
168  SeamDecPair pair;
169  seam_pile->Pop(&pair); // pop the worst.
170  // Replace the seam in pair (deleting the old one) with
171  // the new seam and score, then push back into the heap.
172  pair.set_key(seam_pair.key());
173  pair.set_data(seam);
174  seam_pile->Push(&pair);
175  } else {
176  delete seam;
177  }
178  }
179 
180  my_priority = seam_queue->empty() ? NO_FULL_PRIORITY
181  : seam_queue->PeekTop().key();
182  if ((my_priority > chop_ok_split) ||
183  (my_priority > chop_good_split && split))
184  return;
185  }
186 }
TPOINT pos
Definition: blobs.h:163
void Push(Pair *entry)
Definition: genericheap.h:95
#define NO_FULL_PRIORITY
Definition: findseam.cpp:48
void combine_seam(const SeamPile &seam_pile, const SEAM *seam, SeamQueue *seam_queue)
Definition: findseam.cpp:196
double chop_center_knob
Definition: wordrec.h:151
double chop_ok_split
Definition: wordrec.h:156
const Pair & PeekTop() const
Definition: genericheap.h:108
int chop_min_outline_points
Definition: wordrec.h:144
void set_key(const Key &new_key)
Definition: kdpair.h:119
int chop_min_outline_area
Definition: wordrec.h:148
double chop_overlap_knob
Definition: wordrec.h:150
inT16 left() const
Definition: rect.h:68
EDGEPT * point2
Definition: split.h:104
Data * extract_data()
Definition: kdpair.h:131
int chop_seam_pile_size
Definition: wordrec.h:145
void Print(const char *label) const
Definition: seam.cpp:160
Definition: seam.h:44
int chop_centered_maxwidth
Definition: wordrec.h:153
float FullPriority(int xmin, int xmax, double overlap_knob, int centered_maxwidth, double center_knob, double width_change_knob) const
Definition: seam.cpp:245
void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue *seams)
Definition: findseam.cpp:63
float priority() const
Definition: seam.h:65
Definition: rect.h:30
double chop_good_split
Definition: wordrec.h:157
Definition: blobs.h:50
void set_data(Data *new_data)
Definition: kdpair.h:126
inT16 right() const
Definition: rect.h:75
double chop_width_change_knob
Definition: wordrec.h:155
const Key & key() const
Definition: kdpair.h:116
bool IsHealthy(const TBLOB &blob, int min_points, int min_area) const
Definition: seam.cpp:72
EDGEPT * point1
Definition: split.h:103
bool Pop(Pair *entry)
Definition: genericheap.h:118
bool empty() const
Definition: genericheap.h:68
TBOX bounding_box() const
Definition: blobs.cpp:482
bool chop_new_seam_pile
Definition: wordrec.h:146
#define BAD_PRIORITY
Definition: findseam.cpp:50

◆ chop_numbered_blob()

SEAM * tesseract::Wordrec::chop_numbered_blob ( TWERD word,
inT32  blob_number,
bool  italic_blob,
const GenericVector< SEAM *> &  seams 
)

Definition at line 223 of file chopper.cpp.

225  {
226  return attempt_blob_chop(word, word->blobs[blob_number], blob_number,
227  italic_blob, seams);
228 }
SEAM * attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)
Definition: chopper.cpp:169
GenericVector< TBLOB * > blobs
Definition: blobs.h:436

◆ chop_one_blob()

SEAM * tesseract::Wordrec::chop_one_blob ( const GenericVector< TBOX > &  boxes,
const GenericVector< BLOB_CHOICE *> &  blob_choices,
WERD_RES * word_res,
int * blob_number 
)

Definition at line 373 of file chopper.cpp.

376  {
377  if (prioritize_division) {
378  return chop_overlapping_blob(boxes, true, word_res, blob_number);
379  } else {
380  return improve_one_blob(blob_choices, NULL, false, true, word_res,
381  blob_number);
382  }
383 }
SEAM * improve_one_blob(const GenericVector< BLOB_CHOICE *> &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
Definition: chopper.cpp:329
SEAM * chop_overlapping_blob(const GenericVector< TBOX > &boxes, bool italic_blob, WERD_RES *word_res, int *blob_number)
Definition: chopper.cpp:231
bool prioritize_division
Definition: classify.h:386

◆ chop_overlapping_blob()

SEAM * tesseract::Wordrec::chop_overlapping_blob ( const GenericVector< TBOX > &  boxes,
bool  italic_blob,
WERD_RES * word_res,
int * blob_number 
)

Definition at line 231 of file chopper.cpp.

233  {
234  TWERD *word = word_res->chopped_word;
235  for (*blob_number = 0; *blob_number < word->NumBlobs(); ++*blob_number) {
236  TBLOB *blob = word->blobs[*blob_number];
237  TPOINT topleft, botright;
238  topleft.x = blob->bounding_box().left();
239  topleft.y = blob->bounding_box().top();
240  botright.x = blob->bounding_box().right();
241  botright.y = blob->bounding_box().bottom();
242 
243  TPOINT original_topleft, original_botright;
244  word_res->denorm.DenormTransform(NULL, topleft, &original_topleft);
245  word_res->denorm.DenormTransform(NULL, botright, &original_botright);
246 
247  TBOX original_box = TBOX(original_topleft.x, original_botright.y,
248  original_botright.x, original_topleft.y);
249 
250  bool almost_equal_box = false;
251  int num_overlap = 0;
252  for (int i = 0; i < boxes.size(); i++) {
253  if (original_box.overlap_fraction(boxes[i]) > 0.125)
254  num_overlap++;
255  if (original_box.almost_equal(boxes[i], 3))
256  almost_equal_box = true;
257  }
258 
259  TPOINT location;
260  if (divisible_blob(blob, italic_blob, &location) ||
261  (!almost_equal_box && num_overlap > 1)) {
262  SEAM *seam = attempt_blob_chop(word, blob, *blob_number,
263  italic_blob, word_res->seam_array);
264  if (seam != NULL)
265  return seam;
266  }
267  }
268 
269  *blob_number = -1;
270  return NULL;
271 }
SEAM * attempt_blob_chop(TWERD *word, TBLOB *blob, inT32 blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)
Definition: chopper.cpp:169
int size() const
Definition: genericvector.h:72
inT16 left() const
Definition: rect.h:68
Definition: blobs.h:395
bool divisible_blob(TBLOB *blob, bool italic_blob, TPOINT *location)
Definition: blobs.cpp:932
void DenormTransform(const DENORM *last_denorm, const TPOINT &pt, TPOINT *original) const
Definition: normalis.cpp:389
Definition: seam.h:44
inT16 x
Definition: blobs.h:71
double overlap_fraction(const TBOX &box) const
Definition: rect.h:378
int NumBlobs() const
Definition: blobs.h:425
inT16 top() const
Definition: rect.h:54
bool almost_equal(const TBOX &box, int tolerance) const
Definition: rect.cpp:258
inT16 y
Definition: blobs.h:72
Definition: rect.h:30
GenericVector< TBLOB * > blobs
Definition: blobs.h:436
Definition: blobs.h:261
DENORM denorm
Definition: pageres.h:190
Definition: blobs.h:50
inT16 right() const
Definition: rect.h:75
inT16 bottom() const
Definition: rect.h:61
TBOX bounding_box() const
Definition: blobs.cpp:482
TWERD * chopped_word
Definition: pageres.h:201
GenericVector< SEAM * > seam_array
Definition: pageres.h:203

◆ chop_word_main()

void tesseract::Wordrec::chop_word_main ( WERD_RES word)

Definition at line 393 of file chopper.cpp.

393  {
394  int num_blobs = word->chopped_word->NumBlobs();
395  if (word->ratings == NULL) {
396  word->ratings = new MATRIX(num_blobs, wordrec_max_join_chunks);
397  }
398  if (word->ratings->get(0, 0) == NULL) {
399  // Run initial classification.
400  for (int b = 0; b < num_blobs; ++b) {
401  BLOB_CHOICE_LIST* choices = classify_piece(word->seam_array, b, b,
402  "Initial:", word->chopped_word,
403  word->blamer_bundle);
404  word->ratings->put(b, b, choices);
405  }
406  } else {
407  // Blobs have been pre-classified. Set matrix cell for all blob choices
408  for (int col = 0; col < word->ratings->dimension(); ++col) {
409  for (int row = col; row < word->ratings->dimension() &&
410  row < col + word->ratings->bandwidth(); ++row) {
411  BLOB_CHOICE_LIST* choices = word->ratings->get(col, row);
412  if (choices != NULL) {
413  BLOB_CHOICE_IT bc_it(choices);
414  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
415  bc_it.data()->set_matrix_cell(col, row);
416  }
417  }
418  }
419  }
420  }
421 
422  // Run Segmentation Search.
423  BestChoiceBundle best_choice_bundle(word->ratings->dimension());
424  SegSearch(word, &best_choice_bundle, word->blamer_bundle);
425 
426  if (word->best_choice == NULL) {
427  // SegSearch found no valid paths, so just use the leading diagonal.
429  }
430  word->RebuildBestState();
431  // If we finished without a hyphen at the end of the word, let the next word
432  // be found in the dictionary.
433  if (word->word->flag(W_EOL) &&
434  !getDict().has_hyphen_end(*word->best_choice)) {
435  getDict().reset_hyphen_vars(true);
436  }
437 
438  if (word->blamer_bundle != NULL && this->fill_lattice_ != NULL) {
439  CallFillLattice(*word->ratings, word->best_choices,
440  *word->uch_set, word->blamer_bundle);
441  }
442  if (wordrec_debug_level > 0) {
443  tprintf("Final Ratings Matrix:\n");
444  word->ratings->print(getDict().getUnicharset());
445  }
446  word->FilterWordChoices(getDict().stopper_debug_level);
447 }
int wordrec_debug_level
Definition: wordrec.h:162
void RebuildBestState()
Definition: pageres.cpp:800
WERD_CHOICE * best_choice
Definition: pageres.h:219
Dict & getDict()
Definition: classify.h:65
BlamerBundle * blamer_bundle
Definition: pageres.h:230
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:112
int wordrec_max_join_chunks
Definition: wordrec.h:164
Definition: werd.h:36
T get(ICOORD pos) const
Definition: matrix.h:223
#define tprintf(...)
Definition: tprintf.h:31
void FilterWordChoices(int debug_level)
Definition: pageres.cpp:505
BOOL8 flag(WERD_FLAGS mask) const
Definition: werd.h:128
MATRIX * ratings
Definition: pageres.h:215
void reset_hyphen_vars(bool last_word_on_line)
Definition: hyphen.cpp:32
WERD_CHOICE_LIST best_choices
Definition: pageres.h:227
void FakeWordFromRatings(PermuterType permuter)
Definition: pageres.cpp:893
void CallFillLattice(const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)
Definition: wordrec.h:195
int bandwidth() const
Definition: matrix.h:523
int NumBlobs() const
Definition: blobs.h:425
int dimension() const
Definition: matrix.h:521
void put(ICOORD pos, const T &thing)
Definition: matrix.h:215
Definition: matrix.h:563
WERD * word
Definition: pageres.h:175
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM *> &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:56
const UNICHARSET * uch_set
Definition: pageres.h:192
TWERD * chopped_word
Definition: pageres.h:201
void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:37
bool has_hyphen_end(UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
Definition: dict.h:143
GenericVector< SEAM * > seam_array
Definition: pageres.h:203

◆ classify_blob()

BLOB_CHOICE_LIST * tesseract::Wordrec::classify_blob ( TBLOB blob,
const char *  string,
C_COL  color,
BlamerBundle blamer_bundle 
)

Definition at line 56 of file wordclass.cpp.

58  {
59 #ifndef GRAPHICS_DISABLED
61  display_blob(blob, color);
62 #endif
63  // TODO(rays) collapse with call_matcher and move all to wordrec.cpp.
64  BLOB_CHOICE_LIST* choices = call_matcher(blob);
65  // If a blob with the same bounding box as one of the truth character
66  // bounding boxes is not classified as the corresponding truth character
67  // blame character classifier for incorrect answer.
68  if (blamer_bundle != NULL) {
69  blamer_bundle->BlameClassifier(getDict().getUnicharset(),
70  blob->bounding_box(),
71  *choices,
73  }
74  #ifndef GRAPHICS_DISABLED
75  if (classify_debug_level && string)
76  print_ratings_list(string, choices, getDict().getUnicharset());
77 
80 #endif
81 
82  return choices;
83 }
bool wordrec_display_all_blobs
Definition: render.cpp:49
BLOB_CHOICE_LIST * call_matcher(TBLOB *blob)
Definition: tface.cpp:138
void BlameClassifier(const UNICHARSET &unicharset, const TBOX &blob_box, const BLOB_CHOICE_LIST &choices, bool debug)
Definition: blamer.cpp:257
Dict & getDict()
Definition: classify.h:65
void display_blob(TBLOB *blob, C_COL color)
Definition: render.cpp:64
bool wordrec_blob_pause
Definition: render.cpp:53
char window_wait(ScrollView *win)
Definition: callcpp.cpp:111
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:819
bool wordrec_debug_blamer
Definition: wordrec.h:167
TBOX bounding_box() const
Definition: blobs.cpp:482
ScrollView * blob_window
Definition: render.cpp:43

◆ classify_piece()

BLOB_CHOICE_LIST * tesseract::Wordrec::classify_piece ( const GenericVector< SEAM *> &  seams,
inT16  start,
inT16  end,
const char *  description,
TWERD * word,
BlamerBundle * blamer_bundle 
)
virtual

Definition at line 56 of file pieces.cpp.

61  {
62  if (end > start) SEAM::JoinPieces(seams, word->blobs, start, end);
63  BLOB_CHOICE_LIST *choices = classify_blob(word->blobs[start], description,
64  White, blamer_bundle);
65  // Set the matrix_cell_ entries in all the BLOB_CHOICES.
66  BLOB_CHOICE_IT bc_it(choices);
67  for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
68  bc_it.data()->set_matrix_cell(start, end);
69  }
70 
71  if (end > start) SEAM::BreakPieces(seams, word->blobs, start, end);
72 
73  return (choices);
74 }
Definition: callcpp.h:34
BLOB_CHOICE_LIST * classify_blob(TBLOB *blob, const char *string, C_COL color, BlamerBundle *blamer_bundle)
Definition: wordclass.cpp:56
static void BreakPieces(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int first, int last)
Definition: seam.cpp:194
static void JoinPieces(const GenericVector< SEAM *> &seams, const GenericVector< TBLOB *> &blobs, int first, int last)
Definition: seam.cpp:216
GenericVector< TBLOB * > blobs
Definition: blobs.h:436

◆ combine_seam()

void tesseract::Wordrec::combine_seam ( const SeamPile & seam_pile,
const SEAM * seam,
SeamQueue * seam_queue 
)

Definition at line 196 of file findseam.cpp.

197  {
198  for (int x = 0; x < seam_pile.size(); ++x) {
199  const SEAM *this_one = seam_pile.get(x).data();
200  if (seam->CombineableWith(*this_one, SPLIT_CLOSENESS, chop_ok_split)) {
201  SEAM *new_one = new SEAM(*seam);
202  new_one->CombineWith(*this_one);
203  if (chop_debug > 1) new_one->Print("Combo priority ");
204  add_seam_to_queue(new_one->priority(), new_one, seam_queue);
205  }
206  }
207 }
const Pair & get(int index) const
Definition: genericheap.h:87
double chop_ok_split
Definition: wordrec.h:156
void Print(const char *label) const
Definition: seam.cpp:160
Definition: seam.h:44
void add_seam_to_queue(float new_priority, SEAM *new_seam, SeamQueue *seams)
Definition: findseam.cpp:63
float priority() const
Definition: seam.h:65
#define SPLIT_CLOSENESS
Definition: findseam.cpp:43
bool CombineableWith(const SEAM &other, int max_x_dist, float max_total_priority) const
Definition: seam.cpp:46
void CombineWith(const SEAM &other)
Definition: seam.cpp:60

◆ dict_word()

int tesseract::Wordrec::dict_word ( const WERD_CHOICE word)

Definition at line 128 of file tface.cpp.

128  {
129  return getDict().valid_word(word);
130 }
Dict & getDict()
Definition: classify.h:65
int valid_word(const WERD_CHOICE &word, bool numbers_ok) const
Definition: dict.cpp:750

◆ DoSegSearch()

void tesseract::Wordrec::DoSegSearch ( WERD_RES word_res)

Definition at line 31 of file segsearch.cpp.

31  {
32  BestChoiceBundle best_choice_bundle(word_res->ratings->dimension());
33  // Run Segmentation Search.
34  SegSearch(word_res, &best_choice_bundle, NULL);
35 }
MATRIX * ratings
Definition: pageres.h:215
int dimension() const
Definition: matrix.h:521
void SegSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:37

◆ end_recog()

int tesseract::Wordrec::end_recog ( )

Definition at line 65 of file tface.cpp.

65  {
66  program_editdown (0);
67 
68  return (0);
69 }
void program_editdown(inT32 elasped_time)
Definition: tface.cpp:78

◆ fill_filtered_fragment_list()

void tesseract::Wordrec::fill_filtered_fragment_list ( BLOB_CHOICE_LIST *  choices,
int  fragment_pos,
int  num_frag_parts,
BLOB_CHOICE_LIST *  filtered_choices 
)

Definition at line 105 of file pieces.cpp.

108  {
109  BLOB_CHOICE_IT filtered_choices_it(filtered_choices);
110  BLOB_CHOICE_IT choices_it(choices);
111 
112  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
113  choices_it.forward()) {
114  UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
115  const CHAR_FRAGMENT *frag = unicharset.get_fragment(choice_unichar_id);
116 
117  if (frag != NULL && frag->get_pos() == fragment_pos &&
118  frag->get_total() == num_frag_parts) {
119  // Recover the unichar_id of the unichar that this fragment is
120  // a part of
121  BLOB_CHOICE *b = new BLOB_CHOICE(*choices_it.data());
122  int original_unichar = unicharset.unichar_to_id(frag->get_unichar());
123  b->set_unichar_id(original_unichar);
124  filtered_choices_it.add_to_end(b);
125  }
126  }
127 
128  filtered_choices->sort(SortByUnicharID<BLOB_CHOICE>);
129 }
int UNICHAR_ID
Definition: unichar.h:33
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:694
int get_pos() const
Definition: unicharset.h:65
const char * get_unichar() const
Definition: unicharset.h:64
int get_total() const
Definition: unicharset.h:66
void set_unichar_id(UNICHAR_ID newunichar_id)
Definition: ratngs.h:144
UNICHARSET unicharset
Definition: ccutil.h:68
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:194

◆ FillLattice()

void tesseract::Wordrec::FillLattice ( const MATRIX ratings,
const WERD_CHOICE_LIST &  best_choices,
const UNICHARSET unicharset,
BlamerBundle blamer_bundle 
)

◆ get_fragment_lists()

void tesseract::Wordrec::get_fragment_lists ( inT16  current_frag,
inT16  current_row,
inT16  start,
inT16  num_frag_parts,
inT16  num_blobs,
MATRIX * ratings,
BLOB_CHOICE_LIST *  choice_lists 
)

Definition at line 281 of file pieces.cpp.

284  {
285  if (current_frag == num_frag_parts) {
286  merge_and_put_fragment_lists(start, current_row - 1, num_frag_parts,
287  choice_lists, ratings);
288  return;
289  }
290 
291  for (inT16 x = current_row; x < num_blobs; x++) {
292  BLOB_CHOICE_LIST *choices = ratings->get(current_row, x);
293  if (choices == NULL)
294  continue;
295 
296  fill_filtered_fragment_list(choices, current_frag, num_frag_parts,
297  &choice_lists[current_frag]);
298  if (!choice_lists[current_frag].empty()) {
299  get_fragment_lists(current_frag + 1, x + 1, start, num_frag_parts,
300  num_blobs, ratings, choice_lists);
301  choice_lists[current_frag].clear();
302  }
303  }
304 }
T get(ICOORD pos) const
Definition: matrix.h:223
int16_t inT16
Definition: host.h:36
void fill_filtered_fragment_list(BLOB_CHOICE_LIST *choices, int fragment_pos, int num_frag_parts, BLOB_CHOICE_LIST *filtered_choices)
Definition: pieces.cpp:105
void get_fragment_lists(inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
Definition: pieces.cpp:281
void merge_and_put_fragment_lists(inT16 row, inT16 column, inT16 num_frag_parts, BLOB_CHOICE_LIST *choice_lists, MATRIX *ratings)
Definition: pieces.cpp:138

◆ grade_sharpness()

PRIORITY tesseract::Wordrec::grade_sharpness ( register SPLIT split)

Definition at line 74 of file gradechop.cpp.

74  {
75  PRIORITY grade;
76 
77  grade = point_priority (split->point1) + point_priority (split->point2);
78 
79  if (grade < -360.0)
80  grade = 0;
81  else
82  grade += 360.0;
83 
84  grade *= chop_sharpness_knob; /* Values 0 to -360 */
85 
86  return (grade);
87 }
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:54
double chop_sharpness_knob
Definition: wordrec.h:154
float PRIORITY
Definition: seam.h:42

◆ grade_split_length()

PRIORITY tesseract::Wordrec::grade_split_length ( register SPLIT split)

Definition at line 51 of file gradechop.cpp.

51  {
52  PRIORITY grade;
53  float split_length;
54 
55  split_length =
56  split->point1->WeightedDistance(*split->point2, chop_x_y_weight);
57 
58  if (split_length <= 0)
59  grade = 0;
60  else
61  grade = sqrt (split_length) * chop_split_dist_knob;
62 
63  return (MAX (0.0, grade));
64 }
float PRIORITY
Definition: seam.h:42
double chop_split_dist_knob
Definition: wordrec.h:149
#define MAX(x, y)
Definition: ndminx.h:24

◆ improve_by_chopping()

void tesseract::Wordrec::improve_by_chopping ( float  rating_cert_scale,
WERD_RES word,
BestChoiceBundle best_choice_bundle,
BlamerBundle blamer_bundle,
LMPainPoints pain_points,
GenericVector< SegSearchPending > *  pending 
)

Definition at line 456 of file chopper.cpp.

461  {
462  int blob_number;
463  do { // improvement loop.
464  // Make a simple vector of BLOB_CHOICEs to make it easy to pick which
465  // one to chop.
466  GenericVector<BLOB_CHOICE*> blob_choices;
467  int num_blobs = word->ratings->dimension();
468  for (int i = 0; i < num_blobs; ++i) {
469  BLOB_CHOICE_LIST* choices = word->ratings->get(i, i);
470  if (choices == NULL || choices->empty()) {
471  blob_choices.push_back(NULL);
472  } else {
473  BLOB_CHOICE_IT bc_it(choices);
474  blob_choices.push_back(bc_it.data());
475  }
476  }
477  SEAM* seam = improve_one_blob(blob_choices, &best_choice_bundle->fixpt,
478  false, false, word, &blob_number);
479  if (seam == NULL) break;
480  // A chop has been made. We have to correct all the data structures to
481  // take into account the extra bottom-level blob.
482  // Put the seam into the seam_array and correct everything else on the
483  // word: ratings matrix (including matrix location in the BLOB_CHOICES),
484  // states in WERD_CHOICEs, and blob widths.
485  word->InsertSeam(blob_number, seam);
486  // Insert a new entry in the beam array.
487  best_choice_bundle->beam.insert(new LanguageModelState, blob_number);
488  // Fixpts are outdated, but will get recalculated.
489  best_choice_bundle->fixpt.clear();
490  // Remap existing pain points.
491  pain_points->RemapForSplit(blob_number);
492  // Insert a new pending at the chop point.
493  pending->insert(SegSearchPending(), blob_number);
494 
495  // Classify the two newly created blobs using ProcessSegSearchPainPoint,
496  // as that updates the pending correctly and adds new pain points.
497  MATRIX_COORD pain_point(blob_number, blob_number);
498  ProcessSegSearchPainPoint(0.0f, pain_point, "Chop1", pending, word,
499  pain_points, blamer_bundle);
500  pain_point.col = blob_number + 1;
501  pain_point.row = blob_number + 1;
502  ProcessSegSearchPainPoint(0.0f, pain_point, "Chop2", pending, word,
503  pain_points, blamer_bundle);
505  // N-gram evaluation depends on the number of blobs in a chunk, so we
506  // have to re-evaluate everything in the word.
507  ResetNGramSearch(word, best_choice_bundle, pending);
508  blob_number = 0;
509  }
510  // Run language model incrementally. (Except with the n-gram model on.)
511  UpdateSegSearchNodes(rating_cert_scale, blob_number, pending,
512  word, pain_points, best_choice_bundle, blamer_bundle);
513  } while (!language_model_->AcceptableChoiceFound() &&
514  word->ratings->dimension() < kMaxNumChunks);
515 
516  // If after running only the chopper best_choice is incorrect and no blame
517  // has been yet set, blame the classifier if best_choice is classifier's
518  // top choice and is a dictionary word (i.e. language model could not have
519  // helped). Otherwise blame the tradeoff between the classifier and
520  // the old language model (permuters).
521  if (word->blamer_bundle != NULL &&
523  !word->blamer_bundle->ChoiceIsCorrect(word->best_choice)) {
524  bool valid_permuter = word->best_choice != NULL &&
527  getDict().getUnicharset(),
528  valid_permuter,
530  }
531 }
WERD_CHOICE * best_choice
Definition: pageres.h:219
Dict & getDict()
Definition: classify.h:65
BlamerBundle * blamer_bundle
Definition: pageres.h:230
static bool valid_word_permuter(uinT8 perm, bool numbers_ok)
Check all the DAWGs to see if this word is in any of them.
Definition: dict.h:455
SEAM * improve_one_blob(const GenericVector< BLOB_CHOICE *> &blob_choices, DANGERR *fixpt, bool split_next_to_fragment, bool italic_blob, WERD_RES *word, int *blob_number)
Definition: chopper.cpp:329
T get(ICOORD pos) const
Definition: matrix.h:223
int push_back(T object)
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:194
void ResetNGramSearch(WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, GenericVector< SegSearchPending > *pending)
Definition: segsearch.cpp:325
void BlameClassifierOrLangModel(const WERD_RES *word, const UNICHARSET &unicharset, bool valid_permuter, bool debug)
Definition: blamer.cpp:369
MATRIX * ratings
Definition: pageres.h:215
IncorrectResultReason incorrect_result_reason() const
Definition: blamer.h:106
uinT8 permuter() const
Definition: ratngs.h:344
void InsertSeam(int blob_number, SEAM *seam)
Definition: pageres.cpp:410
void insert(T t, int index)
Definition: seam.h:44
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:262
int dimension() const
Definition: matrix.h:521
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:111
LanguageModel * language_model_
Definition: wordrec.h:410
bool wordrec_debug_blamer
Definition: wordrec.h:167

◆ improve_one_blob()

SEAM * tesseract::Wordrec::improve_one_blob ( const GenericVector< BLOB_CHOICE *> &  blob_choices,
DANGERR *  fixpt,
bool  split_next_to_fragment,
bool  italic_blob,
WERD_RES *  word,
int *  blob_number 
)

Definition at line 329 of file chopper.cpp.

334  {
     // Repeatedly selects a blob to split and tries to chop it, until a seam
     // is produced or no candidate blob remains.
     //   blob_choices           - indexed by blob number; entries may be NULL.
     //   fixpt                  - consulted first for a blob to split; cleared
     //                            as soon as it has supplied one.
     //   split_next_to_fragment - forwarded to select_blob_to_split().
     //   italic_blob            - forwarded to chop_numbered_blob().
     //   word                   - supplies chopped_word and seam_array.
     //   blob_number            - out: index of the last blob selected, or -1.
     // Returns the new seam, or NULL when no blob could be selected or the
     // selected blob has no entry in blob_choices.
335  float rating_ceiling = MAX_FLOAT32;
336  SEAM *seam = NULL;
337  do {
     // First preference: a blob suggested by fixpt (-1 means none).
338  *blob_number = select_blob_to_split_from_fixpt(fixpt);
339  if (chop_debug) tprintf("blob_number from fixpt = %d\n", *blob_number);
340  bool split_point_from_dict = (*blob_number != -1);
341  if (split_point_from_dict) {
342  fixpt->clear();
343  } else {
     // Fallback: choose from blob_choices, bounded by rating_ceiling.
344  *blob_number = select_blob_to_split(blob_choices, rating_ceiling,
345  split_next_to_fragment);
346  }
347  if (chop_debug) tprintf("blob_number = %d\n", *blob_number);
     // Neither source produced a candidate: give up.
348  if (*blob_number == -1)
349  return NULL;
350 
351  // TODO(rays) it may eventually help to allow italic_blob to be true,
352  seam = chop_numbered_blob(word->chopped_word, *blob_number, italic_blob,
353  word->seam_array);
354  if (seam != NULL)
355  return seam; // Success!
     // The chop failed; without a choice entry for this blob there is no
     // rating to use as the next ceiling, so stop.
356  if (blob_choices[*blob_number] == NULL)
357  return NULL;
358  if (!split_point_from_dict) {
359  // We chopped the worst rated blob, try something else next time.
360  rating_ceiling = blob_choices[*blob_number]->rating();
361  }
362  } while (true);
     // Not reached: the loop above only exits through a return statement.
363  return seam;
364 }
int select_blob_to_split(const GenericVector< BLOB_CHOICE *> &blob_choices, float rating_ceiling, bool split_next_to_fragment)
Definition: chopper.cpp:540
#define tprintf(...)
Definition: tprintf.h:31
int select_blob_to_split_from_fixpt(DANGERR *fixpt)
Definition: chopper.cpp:628
Definition: seam.h:44
#define MAX_FLOAT32
Definition: host.h:66
SEAM * chop_numbered_blob(TWERD *word, inT32 blob_number, bool italic_blob, const GenericVector< SEAM *> &seams)
Definition: chopper.cpp:223
TWERD * chopped_word
Definition: pageres.h:201
GenericVector< SEAM * > seam_array
Definition: pageres.h:203

◆ InitBlamerForSegSearch()

void tesseract::Wordrec::InitBlamerForSegSearch ( WERD_RES *  word_res,
LMPainPoints *  pain_points,
BlamerBundle *  blamer_bundle,
STRING *  blamer_debug 
)
protected

Definition at line 342 of file segsearch.cpp.

345  {
346  pain_points->Clear(); // Clear pain points heap.
348  pain_points, &LMPainPoints::GenerateForBlamer,
349  static_cast<double>(segsearch_max_char_wh_ratio), word_res);
350  blamer_bundle->InitForSegSearch(word_res->best_choice, word_res->ratings,
351  getDict().WildcardID(), wordrec_debug_blamer,
352  blamer_debug, pp_cb);
353  delete pp_cb;
354 }
_ConstTessMemberResultCallback_0_0< false, R, T1 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)() const)
Definition: tesscallback.h:116
WERD_CHOICE * best_choice
Definition: pageres.h:219
Dict & getDict()
Definition: classify.h:65
MATRIX * ratings
Definition: pageres.h:215
double segsearch_max_char_wh_ratio
Definition: wordrec.h:175
bool wordrec_debug_blamer
Definition: wordrec.h:167
bool GenerateForBlamer(double max_char_wh_ratio, WERD_RES *word_res, int col, int row)
void InitForSegSearch(const WERD_CHOICE *best_choice, MATRIX *ratings, UNICHAR_ID wildcard_id, bool debug, STRING *debug_str, TessResultCallback2< bool, int, int > *pp_cb)
Definition: blamer.cpp:473

◆ InitialSegSearch()

void tesseract::Wordrec::InitialSegSearch ( WERD_RES *  word_res,
LMPainPoints *  pain_points,
GenericVector< SegSearchPending > *  pending,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)

Definition at line 150 of file segsearch.cpp.

153  {
154  if (segsearch_debug_level > 0) {
155  tprintf("Starting SegSearch on ratings matrix%s:\n",
156  wordrec_enable_assoc ? " (with assoc)" : "");
157  word_res->ratings->print(getDict().getUnicharset());
158  }
159 
160  pain_points->GenerateInitial(word_res);
161 
162  // Compute scaling factor that will help us recover blob outline length
163  // from classifier rating and certainty for the blob.
164  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
165 
168  segsearch_max_char_wh_ratio, rating_cert_scale);
169 
170  // Initialize blamer-related information: map character boxes recorded in
171  // blamer_bundle->norm_truth_word to the corresponding i,j indices in the
172  // ratings matrix. We expect this step to succeed, since when running the
173  // chopper we checked that the correct chops are present.
174  if (blamer_bundle != NULL) {
175  blamer_bundle->SetupCorrectSegmentation(word_res->chopped_word,
177  }
178 
179  // pending[col] tells whether there is update work to do to combine
180  // best_choice_bundle->beam[col - 1] with some BLOB_CHOICEs in matrix[col, *].
181  // As the language model state is updated, pending entries are modified to
182  // minimize duplication of work. It is important that during the update the
183  // children are considered in the non-decreasing order of their column, since
184  // this guarantees that all the parents would be up to date before an update
185  // of a child is done.
186  pending->init_to_size(word_res->ratings->dimension(), SegSearchPending());
187 
188  // Search the ratings matrix for the initial best path.
189  (*pending)[0].SetColumnClassified();
190  UpdateSegSearchNodes(rating_cert_scale, 0, pending, word_res,
191  pain_points, best_choice_bundle, blamer_bundle);
192 }
double certainty_scale
Definition: dict.h:611
void init_to_size(int size, T t)
Dict & getDict()
Definition: classify.h:65
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:112
bool wordrec_enable_assoc
Definition: wordrec.h:130
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
#define tprintf(...)
Definition: tprintf.h:31
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:194
MATRIX * ratings
Definition: pageres.h:215
double segsearch_max_char_wh_ratio
Definition: wordrec.h:175
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:161
int dimension() const
Definition: matrix.h:521
int segsearch_debug_level
Definition: wordrec.h:169
void SetupCorrectSegmentation(const TWERD *word, bool debug)
Definition: blamer.cpp:407
LanguageModel * language_model_
Definition: wordrec.h:410
bool wordrec_debug_blamer
Definition: wordrec.h:167
TWERD * chopped_word
Definition: pageres.h:201
WERD_CHOICE * prev_word_best_choice_
Definition: wordrec.h:415

◆ is_inside_angle()

bool tesseract::Wordrec::is_inside_angle ( EDGEPT *  pt)

Definition at line 78 of file chop.cpp.

78  {
    // True when the angle change measured across pt's previous and next
    // outline points is below the chop_inside_angle threshold (declared
    // outside this listing).
79  return angle_change(pt->prev, pt, pt->next) < chop_inside_angle;
80 }
EDGEPT * prev
Definition: blobs.h:170
int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
Definition: chop.cpp:88
EDGEPT * next
Definition: blobs.h:169

◆ merge_and_put_fragment_lists()

void tesseract::Wordrec::merge_and_put_fragment_lists ( inT16  row,
inT16  column,
inT16  num_frag_parts,
BLOB_CHOICE_LIST *  choice_lists,
MATRIX *  ratings 
)

Definition at line 138 of file pieces.cpp.

141  {
142  BLOB_CHOICE_IT *choice_lists_it = new BLOB_CHOICE_IT[num_frag_parts];
143 
144  for (int i = 0; i < num_frag_parts; i++) {
145  choice_lists_it[i].set_to_list(&choice_lists[i]);
146  choice_lists_it[i].mark_cycle_pt();
147  }
148 
149  BLOB_CHOICE_LIST *merged_choice = ratings->get(row, column);
150  if (merged_choice == NULL)
151  merged_choice = new BLOB_CHOICE_LIST;
152 
153  bool end_of_list = false;
154  BLOB_CHOICE_IT merged_choice_it(merged_choice);
155  while (!end_of_list) {
156  // Find the maximum unichar_id of the current entry the iterators
157  // are pointing at
158  UNICHAR_ID max_unichar_id = choice_lists_it[0].data()->unichar_id();
159  for (int i = 0; i < num_frag_parts; i++) {
160  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
161  if (max_unichar_id < unichar_id) {
162  max_unichar_id = unichar_id;
163  }
164  }
165 
166  // Move the each iterators until it gets to an entry that has a
167  // value greater than or equal to max_unichar_id
168  for (int i = 0; i < num_frag_parts; i++) {
169  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
170  while (!choice_lists_it[i].cycled_list() &&
171  unichar_id < max_unichar_id) {
172  choice_lists_it[i].forward();
173  unichar_id = choice_lists_it[i].data()->unichar_id();
174  }
175  if (choice_lists_it[i].cycled_list()) {
176  end_of_list = true;
177  break;
178  }
179  }
180 
181  if (end_of_list)
182  break;
183 
184  // Checks if the fragments are parts of the same character
185  UNICHAR_ID first_unichar_id = choice_lists_it[0].data()->unichar_id();
186  bool same_unichar = true;
187  for (int i = 1; i < num_frag_parts; i++) {
188  UNICHAR_ID unichar_id = choice_lists_it[i].data()->unichar_id();
189  if (unichar_id != first_unichar_id) {
190  same_unichar = false;
191  break;
192  }
193  }
194 
195  if (same_unichar) {
196  // Add the merged character to the result
197  UNICHAR_ID merged_unichar_id = first_unichar_id;
198  GenericVector<ScoredFont> merged_fonts =
199  choice_lists_it[0].data()->fonts();
200  float merged_min_xheight = choice_lists_it[0].data()->min_xheight();
201  float merged_max_xheight = choice_lists_it[0].data()->max_xheight();
202  float positive_yshift = 0, negative_yshift = 0;
203  int merged_script_id = choice_lists_it[0].data()->script_id();
204  BlobChoiceClassifier classifier = choice_lists_it[0].data()->classifier();
205 
206  float merged_rating = 0, merged_certainty = 0;
207  for (int i = 0; i < num_frag_parts; i++) {
208  float rating = choice_lists_it[i].data()->rating();
209  float certainty = choice_lists_it[i].data()->certainty();
210 
211  if (i == 0 || certainty < merged_certainty)
212  merged_certainty = certainty;
213  merged_rating += rating;
214 
215  choice_lists_it[i].forward();
216  if (choice_lists_it[i].cycled_list())
217  end_of_list = true;
218  IntersectRange(choice_lists_it[i].data()->min_xheight(),
219  choice_lists_it[i].data()->max_xheight(),
220  &merged_min_xheight, &merged_max_xheight);
221  float yshift = choice_lists_it[i].data()->yshift();
222  if (yshift > positive_yshift) positive_yshift = yshift;
223  if (yshift < negative_yshift) negative_yshift = yshift;
224  // Use the min font rating over the parts.
225  // TODO(rays) font lists are unsorted. Need to be faster?
226  const GenericVector<ScoredFont>& frag_fonts =
227  choice_lists_it[i].data()->fonts();
228  for (int f = 0; f < frag_fonts.size(); ++f) {
229  int merged_f = 0;
230  for (merged_f = 0; merged_f < merged_fonts.size() &&
231  merged_fonts[merged_f].fontinfo_id != frag_fonts[f].fontinfo_id;
232  ++merged_f) {}
233  if (merged_f == merged_fonts.size()) {
234  merged_fonts.push_back(frag_fonts[f]);
235  } else if (merged_fonts[merged_f].score > frag_fonts[f].score) {
236  merged_fonts[merged_f].score = frag_fonts[f].score;
237  }
238  }
239  }
240 
241  float merged_yshift = positive_yshift != 0
242  ? (negative_yshift != 0 ? 0 : positive_yshift)
243  : negative_yshift;
244  BLOB_CHOICE* choice = new BLOB_CHOICE(merged_unichar_id,
245  merged_rating,
246  merged_certainty,
247  merged_script_id,
248  merged_min_xheight,
249  merged_max_xheight,
250  merged_yshift,
251  classifier);
252  choice->set_fonts(merged_fonts);
253  merged_choice_it.add_to_end(choice);
254  }
255  }
256 
258  print_ratings_list("Merged Fragments", merged_choice,
259  unicharset);
260 
261  if (merged_choice->empty())
262  delete merged_choice;
263  else
264  ratings->put(row, column, merged_choice);
265 
266  delete [] choice_lists_it;
267 }
int UNICHAR_ID
Definition: unichar.h:33
T get(ICOORD pos) const
Definition: matrix.h:223
int push_back(T object)
void set_fonts(const GenericVector< tesseract::ScoredFont > &fonts)
Definition: ratngs.h:94
int size() const
Definition: genericvector.h:72
void IntersectRange(const T &lower1, const T &upper1, T *lower2, T *upper2)
Definition: helpers.h:153
UNICHARSET unicharset
Definition: ccutil.h:68
void put(ICOORD pos, const T &thing)
Definition: matrix.h:215
BlobChoiceClassifier
Definition: ratngs.h:40
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:819

◆ merge_fragments()

void tesseract::Wordrec::merge_fragments ( MATRIX *  ratings,
inT16  num_blobs 
)

Definition at line 313 of file pieces.cpp.

313  {
     // Two passes over the ratings matrix:
     //  1. for every starting blob and every fragment count from 2 up to
     //     CHAR_FRAGMENT::kMaxChunks, call get_fragment_lists();
     //  2. remove every remaining fragment choice from the matrix.
     // choice_lists is scratch storage handed to get_fragment_lists().
314  BLOB_CHOICE_LIST choice_lists[CHAR_FRAGMENT::kMaxChunks];
315  for (inT16 start = 0; start < num_blobs; start++) {
316  for (int frag_parts = 2; frag_parts <= CHAR_FRAGMENT::kMaxChunks;
317  frag_parts++) {
     // Arguments: current_frag = 0, current_row = start, start = start.
318  get_fragment_lists(0, start, start, frag_parts, num_blobs,
319  ratings, choice_lists);
320  }
321  }
322 
323  // Delete fragments from the rating matrix
     // Only cells with y >= x are visited.
324  for (inT16 x = 0; x < num_blobs; x++) {
325  for (inT16 y = x; y < num_blobs; y++) {
326  BLOB_CHOICE_LIST *choices = ratings->get(x, y);
327  if (choices != NULL) {
328  BLOB_CHOICE_IT choices_it(choices);
329  for (choices_it.mark_cycle_pt(); !choices_it.cycled_list();
330  choices_it.forward()) {
331  UNICHAR_ID choice_unichar_id = choices_it.data()->unichar_id();
     // A non-NULL result marks this unichar id as a character fragment.
332  const CHAR_FRAGMENT *frag =
333  unicharset.get_fragment(choice_unichar_id);
334  if (frag != NULL)
     // Unlink the choice from the list and free it.
335  delete choices_it.extract();
336  }
337  }
338  }
339  }
340 }
int UNICHAR_ID
Definition: unichar.h:33
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:694
static const int kMaxChunks
Definition: unicharset.h:49
T get(ICOORD pos) const
Definition: matrix.h:223
int16_t inT16
Definition: host.h:36
void get_fragment_lists(inT16 current_frag, inT16 current_row, inT16 start, inT16 num_frag_parts, inT16 num_blobs, MATRIX *ratings, BLOB_CHOICE_LIST *choice_lists)
Definition: pieces.cpp:281
UNICHARSET unicharset
Definition: ccutil.h:68

◆ near_point()

bool tesseract::Wordrec::near_point ( EDGEPT *  point,
EDGEPT *  line_pt_0,
EDGEPT *  line_pt_1,
EDGEPT **  near_pt 
)

Definition at line 49 of file outlines.cpp.

51  {
    // Projects `point` onto the line through line_pt_0 and line_pt_1.
    // If the projection passes is_on_line() and differs from both end
    // points, a new edge point is created there, stored in *near_pt, and
    // true is returned. Otherwise *near_pt receives the result of the
    // closest() macro for the two end points and false is returned.
52  TPOINT p;
53 
54  float slope;
55  float intercept;
56 
    // Work in float so the slope arithmetic below is not integer division.
57  float x0 = line_pt_0->pos.x;
58  float x1 = line_pt_1->pos.x;
59  float y0 = line_pt_0->pos.y;
60  float y1 = line_pt_1->pos.y;
61 
62  if (x0 == x1) {
63  /* Handle vertical line */
    // The slope is undefined here; the projection keeps point's y.
64  p.x = (inT16) x0;
65  p.y = point->pos.y;
66  }
67  else {
68  /* Slope and intercept */
69  slope = (y0 - y1) / (x0 - x1);
70  intercept = y1 - x1 * slope;
71 
72  /* Find perpendicular */
    // Both coordinates are truncated to inT16 by the casts.
73  p.x = (inT16) ((point->pos.x + (point->pos.y - intercept) * slope) /
74  (slope * slope + 1));
75  p.y = (inT16) (slope * p.x + intercept);
76  }
77 
78  if (is_on_line (p, line_pt_0->pos, line_pt_1->pos) &&
79  (!same_point (p, line_pt_0->pos)) && (!same_point (p, line_pt_1->pos))) {
80  /* Intersection on line */
    // make_edgept(x, y, next, prev): line_pt_1 is passed as next and
    // line_pt_0 as prev.
81  *near_pt = make_edgept(p.x, p.y, line_pt_1, line_pt_0);
82  return true;
83  } else { /* Intersection not on line */
84  *near_pt = closest(point, line_pt_0, line_pt_1);
85  return false;
86  }
87 }
EDGEPT * make_edgept(int x, int y, EDGEPT *next, EDGEPT *prev)
Definition: split.cpp:147
TPOINT pos
Definition: blobs.h:163
int16_t inT16
Definition: host.h:36
inT16 x
Definition: blobs.h:71
#define closest(test_p, p1, p2)
Definition: outlines.h:71
#define same_point(p1, p2)
Definition: outlines.h:49
inT16 y
Definition: blobs.h:72
Definition: blobs.h:50
#define is_on_line(p, p0, p1)
Definition: outlines.h:120

◆ new_max_point()

void tesseract::Wordrec::new_max_point ( EDGEPT *  local_max,
PointHeap *  points 
)

Definition at line 245 of file chop.cpp.

245  {
     // Adds local_max to `points` when direction(local_max) is positive, or
     // when it is zero and point_priority(local_max) is negative. In every
     // other case the point is not added.
246  inT16 dir;
247 
248  dir = direction (local_max);
249 
250  if (dir > 0) {
251  add_point_to_list(points, local_max);
252  return;
253  }
254 
     // Zero direction: accept only points with a negative priority.
255  if (dir == 0 && point_priority (local_max) < 0) {
256  add_point_to_list(points, local_max);
257  return;
258  }
259 }
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:54
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
int16_t inT16
Definition: host.h:36
void add_point_to_list(PointHeap *point_heap, EDGEPT *point)
Definition: chop.cpp:64

◆ new_min_point()

void tesseract::Wordrec::new_min_point ( EDGEPT *  local_min,
PointHeap *  points 
)

Definition at line 221 of file chop.cpp.

221  {
     // Adds local_min to `points` when direction(local_min) is negative, or
     // when it is zero and point_priority(local_min) is negative. In every
     // other case the point is not added.
222  inT16 dir;
223 
224  dir = direction (local_min);
225 
226  if (dir < 0) {
227  add_point_to_list(points, local_min);
228  return;
229  }
230 
     // Zero direction: accept only points with a negative priority.
231  if (dir == 0 && point_priority (local_min) < 0) {
232  add_point_to_list(points, local_min);
233  return;
234  }
235 }
PRIORITY point_priority(EDGEPT *point)
Definition: chop.cpp:54
int direction(EDGEPT *point)
Definition: vecfuncs.cpp:43
int16_t inT16
Definition: host.h:36
void add_point_to_list(PointHeap *point_heap, EDGEPT *point)
Definition: chop.cpp:64

◆ pick_close_point()

EDGEPT * tesseract::Wordrec::pick_close_point ( EDGEPT *  critical_point,
EDGEPT *  vertical_point,
int *  best_dist 
)

Definition at line 124 of file chop.cpp.

126  {
127  EDGEPT *best_point = NULL;
128  int this_distance;
129  int found_better;
130 
131  do {
132  found_better = FALSE;
133 
134  this_distance = edgept_dist (critical_point, vertical_point);
135  if (this_distance <= *best_dist) {
136 
137  if (!(same_point (critical_point->pos, vertical_point->pos) ||
138  same_point (critical_point->pos, vertical_point->next->pos) ||
139  (best_point && same_point (best_point->pos, vertical_point->pos)) ||
140  is_exterior_point (critical_point, vertical_point))) {
141  *best_dist = this_distance;
142  best_point = vertical_point;
144  found_better = TRUE;
145  }
146  }
147  vertical_point = vertical_point->next;
148  }
149  while (found_better == TRUE);
150 
151  return (best_point);
152 }
TPOINT pos
Definition: blobs.h:163
#define TRUE
Definition: capi.h:45
#define edgept_dist(p1, p2)
Definition: outlines.h:87
#define FALSE
Definition: capi.h:46
#define is_exterior_point(edge, point)
Definition: outlines.h:97
EDGEPT * next
Definition: blobs.h:169
#define same_point(p1, p2)
Definition: outlines.h:49
Definition: blobs.h:76
bool chop_vertical_creep
Definition: wordrec.h:141

◆ pick_good_seam()

SEAM * tesseract::Wordrec::pick_good_seam ( TBLOB *  blob)

Definition at line 215 of file findseam.cpp.

215  {
216  SeamPile seam_pile(chop_seam_pile_size);
217  EDGEPT *points[MAX_NUM_POINTS];
218  EDGEPT_CLIST new_points;
219  SEAM *seam = NULL;
220  TESSLINE *outline;
221  inT16 num_points = 0;
222 
223 #ifndef GRAPHICS_DISABLED
224  if (chop_debug > 2)
225  wordrec_display_splits.set_value(true);
226 
227  draw_blob_edges(blob);
228 #endif
229 
230  PointHeap point_heap(MAX_NUM_POINTS);
231  for (outline = blob->outlines; outline; outline = outline->next)
232  prioritize_points(outline, &point_heap);
233 
234  while (!point_heap.empty() && num_points < MAX_NUM_POINTS) {
235  points[num_points++] = point_heap.PeekTop().data;
236  point_heap.Pop(NULL);
237  }
238 
239  /* Initialize queue */
240  SeamQueue seam_queue(MAX_NUM_SEAMS);
241 
242  try_point_pairs(points, num_points, &seam_queue, &seam_pile, &seam, blob);
243  try_vertical_splits(points, num_points, &new_points,
244  &seam_queue, &seam_pile, &seam, blob);
245 
246  if (seam == NULL) {
247  choose_best_seam(&seam_queue, NULL, BAD_PRIORITY, &seam, blob, &seam_pile);
248  } else if (seam->priority() > chop_good_split) {
249  choose_best_seam(&seam_queue, NULL, seam->priority(), &seam, blob,
250  &seam_pile);
251  }
252 
253  EDGEPT_C_IT it(&new_points);
254  for (it.mark_cycle_pt(); !it.cycled_list(); it.forward()) {
255  EDGEPT *inserted_point = it.data();
256  if (seam == NULL || !seam->UsesPoint(inserted_point)) {
257  for (outline = blob->outlines; outline; outline = outline->next) {
258  if (outline->loop == inserted_point) {
259  outline->loop = outline->loop->next;
260  }
261  }
262  remove_edgept(inserted_point);
263  }
264  }
265 
266  if (seam) {
267  if (seam->priority() > chop_ok_split) {
268  delete seam;
269  seam = NULL;
270  }
271 #ifndef GRAPHICS_DISABLED
272  else if (wordrec_display_splits) {
273  seam->Mark(edge_window);
274  if (chop_debug > 2) {
277  }
278  }
279 #endif
280  }
281 
282  if (chop_debug)
283  wordrec_display_splits.set_value(false);
284 
285  return (seam);
286 }
TESSLINE * next
Definition: blobs.h:258
#define edge_window_wait()
Definition: plotedges.h:57
bool wordrec_display_splits
Definition: split.cpp:49
void draw_blob_edges(TBLOB *blob)
Definition: plotedges.cpp:77
double chop_ok_split
Definition: wordrec.h:156
TESSLINE * outlines
Definition: blobs.h:377
void remove_edgept(EDGEPT *point)
Definition: split.cpp:208
void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
Definition: findseam.cpp:102
void try_point_pairs(EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
Definition: findseam.cpp:296
int16_t inT16
Definition: host.h:36
#define MAX_NUM_POINTS
Definition: chop.h:39
#define MAX_NUM_SEAMS
Definition: findseam.cpp:45
int chop_seam_pile_size
Definition: wordrec.h:145
EDGEPT * loop
Definition: blobs.h:257
Definition: seam.h:44
void prioritize_points(TESSLINE *outline, PointHeap *points)
Definition: chop.cpp:162
float priority() const
Definition: seam.h:65
EDGEPT * next
Definition: blobs.h:169
Definition: blobs.h:76
bool UsesPoint(const EDGEPT *point) const
Definition: seam.h:88
void try_vertical_splits(EDGEPT *points[MAX_NUM_POINTS], inT16 num_points, EDGEPT_CLIST *new_points, SeamQueue *seam_queue, SeamPile *seam_pile, SEAM **seam, TBLOB *blob)
Definition: findseam.cpp:334
double chop_good_split
Definition: wordrec.h:157
void Mark(ScrollView *window) const
Definition: seam.cpp:186
ScrollView * edge_window
Definition: plotedges.cpp:43
#define update_edge_window()
Definition: plotedges.h:45
#define BAD_PRIORITY
Definition: findseam.cpp:50

◆ point_priority()

PRIORITY tesseract::Wordrec::point_priority ( EDGEPT *  point)

Definition at line 54 of file chop.cpp.

54  {
    // The priority of a point is the angle change across its previous and
    // next outline points, converted from int to PRIORITY (a float).
55  return (PRIORITY)angle_change(point->prev, point, point->next);
56 }
EDGEPT * prev
Definition: blobs.h:170
float PRIORITY
Definition: seam.h:42
int angle_change(EDGEPT *point1, EDGEPT *point2, EDGEPT *point3)
Definition: chop.cpp:88
EDGEPT * next
Definition: blobs.h:169

◆ prioritize_points()

void tesseract::Wordrec::prioritize_points ( TESSLINE *  outline,
PointHeap *  points 
)

Definition at line 162 of file chop.cpp.

162  {
     // Walks the outline's point loop once, starting at outline->loop, and
     // offers candidate points to `points` through new_min_point(),
     // new_max_point() and add_point_to_list().
     // The sign of each point's vec.y selects the branch. local_min and
     // local_max hold the current candidate extremum of each kind; after
     // the first iteration exactly one of them is non-NULL.
163  EDGEPT *this_point;
164  EDGEPT *local_min = NULL;
165  EDGEPT *local_max = NULL;
166 
167  this_point = outline->loop;
     // Both candidates start at the first point.
168  local_min = this_point;
169  local_max = this_point;
170  do {
171  if (this_point->vec.y < 0) {
172  /* Look for minima */
     // Report the pending maximum candidate, if any, then start tracking
     // a minimum candidate at the next point.
173  if (local_max != NULL)
174  new_max_point(local_max, points);
175  else if (is_inside_angle (this_point))
176  add_point_to_list(points, this_point);
177  local_max = NULL;
178  local_min = this_point->next;
179  }
180  else if (this_point->vec.y > 0) {
181  /* Look for maxima */
     // Mirror of the branch above with the roles of min and max swapped.
182  if (local_min != NULL)
183  new_min_point(local_min, points);
184  else if (is_inside_angle (this_point))
185  add_point_to_list(points, this_point);
186  local_min = NULL;
187  local_max = this_point->next;
188  }
189  else {
190  /* Flat area */
     // vec.y == 0: keep the kind of candidate being tracked and move it
     // to the next point. The old candidate is reported only if the
     // vector leading into it was not flat as well.
191  if (local_max != NULL) {
192  if (local_max->prev->vec.y != 0) {
193  new_max_point(local_max, points);
194  }
195  local_max = this_point->next;
196  local_min = NULL;
197  }
198  else {
     // local_max is NULL here, so local_min is non-NULL: every branch
     // leaves at least one of the two candidates set.
199  if (local_min->prev->vec.y != 0) {
200  new_min_point(local_min, points);
201  }
202  local_min = this_point->next;
203  local_max = NULL;
204  }
205  }
206 
207  /* Next point */
208  this_point = this_point->next;
209  }
210  while (this_point != outline->loop);
211 }
void new_max_point(EDGEPT *local_max, PointHeap *points)
Definition: chop.cpp:245
void new_min_point(EDGEPT *local_min, PointHeap *points)
Definition: chop.cpp:221
EDGEPT * prev
Definition: blobs.h:170
VECTOR vec
Definition: blobs.h:164
EDGEPT * loop
Definition: blobs.h:257
EDGEPT * next
Definition: blobs.h:169
Definition: blobs.h:76
void add_point_to_list(PointHeap *point_heap, EDGEPT *point)
Definition: chop.cpp:64
inT16 y
Definition: blobs.h:72
bool is_inside_angle(EDGEPT *pt)
Definition: chop.cpp:78

◆ ProcessSegSearchPainPoint()

void tesseract::Wordrec::ProcessSegSearchPainPoint ( float  pain_point_priority,
const MATRIX_COORD &  pain_point,
const char *  pain_point_type,
GenericVector< SegSearchPending > *  pending,
WERD_RES *  word_res,
LMPainPoints *  pain_points,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 262 of file segsearch.cpp.

266  {
     // Classifies the piece addressed by pain_point (col, row), stores the
     // resulting choices in the ratings matrix, optionally generates new
     // pain points next to it, and marks the blob as classified in
     // (*pending)[pain_point.col].
     // pain_point_priority is used only in the debug message;
     // pain_point_type is also passed to classify_piece() as its
     // description argument.
267  if (segsearch_debug_level > 0) {
268  tprintf("Classifying pain point %s priority=%.4f, col=%d, row=%d\n",
269  pain_point_type, pain_point_priority,
270  pain_point.col, pain_point.row);
271  }
272  ASSERT_HOST(pain_points != NULL);
273  MATRIX *ratings = word_res->ratings;
274  // Classify blob [pain_point.col pain_point.row]
     // If the coordinate is not valid for the matrix, widen the band to
     // row + 1 - col first; the assert below checks that this was enough.
275  if (!pain_point.Valid(*ratings)) {
276  ratings->IncreaseBandSize(pain_point.row + 1 - pain_point.col);
277  }
278  ASSERT_HOST(pain_point.Valid(*ratings));
279  BLOB_CHOICE_LIST *classified = classify_piece(word_res->seam_array,
280  pain_point.col, pain_point.row,
281  pain_point_type,
282  word_res->chopped_word,
283  blamer_bundle);
284  BLOB_CHOICE_LIST *lst = ratings->get(pain_point.col, pain_point.row);
285  if (lst == NULL) {
     // Empty cell: the matrix stores the new list directly.
286  ratings->put(pain_point.col, pain_point.row, classified);
287  } else {
288  // We can not delete old BLOB_CHOICEs, since they might contain
289  // ViterbiStateEntries that are parents of other "active" entries.
290  // Thus if the matrix cell already contains classifications we add
291  // the new ones to the beginning of the list.
292  BLOB_CHOICE_IT it(lst);
293  it.add_list_before(classified);
294  delete classified; // safe to delete, since empty after add_list_before()
     // NULL also disables the pain point generation further down.
295  classified = NULL;
296  }
297 
298  if (segsearch_debug_level > 0) {
299  print_ratings_list("Updated ratings matrix with a new entry:",
300  ratings->get(pain_point.col, pain_point.row),
301  getDict().getUnicharset());
302  ratings->print(getDict().getUnicharset());
303  }
304 
305  // Insert initial "pain points" to join the newly classified blob
306  // with its left and right neighbors.
     // This runs only when the cell was empty before this call (see the
     // NULL assignment above) and classification produced a choice.
307  if (classified != NULL && !classified->empty()) {
308  if (pain_point.col > 0) {
309  pain_points->GeneratePainPoint(
310  pain_point.col - 1, pain_point.row, LM_PPTYPE_SHAPE, 0.0,
311  true, segsearch_max_char_wh_ratio, word_res);
312  }
313  if (pain_point.row + 1 < ratings->dimension()) {
314  pain_points->GeneratePainPoint(
315  pain_point.col, pain_point.row + 1, LM_PPTYPE_SHAPE, 0.0,
316  true, segsearch_max_char_wh_ratio, word_res);
317  }
318  }
319  (*pending)[pain_point.col].SetBlobClassified(pain_point.row);
320 }
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
Dict & getDict()
Definition: classify.h:65
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:112
T get(ICOORD pos) const
Definition: matrix.h:223
#define tprintf(...)
Definition: tprintf.h:31
#define ASSERT_HOST(x)
Definition: errcode.h:84
MATRIX * ratings
Definition: pageres.h:215
double segsearch_max_char_wh_ratio
Definition: wordrec.h:175
int dimension() const
Definition: matrix.h:521
void put(ICOORD pos, const T &thing)
Definition: matrix.h:215
Definition: matrix.h:563
int segsearch_debug_level
Definition: wordrec.h:169
bool Valid(const MATRIX &m) const
Definition: matrix.h:601
virtual BLOB_CHOICE_LIST * classify_piece(const GenericVector< SEAM *> &seams, inT16 start, inT16 end, const char *description, TWERD *word, BlamerBundle *blamer_bundle)
Definition: pieces.cpp:56
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:819
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:49
TWERD * chopped_word
Definition: pageres.h:201
GenericVector< SEAM * > seam_array
Definition: pageres.h:203

◆ program_editdown()

void tesseract::Wordrec::program_editdown ( inT32  elasped_time)

Definition at line 78 of file tface.cpp.

78  {
80  getDict().End();
81 }
Dict & getDict()
Definition: classify.h:65
void EndAdaptiveClassifier()
Definition: adaptmatch.cpp:456
void End()
Definition: dict.cpp:347

◆ program_editup()

void tesseract::Wordrec::program_editup ( const char *  textbase,
TessdataManager *  init_classifier,
TessdataManager *  init_dict 
)

Definition at line 46 of file tface.cpp.

48  {
49  if (textbase != NULL) imagefile = textbase;
51  InitAdaptiveClassifier(init_classifier);
52  if (init_dict) {
54  getDict().Load(lang, init_dict);
55  getDict().FinishLoad();
56  }
58 }
void InitFeatureDefs(FEATURE_DEFS_STRUCT *featuredefs)
Definition: featdefs.cpp:121
Dict & getDict()
Definition: classify.h:65
double chop_ok_split
Definition: wordrec.h:156
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:506
PRIORITY pass2_ok_split
Definition: wordrec.h:411
STRING lang
Definition: ccutil.h:66
STRING imagefile
Definition: ccutil.h:70
void InitAdaptiveClassifier(TessdataManager *mgr)
Definition: adaptmatch.cpp:527
static DawgCache * GlobalDawgCache()
Definition: dict.cpp:198
void SetupForLoad(DawgCache *dawg_cache)
Definition: dict.cpp:206
void Load(const STRING &lang, TessdataManager *data_file)
Definition: dict.cpp:224
bool FinishLoad()
Definition: dict.cpp:327

◆ ResetNGramSearch()

void tesseract::Wordrec::ResetNGramSearch ( WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
GenericVector< SegSearchPending > *  pending 
)
protected

Definition at line 325 of file segsearch.cpp.

327  {
     // Resets the search state: clears every beam entry, clears the word
     // choices held by word_res, resets best_vse, and resets `pending` so
     // that only column 0 is marked as classified.
     // NOTE(review): (*pending)[0] is accessed unconditionally, so pending
     // is expected to be non-empty here - confirm against callers.
328  // TODO(rays) More refactoring required here.
329  // Delete existing viterbi states.
330  for (int col = 0; col < best_choice_bundle->beam.size(); ++col) {
331  best_choice_bundle->beam[col]->Clear();
332  }
333  // Reset best_choice_bundle.
334  word_res->ClearWordChoices();
335  best_choice_bundle->best_vse = NULL;
336  // Clear out all existing pendings and add a new one for the first column.
337  (*pending)[0].SetColumnClassified();
338  for (int i = 1; i < pending->size(); ++i)
339  (*pending)[i].Clear();
340 }
int size() const
Definition: genericvector.h:72
void ClearWordChoices()
Definition: pageres.cpp:1175

◆ SaveAltChoices()

void tesseract::Wordrec::SaveAltChoices ( const LIST &  best_choices,
WERD_RES *  word 
)

◆ SegSearch()

void tesseract::Wordrec::SegSearch ( WERD_RES *  word_res,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)

Definition at line 37 of file segsearch.cpp.

39  {
40  LMPainPoints pain_points(segsearch_max_pain_points,
44  // Compute scaling factor that will help us recover blob outline length
45  // from classifier rating and certainty for the blob.
46  float rating_cert_scale = -1.0 * getDict().certainty_scale / rating_scale;
48  InitialSegSearch(word_res, &pain_points, &pending, best_choice_bundle,
49  blamer_bundle);
50 
51  if (!SegSearchDone(0)) { // find a better choice
52  if (chop_enable && word_res->chopped_word != NULL) {
53  improve_by_chopping(rating_cert_scale, word_res, best_choice_bundle,
54  blamer_bundle, &pain_points, &pending);
55  }
56  if (chop_debug) SEAM::PrintSeams("Final seam list:", word_res->seam_array);
57 
58  if (blamer_bundle != NULL &&
59  !blamer_bundle->ChoiceIsCorrect(word_res->best_choice)) {
60  blamer_bundle->SetChopperBlame(word_res, wordrec_debug_blamer);
61  }
62  }
63  // Keep trying to find a better path by fixing the "pain points".
64 
65  MATRIX_COORD pain_point;
66  float pain_point_priority;
67  int num_futile_classifications = 0;
68  STRING blamer_debug;
69  while (wordrec_enable_assoc &&
70  (!SegSearchDone(num_futile_classifications) ||
71  (blamer_bundle != NULL &&
72  blamer_bundle->GuidedSegsearchStillGoing()))) {
73  // Get the next valid "pain point".
74  bool found_nothing = true;
75  LMPainPointsType pp_type;
76  while ((pp_type = pain_points.Deque(&pain_point, &pain_point_priority)) !=
77  LM_PPTYPE_NUM) {
78  if (!pain_point.Valid(*word_res->ratings)) {
79  word_res->ratings->IncreaseBandSize(
80  pain_point.row - pain_point.col + 1);
81  }
82  if (pain_point.Valid(*word_res->ratings) &&
83  !word_res->ratings->Classified(pain_point.col, pain_point.row,
84  getDict().WildcardID())) {
85  found_nothing = false;
86  break;
87  }
88  }
89  if (found_nothing) {
90  if (segsearch_debug_level > 0) tprintf("Pain points queue is empty\n");
91  break;
92  }
93  ProcessSegSearchPainPoint(pain_point_priority, pain_point,
95  &pending, word_res, &pain_points, blamer_bundle);
96 
97  UpdateSegSearchNodes(rating_cert_scale, pain_point.col, &pending,
98  word_res, &pain_points, best_choice_bundle,
99  blamer_bundle);
100  if (!best_choice_bundle->updated) ++num_futile_classifications;
101 
102  if (segsearch_debug_level > 0) {
103  tprintf("num_futile_classifications %d\n", num_futile_classifications);
104  }
105 
106  best_choice_bundle->updated = false; // reset updated
107 
108  // See if it's time to terminate SegSearch or time for starting a guided
109  // search for the true path to find the blame for the incorrect best_choice.
110  if (SegSearchDone(num_futile_classifications) &&
111  blamer_bundle != NULL &&
112  blamer_bundle->GuidedSegsearchNeeded(word_res->best_choice)) {
113  InitBlamerForSegSearch(word_res, &pain_points, blamer_bundle,
114  &blamer_debug);
115  }
116  } // end while loop exploring alternative paths
117  if (blamer_bundle != NULL) {
118  blamer_bundle->FinishSegSearch(word_res->best_choice,
119  wordrec_debug_blamer, &blamer_debug);
120  }
121 
122  if (segsearch_debug_level > 0) {
123  tprintf("Done with SegSearch (AcceptableChoiceFound: %d)\n",
125  }
126 }
bool SegSearchDone(int num_futile_classifications)
Definition: wordrec.h:425
double certainty_scale
Definition: dict.h:611
WERD_CHOICE * best_choice
Definition: pageres.h:219
Dict & getDict()
Definition: classify.h:65
bool wordrec_enable_assoc
Definition: wordrec.h:130
void FinishSegSearch(const WERD_CHOICE *best_choice, bool debug, STRING *debug_str)
Definition: blamer.cpp:506
#define tprintf(...)
Definition: tprintf.h:31
void SetChopperBlame(const WERD_RES *word, bool debug)
Definition: blamer.cpp:310
void UpdateSegSearchNodes(float rating_cert_scale, int starting_col, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:194
void InitBlamerForSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle, STRING *blamer_debug)
Definition: segsearch.cpp:342
MATRIX * ratings
Definition: pageres.h:215
double segsearch_max_char_wh_ratio
Definition: wordrec.h:175
bool GuidedSegsearchStillGoing() const
Definition: blamer.cpp:501
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:161
Definition: strngs.h:45
bool GuidedSegsearchNeeded(const WERD_CHOICE *best_choice) const
Definition: blamer.cpp:461
int segsearch_max_pain_points
Definition: wordrec.h:171
void ProcessSegSearchPainPoint(float pain_point_priority, const MATRIX_COORD &pain_point, const char *pain_point_type, GenericVector< SegSearchPending > *pending, WERD_RES *word_res, LMPainPoints *pain_points, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:262
bool ChoiceIsCorrect(const WERD_CHOICE *word_choice) const
Definition: blamer.cpp:111
bool Classified(int col, int row, int wildcard_id) const
Definition: matrix.cpp:36
int segsearch_debug_level
Definition: wordrec.h:169
void improve_by_chopping(float rating_cert_scale, WERD_RES *word, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending)
Definition: chopper.cpp:456
bool Valid(const MATRIX &m) const
Definition: matrix.h:601
static void PrintSeams(const char *label, const GenericVector< SEAM *> &seams)
Definition: seam.cpp:173
LanguageModel * language_model_
Definition: wordrec.h:410
bool wordrec_debug_blamer
Definition: wordrec.h:167
void IncreaseBandSize(int bandwidth)
Definition: matrix.cpp:49
TWERD * chopped_word
Definition: pageres.h:201
void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:150
static const char * PainPointDescription(LMPainPointsType type)
GenericVector< SEAM * > seam_array
Definition: pageres.h:203

◆ SegSearchDone()

bool tesseract::Wordrec::SegSearchDone ( int  num_futile_classifications)
inline protected

Definition at line 425 of file wordrec.h.

425  {
427  num_futile_classifications >=
429  }
int segsearch_max_futile_classifications
Definition: wordrec.h:173
LanguageModel * language_model_
Definition: wordrec.h:410

◆ select_blob_to_split()

int tesseract::Wordrec::select_blob_to_split ( const GenericVector< BLOB_CHOICE *> &  blob_choices,
float  rating_ceiling,
bool  split_next_to_fragment 
)

Definition at line 540 of file chopper.cpp.

542  {
543  BLOB_CHOICE *blob_choice;
544  int x;
545  float worst = -MAX_FLOAT32;
546  int worst_index = -1;
547  float worst_near_fragment = -MAX_FLOAT32;
548  int worst_index_near_fragment = -1;
549  const CHAR_FRAGMENT **fragments = NULL;
550 
551  if (chop_debug) {
552  if (rating_ceiling < MAX_FLOAT32)
553  tprintf("rating_ceiling = %8.4f\n", rating_ceiling);
554  else
555  tprintf("rating_ceiling = No Limit\n");
556  }
557 
558  if (split_next_to_fragment && blob_choices.size() > 0) {
559  fragments = new const CHAR_FRAGMENT *[blob_choices.length()];
560  if (blob_choices[0] != NULL) {
561  fragments[0] = getDict().getUnicharset().get_fragment(
562  blob_choices[0]->unichar_id());
563  } else {
564  fragments[0] = NULL;
565  }
566  }
567 
568  for (x = 0; x < blob_choices.size(); ++x) {
569  if (blob_choices[x] == NULL) {
570  delete[] fragments;
571  return x;
572  } else {
573  blob_choice = blob_choices[x];
574  // Populate fragments for the following position.
575  if (split_next_to_fragment && x+1 < blob_choices.size()) {
576  if (blob_choices[x + 1] != NULL) {
577  fragments[x + 1] = getDict().getUnicharset().get_fragment(
578  blob_choices[x + 1]->unichar_id());
579  } else {
580  fragments[x + 1] = NULL;
581  }
582  }
583  if (blob_choice->rating() < rating_ceiling &&
584  blob_choice->certainty() < tessedit_certainty_threshold) {
585  // Update worst and worst_index.
586  if (blob_choice->rating() > worst) {
587  worst_index = x;
588  worst = blob_choice->rating();
589  }
590  if (split_next_to_fragment) {
591  // Update worst_near_fragment and worst_index_near_fragment.
592  bool expand_following_fragment =
593  (x + 1 < blob_choices.size() &&
594  fragments[x+1] != NULL && !fragments[x+1]->is_beginning());
595  bool expand_preceding_fragment =
596  (x > 0 && fragments[x-1] != NULL && !fragments[x-1]->is_ending());
597  if ((expand_following_fragment || expand_preceding_fragment) &&
598  blob_choice->rating() > worst_near_fragment) {
599  worst_index_near_fragment = x;
600  worst_near_fragment = blob_choice->rating();
601  if (chop_debug) {
602  tprintf("worst_index_near_fragment=%d"
603  " expand_following_fragment=%d"
604  " expand_preceding_fragment=%d\n",
605  worst_index_near_fragment,
606  expand_following_fragment,
607  expand_preceding_fragment);
608  }
609  }
610  }
611  }
612  }
613  }
614  delete[] fragments;
615  // TODO(daria): maybe a threshold of badness for
616  // worst_near_fragment would be useful.
617  return worst_index_near_fragment != -1 ?
618  worst_index_near_fragment : worst_index;
619 }
const UNICHARSET & getUnicharset() const
Definition: dict.h:97
bool is_ending() const
Definition: unicharset.h:102
bool is_beginning() const
Definition: unicharset.h:99
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:694
Dict & getDict()
Definition: classify.h:65
float rating() const
Definition: ratngs.h:79
#define tprintf(...)
Definition: tprintf.h:31
int size() const
Definition: genericvector.h:72
float certainty() const
Definition: ratngs.h:82
#define MAX_FLOAT32
Definition: host.h:66
double tessedit_certainty_threshold
Definition: wordrec.h:138
int length() const
Definition: genericvector.h:85

◆ select_blob_to_split_from_fixpt()

int tesseract::Wordrec::select_blob_to_split_from_fixpt ( DANGERR *  fixpt)

Definition at line 628 of file chopper.cpp.

628  {
629  if (!fixpt)
630  return -1;
631  for (int i = 0; i < fixpt->size(); i++) {
632  if ((*fixpt)[i].begin + 1 == (*fixpt)[i].end &&
633  (*fixpt)[i].dangerous &&
634  (*fixpt)[i].correct_is_ngram) {
635  return (*fixpt)[i].begin;
636  }
637  }
638  return -1;
639 }
int size() const
Definition: genericvector.h:72

◆ set_pass1()

void tesseract::Wordrec::set_pass1 ( )

Definition at line 89 of file tface.cpp.

89  {
90  chop_ok_split.set_value(70.0);
92  SettupPass1();
93 }
double chop_ok_split
Definition: wordrec.h:156
ParamsModel & getParamsModel()
LanguageModel * language_model_
Definition: wordrec.h:410
void SetPass(PassEnum pass)
Definition: params_model.h:72

◆ set_pass2()

void tesseract::Wordrec::set_pass2 ( )

Definition at line 101 of file tface.cpp.

◆ try_point_pairs()

void tesseract::Wordrec::try_point_pairs ( EDGEPT *  points[MAX_NUM_POINTS],
inT16  num_points,
SeamQueue *  seam_queue,
SeamPile *  seam_pile,
SEAM **  seam,
TBLOB *  blob 
)

Definition at line 296 of file findseam.cpp.

301  {
302  inT16 x;
303  inT16 y;
304  PRIORITY priority;
305 
306  for (x = 0; x < num_points; x++) {
307  for (y = x + 1; y < num_points; y++) {
308  if (points[y] &&
309  points[x]->WeightedDistance(*points[y], chop_x_y_weight) <
311  points[x] != points[y]->next && points[y] != points[x]->next &&
312  !is_exterior_point(points[x], points[y]) &&
313  !is_exterior_point(points[y], points[x])) {
314  SPLIT split(points[x], points[y]);
315  priority = partial_split_priority(&split);
316 
317  choose_best_seam(seam_queue, &split, priority, seam, blob, seam_pile);
318  }
319  }
320  }
321 }
void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
Definition: findseam.cpp:102
int16_t inT16
Definition: host.h:36
float PRIORITY
Definition: seam.h:42
#define is_exterior_point(edge, point)
Definition: outlines.h:97
Definition: split.h:37
#define partial_split_priority(split)
Definition: gradechop.h:46

◆ try_vertical_splits()

void tesseract::Wordrec::try_vertical_splits ( EDGEPT *  points[MAX_NUM_POINTS],
inT16  num_points,
EDGEPT_CLIST *  new_points,
SeamQueue *  seam_queue,
SeamPile *  seam_pile,
SEAM **  seam,
TBLOB *  blob 
)

Definition at line 334 of file findseam.cpp.

340  {
341  EDGEPT *vertical_point = NULL;
342  inT16 x;
343  PRIORITY priority;
344  TESSLINE *outline;
345 
346  for (x = 0; x < num_points; x++) {
347  vertical_point = NULL;
348  for (outline = blob->outlines; outline; outline = outline->next) {
349  vertical_projection_point(points[x], outline->loop,
350  &vertical_point, new_points);
351  }
352 
353  if (vertical_point && points[x] != vertical_point->next &&
354  vertical_point != points[x]->next &&
355  points[x]->WeightedDistance(*vertical_point, chop_x_y_weight) <
357  SPLIT split(points[x], vertical_point);
358  priority = partial_split_priority(&split);
359  choose_best_seam(seam_queue, &split, priority, seam, blob, seam_pile);
360  }
361  }
362 }
TESSLINE * next
Definition: blobs.h:258
TESSLINE * outlines
Definition: blobs.h:377
void choose_best_seam(SeamQueue *seam_queue, const SPLIT *split, PRIORITY priority, SEAM **seam_result, TBLOB *blob, SeamPile *seam_pile)
Definition: findseam.cpp:102
int16_t inT16
Definition: host.h:36
float PRIORITY
Definition: seam.h:42
int WeightedDistance(const EDGEPT &other, int x_factor) const
Definition: blobs.h:99
EDGEPT * loop
Definition: blobs.h:257
EDGEPT * next
Definition: blobs.h:169
Definition: blobs.h:76
void vertical_projection_point(EDGEPT *split_point, EDGEPT *target_point, EDGEPT **best_point, EDGEPT_CLIST *new_points)
Definition: chop.cpp:274
Definition: split.h:37
#define partial_split_priority(split)
Definition: gradechop.h:46

◆ UpdateSegSearchNodes()

void tesseract::Wordrec::UpdateSegSearchNodes ( float  rating_cert_scale,
int  starting_col,
GenericVector< SegSearchPending > *  pending,
WERD_RES *  word_res,
LMPainPoints *  pain_points,
BestChoiceBundle *  best_choice_bundle,
BlamerBundle *  blamer_bundle 
)
protected

Definition at line 194 of file segsearch.cpp.

201  {
202  MATRIX *ratings = word_res->ratings;
203  ASSERT_HOST(ratings->dimension() == pending->size());
204  ASSERT_HOST(ratings->dimension() == best_choice_bundle->beam.size());
205  for (int col = starting_col; col < ratings->dimension(); ++col) {
206  if (!(*pending)[col].WorkToDo()) continue;
207  int first_row = col;
208  int last_row = MIN(ratings->dimension() - 1,
209  col + ratings->bandwidth() - 1);
210  if ((*pending)[col].SingleRow() >= 0) {
211  first_row = last_row = (*pending)[col].SingleRow();
212  }
213  if (segsearch_debug_level > 0) {
214  tprintf("\n\nUpdateSegSearchNodes: col=%d, rows=[%d,%d], alljust=%d\n",
215  col, first_row, last_row,
216  (*pending)[col].IsRowJustClassified(MAX_INT32));
217  }
218  // Iterate over the pending list for this column.
219  for (int row = first_row; row <= last_row; ++row) {
220  // Update language model state of this child+parent pair.
221  BLOB_CHOICE_LIST *current_node = ratings->get(col, row);
222  LanguageModelState *parent_node =
223  col == 0 ? NULL : best_choice_bundle->beam[col - 1];
224  if (current_node != NULL &&
225  language_model_->UpdateState((*pending)[col].IsRowJustClassified(row),
226  col, row, current_node, parent_node,
227  pain_points, word_res,
228  best_choice_bundle, blamer_bundle) &&
229  row + 1 < ratings->dimension()) {
230  // Since the language model state of this entry changed, process all
231  // the child column.
232  (*pending)[row + 1].RevisitWholeColumn();
233  if (segsearch_debug_level > 0) {
234  tprintf("Added child col=%d to pending\n", row + 1);
235  }
236  } // end if UpdateState.
237  } // end for row.
238  } // end for col.
239  if (best_choice_bundle->best_vse != NULL) {
240  ASSERT_HOST(word_res->StatesAllValid());
241  if (best_choice_bundle->best_vse->updated) {
242  pain_points->GenerateFromPath(rating_cert_scale,
243  best_choice_bundle->best_vse, word_res);
244  if (!best_choice_bundle->fixpt.empty()) {
245  pain_points->GenerateFromAmbigs(best_choice_bundle->fixpt,
246  best_choice_bundle->best_vse, word_res);
247  }
248  }
249  }
250  // The segsearch is completed. Reset all updated flags on all VSEs and reset
251  // all pendings.
252  for (int col = 0; col < pending->size(); ++col) {
253  (*pending)[col].Clear();
254  ViterbiStateEntry_IT
255  vse_it(&best_choice_bundle->beam[col]->viterbi_state_entries);
256  for (vse_it.mark_cycle_pt(); !vse_it.cycled_list(); vse_it.forward()) {
257  vse_it.data()->updated = false;
258  }
259  }
260 }
#define MAX_INT32
Definition: host.h:62
T get(ICOORD pos) const
Definition: matrix.h:223
#define tprintf(...)
Definition: tprintf.h:31
int size() const
Definition: genericvector.h:72
#define ASSERT_HOST(x)
Definition: errcode.h:84
MATRIX * ratings
Definition: pageres.h:215
bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
int bandwidth() const
Definition: matrix.h:523
int dimension() const
Definition: matrix.h:521
#define MIN(x, y)
Definition: ndminx.h:28
Definition: matrix.h:563
int segsearch_debug_level
Definition: wordrec.h:169
LanguageModel * language_model_
Definition: wordrec.h:410

◆ vertical_projection_point()

void tesseract::Wordrec::vertical_projection_point ( EDGEPT *  split_point,
EDGEPT *  target_point,
EDGEPT **  best_point,
EDGEPT_CLIST *  new_points 
)

Definition at line 274 of file chop.cpp.

276  {
277  EDGEPT *p; /* Iterator */
278  EDGEPT *this_edgept; /* Iterator */
279  EDGEPT_C_IT new_point_it(new_points);
280  int x = split_point->pos.x; /* X value of vertical */
281  int best_dist = LARGE_DISTANCE;/* Best point found */
282 
283  if (*best_point != NULL)
284  best_dist = edgept_dist(split_point, *best_point);
285 
286  p = target_point;
287  /* Look at each edge point */
288  do {
289  if (((p->pos.x <= x && x <= p->next->pos.x) ||
290  (p->next->pos.x <= x && x <= p->pos.x)) &&
291  !same_point(split_point->pos, p->pos) &&
292  !same_point(split_point->pos, p->next->pos) &&
293  !p->IsChopPt() &&
294  (*best_point == NULL || !same_point((*best_point)->pos, p->pos))) {
295 
296  if (near_point(split_point, p, p->next, &this_edgept)) {
297  new_point_it.add_before_then_move(this_edgept);
298  }
299 
300  if (*best_point == NULL)
301  best_dist = edgept_dist (split_point, this_edgept);
302 
303  this_edgept =
304  pick_close_point(split_point, this_edgept, &best_dist);
305  if (this_edgept)
306  *best_point = this_edgept;
307  }
308 
309  p = p->next;
310  }
311  while (p != target_point);
312 }
bool IsChopPt() const
Definition: blobs.h:159
TPOINT pos
Definition: blobs.h:163
bool near_point(EDGEPT *point, EDGEPT *line_pt_0, EDGEPT *line_pt_1, EDGEPT **near_pt)
Definition: outlines.cpp:49
#define edgept_dist(p1, p2)
Definition: outlines.h:87
inT16 x
Definition: blobs.h:71
#define LARGE_DISTANCE
Definition: outlines.h:36
EDGEPT * next
Definition: blobs.h:169
#define same_point(p1, p2)
Definition: outlines.h:49
Definition: blobs.h:76
EDGEPT * pick_close_point(EDGEPT *critical_point, EDGEPT *vertical_point, int *best_dist)
Definition: chop.cpp:124

◆ WordSearch()

void tesseract::Wordrec::WordSearch ( WERD_RES *  word_res)

Definition at line 130 of file segsearch.cpp.

130  {
131  LMPainPoints pain_points(segsearch_max_pain_points,
136  BestChoiceBundle best_choice_bundle(word_res->ratings->dimension());
137  // Run Segmentation Search.
138  InitialSegSearch(word_res, &pain_points, &pending, &best_choice_bundle, NULL);
139  if (segsearch_debug_level > 0) {
140  tprintf("Ending ratings matrix%s:\n",
141  wordrec_enable_assoc ? " (with assoc)" : "");
142  word_res->ratings->print(getDict().getUnicharset());
143  }
144 }
Dict & getDict()
Definition: classify.h:65
void print(const UNICHARSET &unicharset) const
Definition: matrix.cpp:112
bool wordrec_enable_assoc
Definition: wordrec.h:130
#define tprintf(...)
Definition: tprintf.h:31
MATRIX * ratings
Definition: pageres.h:215
double segsearch_max_char_wh_ratio
Definition: wordrec.h:175
bool assume_fixed_pitch_char_segment
Definition: wordrec.h:161
int segsearch_max_pain_points
Definition: wordrec.h:171
int dimension() const
Definition: matrix.h:521
int segsearch_debug_level
Definition: wordrec.h:169
void InitialSegSearch(WERD_RES *word_res, LMPainPoints *pain_points, GenericVector< SegSearchPending > *pending, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
Definition: segsearch.cpp:150

Member Data Documentation

◆ assume_fixed_pitch_char_segment

bool tesseract::Wordrec::assume_fixed_pitch_char_segment = FALSE

"include fixed-pitch heuristics in char segmentation"

Definition at line 161 of file wordrec.h.

◆ blame_reasons_

GenericVector<int> tesseract::Wordrec::blame_reasons_

Definition at line 417 of file wordrec.h.

◆ chop_center_knob

double tesseract::Wordrec::chop_center_knob = 0.15

"Split center adjustment"

Definition at line 151 of file wordrec.h.

◆ chop_centered_maxwidth

int tesseract::Wordrec::chop_centered_maxwidth = 90

"Width of (smaller) chopped blobs " "above which we don't care that a chop is not near the center."

Definition at line 153 of file wordrec.h.

◆ chop_debug

int tesseract::Wordrec::chop_debug = 0

"Chop debug"

Definition at line 139 of file wordrec.h.

◆ chop_enable

bool tesseract::Wordrec::chop_enable = 1

"Chop enable"

Definition at line 140 of file wordrec.h.

◆ chop_good_split

double tesseract::Wordrec::chop_good_split = 50.0

"Good split limit"

Definition at line 157 of file wordrec.h.

◆ chop_inside_angle

int tesseract::Wordrec::chop_inside_angle = -50

"Min Inside Angle Bend"

Definition at line 147 of file wordrec.h.

◆ chop_min_outline_area

int tesseract::Wordrec::chop_min_outline_area = 2000

"Min Outline Area"

Definition at line 148 of file wordrec.h.

◆ chop_min_outline_points

int tesseract::Wordrec::chop_min_outline_points = 6

"Min Number of Points on Outline"

Definition at line 144 of file wordrec.h.

◆ chop_new_seam_pile

bool tesseract::Wordrec::chop_new_seam_pile = 1

"Use new seam_pile"

Definition at line 146 of file wordrec.h.

◆ chop_ok_split

double tesseract::Wordrec::chop_ok_split = 100.0

"OK split limit"

Definition at line 156 of file wordrec.h.

◆ chop_overlap_knob

double tesseract::Wordrec::chop_overlap_knob = 0.9

"Split overlap adjustment"

Definition at line 150 of file wordrec.h.

◆ chop_same_distance

int tesseract::Wordrec::chop_same_distance = 2

"Same distance"

Definition at line 143 of file wordrec.h.

◆ chop_seam_pile_size

int tesseract::Wordrec::chop_seam_pile_size = 150

"Max number of seams in seam_pile"

Definition at line 145 of file wordrec.h.

◆ chop_sharpness_knob

double tesseract::Wordrec::chop_sharpness_knob = 0.06

"Split sharpness adjustment"

Definition at line 154 of file wordrec.h.

◆ chop_split_dist_knob

double tesseract::Wordrec::chop_split_dist_knob = 0.5

"Split length adjustment"

Definition at line 149 of file wordrec.h.

◆ chop_split_length

int tesseract::Wordrec::chop_split_length = 10000

"Split Length"

Definition at line 142 of file wordrec.h.

◆ chop_vertical_creep

bool tesseract::Wordrec::chop_vertical_creep = 0

"Vertical creep"

Definition at line 141 of file wordrec.h.

◆ chop_width_change_knob

double tesseract::Wordrec::chop_width_change_knob = 5.0

"Width change adjustment"

Definition at line 155 of file wordrec.h.

◆ chop_x_y_weight

int tesseract::Wordrec::chop_x_y_weight = 3

"X / Y length weight"

Definition at line 158 of file wordrec.h.

◆ fill_lattice_

void(Wordrec::* tesseract::Wordrec::fill_lattice_) (const MATRIX &ratings, const WERD_CHOICE_LIST &best_choices, const UNICHARSET &unicharset, BlamerBundle *blamer_bundle)

Definition at line 419 of file wordrec.h.

◆ force_word_assoc

bool tesseract::Wordrec::force_word_assoc = FALSE

"force associator to run regardless of what enable_assoc is." "This is used for CJK where component grouping is necessary."

Definition at line 133 of file wordrec.h.

◆ fragments_guide_chopper

bool tesseract::Wordrec::fragments_guide_chopper = FALSE

"Use information from fragments to guide chopping process"

Definition at line 136 of file wordrec.h.

◆ language_model_

LanguageModel* tesseract::Wordrec::language_model_

Definition at line 410 of file wordrec.h.

◆ merge_fragments_in_matrix

bool tesseract::Wordrec::merge_fragments_in_matrix = TRUE

"Merge the fragments in the ratings matrix and delete them " "after merging"

Definition at line 128 of file wordrec.h.

◆ pass2_ok_split

PRIORITY tesseract::Wordrec::pass2_ok_split

Definition at line 411 of file wordrec.h.

◆ prev_word_best_choice_

WERD_CHOICE* tesseract::Wordrec::prev_word_best_choice_

Definition at line 415 of file wordrec.h.

◆ repair_unchopped_blobs

int tesseract::Wordrec::repair_unchopped_blobs = 1

"Fix blobs that aren't chopped"

Definition at line 137 of file wordrec.h.

◆ save_alt_choices

bool tesseract::Wordrec::save_alt_choices = true

"Save alternative paths found during chopping " "and segmentation search"

Definition at line 178 of file wordrec.h.

◆ segment_adjust_debug

int tesseract::Wordrec::segment_adjust_debug = 0

"Segmentation adjustment debug"

Definition at line 159 of file wordrec.h.

◆ segsearch_debug_level

int tesseract::Wordrec::segsearch_debug_level = 0

"SegSearch debug level"

Definition at line 169 of file wordrec.h.

◆ segsearch_max_char_wh_ratio

double tesseract::Wordrec::segsearch_max_char_wh_ratio = 2.0

"Maximum character width-to-height ratio"

Definition at line 175 of file wordrec.h.

◆ segsearch_max_futile_classifications

int tesseract::Wordrec::segsearch_max_futile_classifications = 10

"Maximum number of pain point classifications per word."

Definition at line 173 of file wordrec.h.

◆ segsearch_max_pain_points

int tesseract::Wordrec::segsearch_max_pain_points = 2000

"Maximum number of pain points stored in the queue"

Definition at line 171 of file wordrec.h.

◆ tessedit_certainty_threshold

double tesseract::Wordrec::tessedit_certainty_threshold = -2.25

"Good blob limit"

Definition at line 138 of file wordrec.h.

◆ wordrec_debug_blamer

bool tesseract::Wordrec::wordrec_debug_blamer = false

"Print blamer debug messages"

Definition at line 167 of file wordrec.h.

◆ wordrec_debug_level

int tesseract::Wordrec::wordrec_debug_level = 0

"Debug level for wordrec"

Definition at line 162 of file wordrec.h.

◆ wordrec_enable_assoc

bool tesseract::Wordrec::wordrec_enable_assoc = TRUE

"Associator Enable"

Definition at line 130 of file wordrec.h.

◆ wordrec_max_join_chunks

int tesseract::Wordrec::wordrec_max_join_chunks = 4

"Max number of broken pieces to associate"

Definition at line 164 of file wordrec.h.

◆ wordrec_no_block

bool tesseract::Wordrec::wordrec_no_block = FALSE

"Don't output block information"

Definition at line 129 of file wordrec.h.

◆ wordrec_run_blamer

bool tesseract::Wordrec::wordrec_run_blamer = false

"Try to set the blame for errors"

Definition at line 168 of file wordrec.h.

◆ wordrec_skip_no_truth_words

bool tesseract::Wordrec::wordrec_skip_no_truth_words = false

"Only run OCR for words that had truth recorded in BlamerBundle"

Definition at line 166 of file wordrec.h.

◆ wordrec_worst_state

double tesseract::Wordrec::wordrec_worst_state = 1

"Worst segmentation state"

Definition at line 134 of file wordrec.h.


The documentation for this class was generated from the following files: