tesseract  4.00.00dev
blobclass.cpp
Go to the documentation of this file.
1 /******************************************************************************
2  ** Filename: blobclass.cpp
3  ** Purpose: High level blob classification and training routines.
4  ** Author: Dan Johnson
5  ** History: 7/21/89, DSJ, Created.
6  **
7  ** (c) Copyright Hewlett-Packard Company, 1988.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  ******************************************************************************/
18 
22 #include "blobclass.h"
23 
24 #include <stdio.h>
25 
26 #include "classify.h"
27 #include "efio.h"
28 #include "featdefs.h"
29 #include "mf.h"
30 #include "normfeat.h"
31 
// Placeholder font name. It is the default value of classify_font_name, and
// ExtractFontName() compares against it to decide whether the font name
// should be derived from the training filename instead.
32 static const char kUnknownFontName[] = "UnknownFont";
33 
// Parameter holding the font name to use in training; defaults to
// kUnknownFontName.
34 STRING_VAR(classify_font_name, kUnknownFontName,
35  "Default font name to be used in training");
36 
37 namespace tesseract {
41 // Finds the name of the training font and returns it in fontname, by cutting
42 // it out based on the expectation that the filename is of the form:
43 // /path/to/dir/[lang].[fontname].exp[num]
44 // The [lang], [fontname] and [num] fields should not have '.' characters.
45 // If the global parameter classify_font_name is set, its value is used instead.
46 void ExtractFontName(const STRING& filename, STRING* fontname) {
47  *fontname = classify_font_name;
48  if (*fontname == kUnknownFontName) {
49  // filename is expected to be of the form [lang].[fontname].exp[num]
50  // The [lang], [fontname] and [num] fields should not have '.' characters.
51  const char *basename = strrchr(filename.string(), '/');
52  const char *firstdot = strchr(basename ? basename : filename.string(), '.');
53  const char *lastdot = strrchr(filename.string(), '.');
54  if (firstdot != lastdot && firstdot != NULL && lastdot != NULL) {
55  ++firstdot;
56  *fontname = firstdot;
57  fontname->truncate_at(lastdot - firstdot);
58  }
59  }
60 }
61 
62 /*---------------------------------------------------------------------------*/
63 // Extracts features from the given blob and saves them in the tr_file_data_
64 // member variable.
65 // fontname: Name of font that this blob was printed in.
66 // cn_denorm: Character normalization transformation to apply to the blob.
67 // fx_info: Character normalization parameters computed with cn_denorm.
68 // blob_text: Ground truth text for the blob.
69 void Classify::LearnBlob(const STRING& fontname, TBLOB* blob,
70  const DENORM& cn_denorm,
71  const INT_FX_RESULT_STRUCT& fx_info,
72  const char* blob_text) {
74  CharDesc->FeatureSets[0] = ExtractMicros(blob, cn_denorm);
75  CharDesc->FeatureSets[1] = ExtractCharNormFeatures(fx_info);
76  CharDesc->FeatureSets[2] = ExtractIntCNFeatures(*blob, fx_info);
77  CharDesc->FeatureSets[3] = ExtractIntGeoFeatures(*blob, fx_info);
78 
79  if (ValidCharDescription(feature_defs_, CharDesc)) {
80  // Label the features with a class name and font name.
81  tr_file_data_ += "\n";
82  tr_file_data_ += fontname;
83  tr_file_data_ += " ";
84  tr_file_data_ += blob_text;
85  tr_file_data_ += "\n";
86 
87  // write micro-features to file and clean up
88  WriteCharDescription(feature_defs_, CharDesc, &tr_file_data_);
89  } else {
90  tprintf("Blob learned was invalid!\n");
91  }
92  FreeCharDescription(CharDesc);
93 } // LearnBlob
94 
95 // Writes stored training data to a .tr file based on the given filename.
96 // Returns false on error.
98  STRING tr_filename = filename + ".tr";
99  FILE* fp = Efopen(tr_filename.string(), "wb");
100  size_t len = tr_file_data_.length();
101  bool result =
102  fwrite(&tr_file_data_[0], sizeof(tr_file_data_[0]), len, fp) == len;
103  fclose(fp);
104  tr_file_data_.truncate_at(0);
105  return result;
106 }
107 
108 } // namespace tesseract.
void WriteCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC CharDesc, STRING *str)
Definition: featdefs.cpp:193
void truncate_at(inT32 index)
Definition: strngs.cpp:269
bool ValidCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs, CHAR_DESC CharDesc)
Definition: featdefs.cpp:214
FEATURE_SET ExtractIntCNFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:230
FEATURE_DEFS_STRUCT feature_defs_
Definition: classify.h:506
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
inT32 length() const
Definition: strngs.cpp:193
void ExtractFontName(const STRING &filename, STRING *fontname)
Definition: blobclass.cpp:46
FEATURE_SET FeatureSets[NUM_FEATURE_TYPES]
Definition: featdefs.h:44
CHAR_DESC NewCharDescription(const FEATURE_DEFS_STRUCT &FeatureDefs)
Definition: featdefs.cpp:162
FEATURE_SET ExtractCharNormFeatures(const INT_FX_RESULT_STRUCT &fx_info)
Definition: normfeat.cpp:61
FEATURE_SET ExtractIntGeoFeatures(const TBLOB &blob, const INT_FX_RESULT_STRUCT &fx_info)
Definition: picofeat.cpp:262
FEATURE_SET ExtractMicros(TBLOB *Blob, const DENORM &cn_denorm)
Definition: mf.cpp:45
Definition: strngs.h:45
void FreeCharDescription(CHAR_DESC CharDesc)
Definition: featdefs.cpp:141
FILE * Efopen(const char *Name, const char *Mode)
Definition: efio.cpp:43
Definition: blobs.h:261
const char * filename
Definition: ioapi.h:38
void LearnBlob(const STRING &fontname, TBLOB *Blob, const DENORM &cn_denorm, const INT_FX_RESULT_STRUCT &fx_info, const char *blob_text)
Definition: blobclass.cpp:69
bool WriteTRFile(const STRING &filename)
Definition: blobclass.cpp:97
#define STRING_VAR(name, val, comment)
Definition: params.h:282
char * classify_font_name
Definition: blobclass.cpp:35