tesseract  4.00.00dev
tesseract::TessdataManager Class Reference

#include <tessdatamanager.h>

Public Member Functions

 TessdataManager ()
 
 TessdataManager (FileReader reader)
 
 ~TessdataManager ()
 
bool swap () const
 
bool is_loaded () const
 
bool Init (const char *data_file_name)
 
bool LoadMemBuffer (const char *name, const char *data, int size)
 
bool SaveFile (const STRING &filename, FileWriter writer) const
 
void Serialize (GenericVector< char > *data) const
 
void Clear ()
 
void Directory () const
 
bool GetComponent (TessdataType type, TFile *fp)
 
bool IsBaseAvailable () const
 
bool IsLSTMAvailable () const
 
const STRING & GetDataFileName () const
 
bool CombineDataFiles (const char *language_data_path_prefix, const char *output_filename)
 
bool OverwriteComponents (const char *new_traineddata_filename, char **component_filenames, int num_new_components)
 
bool ExtractToFile (const char *filename)
 

Static Public Member Functions

static bool TessdataTypeFromFileSuffix (const char *suffix, TessdataType *type)
 
static bool TessdataTypeFromFileName (const char *filename, TessdataType *type)
 

Detailed Description

Definition at line 121 of file tessdatamanager.h.

Constructor & Destructor Documentation

◆ TessdataManager() [1/2]

tesseract::TessdataManager::TessdataManager ( )
inline

Definition at line 123 of file tessdatamanager.h.

123 : reader_(nullptr), is_loaded_(false), swap_(false) {}

◆ TessdataManager() [2/2]

tesseract::TessdataManager::TessdataManager ( FileReader  reader)
inline explicit

Definition at line 124 of file tessdatamanager.h.

125  : reader_(reader), is_loaded_(false), swap_(false) {}

◆ ~TessdataManager()

tesseract::TessdataManager::~TessdataManager ( )
inline

Definition at line 126 of file tessdatamanager.h.

126 {}

Member Function Documentation

◆ Clear()

void tesseract::TessdataManager::Clear ( )

Definition at line 117 of file tessdatamanager.cpp.

117  {
118  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
119  entries_[i].clear();
120  }
121  is_loaded_ = false;
122 }

◆ CombineDataFiles()

bool tesseract::TessdataManager::CombineDataFiles ( const char *  language_data_path_prefix,
const char *  output_filename 
)

Reads all the standard tesseract config and data files for a language at the given path and bundles them up into one binary data file. Returns true if the combined traineddata file was successfully written.

Definition at line 146 of file tessdatamanager.cpp.

148  {
149  // Load individual tessdata components from files.
150  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
151  TessdataType type;
152  ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type));
153  STRING filename = language_data_path_prefix;
154  filename += kTessdataFileSuffixes[i];
155  FILE *fp = fopen(filename.string(), "rb");
156  if (fp != nullptr) {
157  fclose(fp);
158  if (!LoadDataFromFile(filename, &entries_[type])) {
159  tprintf("Load of file %s failed!\n", filename.string());
160  return false;
161  }
162  }
163  }
164  is_loaded_ = true;
165 
166  // Make sure that the required components are present.
167  if (!IsBaseAvailable() && !IsLSTMAvailable()) {
168  tprintf(
169  "Error: traineddata file must contain at least (a unicharset file"
170  "and inttemp) OR an lstm file.\n");
171  return false;
172  }
173  // Write updated data to the output traineddata file.
174  return SaveFile(output_filename, nullptr);
175 }
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool SaveFile(const STRING &filename, FileWriter writer) const
Definition: strngs.h:45
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type)
const char * filename
Definition: ioapi.h:38

◆ Directory()

void tesseract::TessdataManager::Directory ( ) const

Definition at line 125 of file tessdatamanager.cpp.

125  {
126  int offset = TESSDATA_NUM_ENTRIES * sizeof(inT64);
127  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
128  if (!entries_[i].empty()) {
129  tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
130  entries_[i].size(), offset);
131  offset += entries_[i].size();
132  }
133  }
134 }
int64_t inT64
Definition: host.h:40
voidpf void uLong size
Definition: ioapi.h:39
#define tprintf(...)
Definition: tprintf.h:31
voidpf uLong offset
Definition: ioapi.h:42
int size() const
Definition: genericvector.h:72

◆ ExtractToFile()

bool tesseract::TessdataManager::ExtractToFile ( const char *  filename)

Extracts tessdata component implied by the name of the input file from the combined traineddata loaded into TessdataManager. Writes the extracted component to the file indicated by the file name. E.g. if the filename given is somepath/somelang.unicharset, unicharset will be extracted from the data loaded into the TessdataManager and will be written to somepath/somelang.unicharset.

Returns
true if the component was successfully extracted, false if the component was not present in the traineddata loaded into TessdataManager.

Definition at line 196 of file tessdatamanager.cpp.

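196  {
197  TessdataType type = TESSDATA_NUM_ENTRIES;
198  ASSERT_HOST(
199  tesseract::TessdataManager::TessdataTypeFromFileName(filename, &type));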
200  if (entries_[type].empty()) return false;
201  return SaveDataToFile(entries_[type], filename);
202 }
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type)
bool SaveDataToFile(const GenericVector< char > &data, const STRING &filename)
#define ASSERT_HOST(x)
Definition: errcode.h:84
const char * filename
Definition: ioapi.h:38

◆ GetComponent()

bool tesseract::TessdataManager::GetComponent ( TessdataType  type,
TFile *  fp 
)

Definition at line 138 of file tessdatamanager.cpp.

138  {
139  if (!is_loaded_ && !Init(data_file_name_.string())) return false;
140  if (entries_[type].empty()) return false;
141  fp->Open(&entries_[type][0], entries_[type].size());
142  fp->set_swap(swap_);
143  return true;
144 }
voidpf void uLong size
Definition: ioapi.h:39
const char * string() const
Definition: strngs.cpp:198
bool Init(const char *data_file_name)

◆ GetDataFileName()

const STRING& tesseract::TessdataManager::GetDataFileName ( ) const
inline

Definition at line 164 of file tessdatamanager.h.

164 { return data_file_name_; }

◆ Init()

bool tesseract::TessdataManager::Init ( const char *  data_file_name)

Opens and reads the given data file right now.

Returns
true on success.

Definition at line 36 of file tessdatamanager.cpp.

36  {
37  GenericVector<char> data;
38  if (reader_ == nullptr) {
39  if (!LoadDataFromFile(data_file_name, &data)) return false;
40  } else {
41  if (!(*reader_)(data_file_name, &data)) return false;
42  }
43  return LoadMemBuffer(data_file_name, &data[0], data.size());
44 }
bool LoadMemBuffer(const char *name, const char *data, int size)
int size() const
Definition: genericvector.h:72
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)

◆ is_loaded()

bool tesseract::TessdataManager::is_loaded ( ) const
inline

Definition at line 129 of file tessdatamanager.h.

129 { return is_loaded_; }

◆ IsBaseAvailable()

bool tesseract::TessdataManager::IsBaseAvailable ( ) const
inline

Definition at line 155 of file tessdatamanager.h.

155  {
156  return !entries_[TESSDATA_UNICHARSET].empty() &&
157  !entries_[TESSDATA_INTTEMP].empty();
158  }
bool empty() const
Definition: genericvector.h:90

◆ IsLSTMAvailable()

bool tesseract::TessdataManager::IsLSTMAvailable ( ) const
inline

Definition at line 161 of file tessdatamanager.h.

161 { return !entries_[TESSDATA_LSTM].empty(); }
bool empty() const
Definition: genericvector.h:90

◆ LoadMemBuffer()

bool tesseract::TessdataManager::LoadMemBuffer ( const char *  name,
const char *  data,
int  size 
)

Definition at line 47 of file tessdatamanager.cpp.

48  {
49  data_file_name_ = name;
50  TFile fp;
51  fp.Open(data, size);
52  inT32 num_entries = TESSDATA_NUM_ENTRIES;
53  if (fp.FRead(&num_entries, sizeof(num_entries), 1) != 1) return false;
54  swap_ = num_entries > kMaxNumTessdataEntries || num_entries < 0;
55  fp.set_swap(swap_);
56  if (swap_) ReverseN(&num_entries, sizeof(num_entries));
57  if (num_entries > kMaxNumTessdataEntries || num_entries < 0) return false;
58  GenericVector<inT64> offset_table;
59  offset_table.resize_no_init(num_entries);
60  if (fp.FReadEndian(&offset_table[0], sizeof(offset_table[0]), num_entries) !=
61  num_entries)
62  return false;
63  for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
64  if (offset_table[i] >= 0) {
65  inT64 entry_size = size - offset_table[i];
66  int j = i + 1;
67  while (j < num_entries && offset_table[j] == -1) ++j;
68  if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
69  entries_[i].resize_no_init(entry_size);
70  if (fp.FRead(&entries_[i][0], 1, entry_size) != entry_size) return false;
71  }
72  }
73  is_loaded_ = true;
74  return true;
75 }
int64_t inT64
Definition: host.h:40
int32_t inT32
Definition: host.h:38
voidpf void uLong size
Definition: ioapi.h:39
void resize_no_init(int size)
Definition: genericvector.h:66
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:184

◆ OverwriteComponents()

bool tesseract::TessdataManager::OverwriteComponents ( const char *  new_traineddata_filename,
char **  component_filenames,
int  num_new_components 
)

Gets the individual components from the data_file_ with which the class was initialized. Overwrites the components specified by component_filenames. Writes the updated traineddata file to new_traineddata_filename.

Definition at line 177 of file tessdatamanager.cpp.

180  {
181  // Open the files with the new components.
182  for (int i = 0; i < num_new_components; ++i) {
183  TessdataType type;
184  if (TessdataTypeFromFileName(component_filenames[i], &type)) {
185  if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
186  tprintf("Failed to read component file:%s\n", component_filenames[i]);
187  return false;
188  }
189  }
190  }
191 
192  // Write updated data to the output traineddata file.
193  return SaveFile(new_traineddata_filename, nullptr);
194 }
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type)
#define tprintf(...)
Definition: tprintf.h:31
bool SaveFile(const STRING &filename, FileWriter writer) const
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)

◆ SaveFile()

bool tesseract::TessdataManager::SaveFile ( const STRING &  filename,
FileWriter  writer 
) const

Definition at line 78 of file tessdatamanager.cpp.

79  {
80  ASSERT_HOST(is_loaded_);
81  GenericVector<char> data;
82  Serialize(&data);
83  if (writer == nullptr)
84  return SaveDataToFile(data, filename);
85  else
86  return (*writer)(data, filename);
87 }
bool SaveDataToFile(const GenericVector< char > &data, const STRING &filename)
#define ASSERT_HOST(x)
Definition: errcode.h:84
void Serialize(GenericVector< char > *data) const
const char * filename
Definition: ioapi.h:38

◆ Serialize()

void tesseract::TessdataManager::Serialize ( GenericVector< char > *  data) const

Definition at line 90 of file tessdatamanager.cpp.

90  {
91  ASSERT_HOST(is_loaded_);
92  // Compute the offset_table and total size.
93  inT64 offset_table[TESSDATA_NUM_ENTRIES];
94  inT64 offset = sizeof(inT32) + sizeof(offset_table);
95  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
96  if (entries_[i].empty()) {
97  offset_table[i] = -1;
98  } else {
99  offset_table[i] = offset;
100  offset += entries_[i].size();
101  }
102  }
103  data->init_to_size(offset, 0);
104  inT32 num_entries = TESSDATA_NUM_ENTRIES;
105  TFile fp;
106  fp.OpenWrite(data);
107  fp.FWrite(&num_entries, sizeof(num_entries), 1);
108  fp.FWrite(offset_table, sizeof(offset_table), 1);
109  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
110  if (!entries_[i].empty()) {
111  fp.FWrite(&entries_[i][0], entries_[i].size(), 1);
112  }
113  }
114 }
int64_t inT64
Definition: host.h:40
int32_t inT32
Definition: host.h:38
void init_to_size(int size, T t)
voidpf void uLong size
Definition: ioapi.h:39
voidpf uLong offset
Definition: ioapi.h:42
int size() const
Definition: genericvector.h:72
#define ASSERT_HOST(x)
Definition: errcode.h:84

◆ swap()

bool tesseract::TessdataManager::swap ( ) const
inline

Definition at line 128 of file tessdatamanager.h.

128 { return swap_; }

◆ TessdataTypeFromFileName()

bool tesseract::TessdataManager::TessdataTypeFromFileName ( const char *  filename,
TessdataType *  type 
)
static

Tries to determine the tessdata component type from the file suffix (extension) of filename. Returns true on success.

Definition at line 217 of file tessdatamanager.cpp.

218  {
219  // Get the file suffix (extension)
220  const char *suffix = strrchr(filename, '.');
221  if (suffix == nullptr || *(++suffix) == '\0') return false;
222  return TessdataTypeFromFileSuffix(suffix, type);
223 }
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type)
const char * filename
Definition: ioapi.h:38

◆ TessdataTypeFromFileSuffix()

bool tesseract::TessdataManager::TessdataTypeFromFileSuffix ( const char *  suffix,
TessdataType *  type 
)
static

Fills type with the TessdataType of the tessdata component represented by the given file suffix. E.g. the suffix unicharset (as in tessdata/eng.unicharset) -> TESSDATA_UNICHARSET.

Returns
true if the tessdata component type could be determined from the given file suffix.

Definition at line 204 of file tessdatamanager.cpp.

205  {
206  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
207  if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
208  *type = static_cast<TessdataType>(i);
209  return true;
210  }
211  }
212  tprintf("TessdataManager can't determine which tessdata"
213  " component is represented by %s\n", suffix);
214  return false;
215 }
#define tprintf(...)
Definition: tprintf.h:31

The documentation for this class was generated from the following files: