tesseract  4.00.00dev
tessdatamanager.cpp
Go to the documentation of this file.
1 // File: tessdatamanager.cpp
3 // Description: Functions to handle loading/combining tesseract data files.
4 // Author: Daria Antonova
5 // Created: Wed Jun 03 11:26:43 PST 2009
6 //
7 // (C) Copyright 2009, Google Inc.
8 // Licensed under the Apache License, Version 2.0 (the "License");
9 // you may not use this file except in compliance with the License.
10 // You may obtain a copy of the License at
11 // http://www.apache.org/licenses/LICENSE-2.0
12 // Unless required by applicable law or agreed to in writing, software
13 // distributed under the License is distributed on an "AS IS" BASIS,
14 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15 // See the License for the specific language governing permissions and
16 // limitations under the License.
17 //
19 
20 #ifdef _MSC_VER
21 #pragma warning(disable:4244) // Conversion warnings
22 #endif
23 
24 #include "tessdatamanager.h"
25 
26 #include <stdio.h>
27 
28 #include "helpers.h"
29 #include "serialis.h"
30 #include "strngs.h"
31 #include "tprintf.h"
32 #include "params.h"
33 
34 namespace tesseract {
35 
36 bool TessdataManager::Init(const char *data_file_name) {
38  if (reader_ == nullptr) {
39  if (!LoadDataFromFile(data_file_name, &data)) return false;
40  } else {
41  if (!(*reader_)(data_file_name, &data)) return false;
42  }
43  return LoadMemBuffer(data_file_name, &data[0], data.size());
44 }
45 
46 // Loads from the given memory buffer as if a file.
47 bool TessdataManager::LoadMemBuffer(const char *name, const char *data,
48  int size) {
49  data_file_name_ = name;
50  TFile fp;
51  fp.Open(data, size);
52  inT32 num_entries = TESSDATA_NUM_ENTRIES;
53  if (fp.FRead(&num_entries, sizeof(num_entries), 1) != 1) return false;
54  swap_ = num_entries > kMaxNumTessdataEntries || num_entries < 0;
55  fp.set_swap(swap_);
56  if (swap_) ReverseN(&num_entries, sizeof(num_entries));
57  if (num_entries > kMaxNumTessdataEntries || num_entries < 0) return false;
58  GenericVector<inT64> offset_table;
59  offset_table.resize_no_init(num_entries);
60  if (fp.FReadEndian(&offset_table[0], sizeof(offset_table[0]), num_entries) !=
61  num_entries)
62  return false;
63  for (int i = 0; i < num_entries && i < TESSDATA_NUM_ENTRIES; ++i) {
64  if (offset_table[i] >= 0) {
65  inT64 entry_size = size - offset_table[i];
66  int j = i + 1;
67  while (j < num_entries && offset_table[j] == -1) ++j;
68  if (j < num_entries) entry_size = offset_table[j] - offset_table[i];
69  entries_[i].resize_no_init(entry_size);
70  if (fp.FRead(&entries_[i][0], 1, entry_size) != entry_size) return false;
71  }
72  }
73  is_loaded_ = true;
74  return true;
75 }
76 
77 // Saves to the given filename.
79  FileWriter writer) const {
80  ASSERT_HOST(is_loaded_);
82  Serialize(&data);
83  if (writer == nullptr)
84  return SaveDataToFile(data, filename);
85  else
86  return (*writer)(data, filename);
87 }
88 
89 // Serializes to the given vector.
91  ASSERT_HOST(is_loaded_);
92  // Compute the offset_table and total size.
93  inT64 offset_table[TESSDATA_NUM_ENTRIES];
94  inT64 offset = sizeof(inT32) + sizeof(offset_table);
95  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
96  if (entries_[i].empty()) {
97  offset_table[i] = -1;
98  } else {
99  offset_table[i] = offset;
100  offset += entries_[i].size();
101  }
102  }
103  data->init_to_size(offset, 0);
104  inT32 num_entries = TESSDATA_NUM_ENTRIES;
105  TFile fp;
106  fp.OpenWrite(data);
107  fp.FWrite(&num_entries, sizeof(num_entries), 1);
108  fp.FWrite(offset_table, sizeof(offset_table), 1);
109  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
110  if (!entries_[i].empty()) {
111  fp.FWrite(&entries_[i][0], entries_[i].size(), 1);
112  }
113  }
114 }
115 
116 // Resets to the initial state, keeping the reader.
118  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
119  entries_[i].clear();
120  }
121  is_loaded_ = false;
122 }
123 
124 // Prints a directory of contents.
126  int offset = TESSDATA_NUM_ENTRIES * sizeof(inT64);
127  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
128  if (!entries_[i].empty()) {
129  tprintf("%d:%s:size=%d, offset=%d\n", i, kTessdataFileSuffixes[i],
130  entries_[i].size(), offset);
131  offset += entries_[i].size();
132  }
133  }
134 }
135 
136 // Opens the given TFile pointer to the given component type.
137 // Returns false in case of failure.
139  if (!is_loaded_ && !Init(data_file_name_.string())) return false;
140  if (entries_[type].empty()) return false;
141  fp->Open(&entries_[type][0], entries_[type].size());
142  fp->set_swap(swap_);
143  return true;
144 }
145 
147  const char *language_data_path_prefix,
148  const char *output_filename) {
149  // Load individual tessdata components from files.
150  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
151  TessdataType type;
152  ASSERT_HOST(TessdataTypeFromFileSuffix(kTessdataFileSuffixes[i], &type));
153  STRING filename = language_data_path_prefix;
154  filename += kTessdataFileSuffixes[i];
155  FILE *fp = fopen(filename.string(), "rb");
156  if (fp != nullptr) {
157  fclose(fp);
158  if (!LoadDataFromFile(filename, &entries_[type])) {
159  tprintf("Load of file %s failed!\n", filename.string());
160  return false;
161  }
162  }
163  }
164  is_loaded_ = true;
165 
166  // Make sure that the required components are present.
167  if (!IsBaseAvailable() && !IsLSTMAvailable()) {
168  tprintf(
169  "Error: traineddata file must contain at least (a unicharset file"
170  "and inttemp) OR an lstm file.\n");
171  return false;
172  }
173  // Write updated data to the output traineddata file.
174  return SaveFile(output_filename, nullptr);
175 }
176 
178  const char *new_traineddata_filename,
179  char **component_filenames,
180  int num_new_components) {
181  // Open the files with the new components.
182  for (int i = 0; i < num_new_components; ++i) {
183  TessdataType type;
184  if (TessdataTypeFromFileName(component_filenames[i], &type)) {
185  if (!LoadDataFromFile(component_filenames[i], &entries_[type])) {
186  tprintf("Failed to read component file:%s\n", component_filenames[i]);
187  return false;
188  }
189  }
190  }
191 
192  // Write updated data to the output traineddata file.
193  return SaveFile(new_traineddata_filename, nullptr);
194 }
195 
198  ASSERT_HOST(
200  if (entries_[type].empty()) return false;
201  return SaveDataToFile(entries_[type], filename);
202 }
203 
205  TessdataType *type) {
206  for (int i = 0; i < TESSDATA_NUM_ENTRIES; ++i) {
207  if (strcmp(kTessdataFileSuffixes[i], suffix) == 0) {
208  *type = static_cast<TessdataType>(i);
209  return true;
210  }
211  }
212  tprintf("TessdataManager can't determine which tessdata"
213  " component is represented by %s\n", suffix);
214  return false;
215 }
216 
218  TessdataType *type) {
219  // Get the file suffix (extension)
220  const char *suffix = strrchr(filename, '.');
221  if (suffix == nullptr || *(++suffix) == '\0') return false;
222  return TessdataTypeFromFileSuffix(suffix, type);
223 }
224 
225 } // namespace tesseract
bool OverwriteComponents(const char *new_traineddata_filename, char **component_filenames, int num_new_components)
static bool TessdataTypeFromFileName(const char *filename, TessdataType *type)
int64_t inT64
Definition: host.h:40
int32_t inT32
Definition: host.h:38
bool SaveDataToFile(const GenericVector< char > &data, const STRING &filename)
void init_to_size(int size, T t)
bool LoadMemBuffer(const char *name, const char *data, int size)
voidpf void uLong size
Definition: ioapi.h:39
#define tprintf(...)
Definition: tprintf.h:31
const char * string() const
Definition: strngs.cpp:198
void resize_no_init(int size)
Definition: genericvector.h:66
voidpf uLong offset
Definition: ioapi.h:42
int FReadEndian(void *buffer, int size, int count)
Definition: serialis.cpp:97
bool ExtractToFile(const char *filename)
int size() const
Definition: genericvector.h:72
void OpenWrite(GenericVector< char > *data)
Definition: serialis.cpp:125
#define ASSERT_HOST(x)
Definition: errcode.h:84
bool SaveFile(const STRING &filename, FileWriter writer) const
Definition: strngs.h:45
bool GetComponent(TessdataType type, TFile *fp)
bool LoadDataFromFile(const char *filename, GenericVector< char > *data)
void set_swap(bool value)
Definition: serialis.h:65
int FWrite(const void *buffer, int size, int count)
Definition: serialis.cpp:148
void Serialize(GenericVector< char > *data) const
bool(* FileWriter)(const GenericVector< char > &data, const STRING &filename)
static bool TessdataTypeFromFileSuffix(const char *suffix, TessdataType *type)
const char * filename
Definition: ioapi.h:38
bool Open(const STRING &filename, FileReader reader)
Definition: serialis.cpp:38
bool CombineDataFiles(const char *language_data_path_prefix, const char *output_filename)
bool Init(const char *data_file_name)
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:184
int FRead(void *buffer, int size, int count)
Definition: serialis.cpp:108