tesseract  4.00.00dev
strngs.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: strngs.c (Formerly strings.c)
3  * Description: STRING class functions.
4  * Author: Ray Smith
5  * Created: Fri Feb 15 09:13:30 GMT 1991
6  *
7  * (C) Copyright 1991, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include "strngs.h"
21 
22 #include <assert.h>
23 
24 #include "genericvector.h"
25 #include "helpers.h"
26 #include "serialis.h"
27 #include "tprintf.h"
28 
29 using tesseract::TFile;
30 
// Buffer size, terminating nul included, that can hold the decimal text of
// any 64-bit integer: a '-' sign followed by up to 20 digits.
const int kMaxIntSize = 22;
// Buffer size, terminating nul included, that can hold any double printed
// with %.8g; the longest form looks like -1.2345678e+999<nul>, i.e. 16 bytes.
const int kMaxDoubleSize = 16;
37 
38 /**********************************************************************
39  * STRING_HEADER provides metadata about the allocated buffer,
40  * including total capacity and how much used (strlen with '\0').
41  *
42  * The implementation hides this header at the start of the data
43  * buffer and appends the string on the end to keep sizeof(STRING)
44  * unchanged from earlier versions so serialization is not affected.
45  *
46  * The collection of MACROS provide different implementations depending
47  * on whether the string keeps track of its strlen or not so that this
48  * feature can be added in later when consumers don't modify the string
49  **********************************************************************/
50 
// Character capacity given to a STRING that is created empty.
const int kMinCapacity = 16;
53 
54 char* STRING::AllocData(int used, int capacity) {
55  data_ = (STRING_HEADER *)alloc_string(capacity + sizeof(STRING_HEADER));
56 
57  // header is the metadata for this memory block
58  STRING_HEADER* header = GetHeader();
59  header->capacity_ = capacity;
60  header->used_ = used;
61  return GetCStr();
62 }
63 
64 void STRING::DiscardData() {
65  free_string((char *)data_);
66 }
67 
68 // This is a private method; ensure FixHeader is called (or used_ is well defined)
69 // beforehand
70 char* STRING::ensure_cstr(inT32 min_capacity) {
71  STRING_HEADER* orig_header = GetHeader();
72  if (min_capacity <= orig_header->capacity_)
73  return ((char *)this->data_) + sizeof(STRING_HEADER);
74 
75  // if we are going to grow bigger, than double our existing
76  // size, but if that still is not big enough then keep the
77  // requested capacity
78  if (min_capacity < 2 * orig_header->capacity_)
79  min_capacity = 2 * orig_header->capacity_;
80 
81  int alloc = sizeof(STRING_HEADER) + min_capacity;
82  STRING_HEADER* new_header = (STRING_HEADER*)(alloc_string(alloc));
83 
84  memcpy(&new_header[1], GetCStr(), orig_header->used_);
85  new_header->capacity_ = min_capacity;
86  new_header->used_ = orig_header->used_;
87 
88  // free old memory, then rebind to new memory
89  DiscardData();
90  data_ = new_header;
91 
92  assert(InvariantOk());
93  return ((char *)data_) + sizeof(STRING_HEADER);
94 }
95 
96 // This is const, but is modifying a mutable field
97 // this way it can be used on const or non-const instances.
98 void STRING::FixHeader() const {
99  const STRING_HEADER* header = GetHeader();
100  if (header->used_ < 0)
101  header->used_ = strlen(GetCStr()) + 1;
102 }
103 
104 
106  // Empty STRINGs contain just the "\0".
107  memcpy(AllocData(1, kMinCapacity), "", 1);
108 }
109 
110 STRING::STRING(const STRING& str) {
111  str.FixHeader();
112  const STRING_HEADER* str_header = str.GetHeader();
113  int str_used = str_header->used_;
114  char *this_cstr = AllocData(str_used, str_used);
115  memcpy(this_cstr, str.GetCStr(), str_used);
116  assert(InvariantOk());
117 }
118 
119 STRING::STRING(const char* cstr) {
120  if (cstr == NULL) {
121  // Empty STRINGs contain just the "\0".
122  memcpy(AllocData(1, kMinCapacity), "", 1);
123  } else {
124  int len = strlen(cstr) + 1;
125  char* this_cstr = AllocData(len, len);
126  memcpy(this_cstr, cstr, len);
127  }
128  assert(InvariantOk());
129 }
130 
131 STRING::STRING(const char *data, int length) {
132  if (data == NULL) {
133  // Empty STRINGs contain just the "\0".
134  memcpy(AllocData(1, kMinCapacity), "", 1);
135  } else {
136  char* this_cstr = AllocData(length + 1, length + 1);
137  memcpy(this_cstr, data, length);
138  this_cstr[length] = '\0';
139  }
140 }
141 
143  DiscardData();
144 }
145 
146 // TODO(rays) Change all callers to use TFile and remove the old functions.
147 // Writes to the given file. Returns false in case of error.
148 bool STRING::Serialize(FILE* fp) const {
149  inT32 len = length();
150  if (fwrite(&len, sizeof(len), 1, fp) != 1) return false;
151  if (static_cast<int>(fwrite(GetCStr(), 1, len, fp)) != len) return false;
152  return true;
153 }
154 // Writes to the given file. Returns false in case of error.
155 bool STRING::Serialize(TFile* fp) const {
156  inT32 len = length();
157  if (fp->FWrite(&len, sizeof(len), 1) != 1) return false;
158  if (fp->FWrite(GetCStr(), 1, len) != len) return false;
159  return true;
160 }
161 // Reads from the given file. Returns false in case of error.
162 // If swap is true, assumes a big/little-endian swap is needed.
163 bool STRING::DeSerialize(bool swap, FILE* fp) {
164  inT32 len;
165  if (fread(&len, sizeof(len), 1, fp) != 1) return false;
166  if (swap)
167  ReverseN(&len, sizeof(len));
168  truncate_at(len);
169  if (static_cast<int>(fread(GetCStr(), 1, len, fp)) != len) return false;
170  return true;
171 }
172 // Reads from the given file. Returns false in case of error.
173 // If swap is true, assumes a big/little-endian swap is needed.
175  inT32 len;
176  if (fp->FReadEndian(&len, sizeof(len), 1) != 1) return false;
177  truncate_at(len);
178  if (fp->FRead(GetCStr(), 1, len) != len) return false;
179  return true;
180 }
181 
182 // As DeSerialize, but only seeks past the data - hence a static method.
184  inT32 len;
185  if (fp->FReadEndian(&len, sizeof(len), 1) != 1) return false;
186  return fp->FRead(NULL, 1, len) == len;
187 }
188 
189 BOOL8 STRING::contains(const char c) const {
190  return (c != '\0') && (strchr (GetCStr(), c) != NULL);
191 }
192 
194  FixHeader();
195  return GetHeader()->used_ - 1;
196 }
197 
198 const char* STRING::string() const {
199  const STRING_HEADER* header = GetHeader();
200  if (header->used_ == 0)
201  return NULL;
202 
203  // mark header length unreliable because tesseract might
204  // cast away the const and mutate the string directly.
205  header->used_ = -1;
206  return GetCStr();
207 }
208 
209 const char* STRING::c_str() const {
210  return string();
211 }
212 
213 /******
214  * The STRING_IS_PROTECTED interface adds additional support to migrate
215  * code that needs to modify the STRING in ways not otherwise supported
216  * without violating encapsulation.
217  *
218  * Also makes the [] operator return a const so it is immutable
219  */
220 #if STRING_IS_PROTECTED
221 const char& STRING::operator[](inT32 index) const {
222  return GetCStr()[index];
223 }
224 
225 void STRING::insert_range(inT32 index, const char* str, int len) {
226  // if index is outside current range, then also grow size of string
227  // to accmodate the requested range.
228  STRING_HEADER* this_header = GetHeader();
229  int used = this_header->used_;
230  if (index > used)
231  used = index;
232 
233  char* this_cstr = ensure_cstr(used + len + 1);
234  if (index < used) {
235  // move existing string from index to '\0' inclusive.
236  memmove(this_cstr + index + len,
237  this_cstr + index,
238  this_header->used_ - index);
239  } else if (len > 0) {
240  // We are going to overwrite previous null terminator, so write the new one.
241  this_cstr[this_header->used_ + len - 1] = '\0';
242 
243  // If the old header did not have the terminator,
244  // then we need to account for it now that we've added it.
245  // Otherwise it was already accounted for; we just moved it.
246  if (this_header->used_ == 0)
247  ++this_header->used_;
248  }
249 
250  // Write new string to index.
251  // The string is already terminated from the conditions above.
252  memcpy(this_cstr + index, str, len);
253  this_header->used_ += len;
254 
255  assert(InvariantOk());
256 }
257 
258 void STRING::erase_range(inT32 index, int len) {
259  char* this_cstr = GetCStr();
260  STRING_HEADER* this_header = GetHeader();
261 
262  memcpy(this_cstr+index, this_cstr+index+len,
263  this_header->used_ - index - len);
264  this_header->used_ -= len;
265  assert(InvariantOk());
266 }
267 
268 #else
270  ASSERT_HOST(index >= 0);
271  FixHeader();
272  char* this_cstr = ensure_cstr(index + 1);
273  this_cstr[index] = '\0';
274  GetHeader()->used_ = index + 1;
275  assert(InvariantOk());
276 }
277 
278 char& STRING::operator[](inT32 index) const {
279  // Code is casting away this const and mutating the string,
280  // so mark used_ as -1 to flag it unreliable.
281  GetHeader()->used_ = -1;
282  return ((char *)GetCStr())[index];
283 }
284 #endif
285 
286 void STRING::split(const char c, GenericVector<STRING> *splited) {
287  int start_index = 0;
288  int len = length();
289  for (int i = 0; i < len; i++) {
290  if ((*this)[i] == c) {
291  if (i != start_index) {
292  (*this)[i] = '\0';
293  splited->push_back(STRING(GetCStr() + start_index, i - start_index));
294  (*this)[i] = c;
295  }
296  start_index = i + 1;
297  }
298  }
299 
300  if (len != start_index) {
301  splited->push_back(STRING(GetCStr() + start_index, len - start_index));
302  }
303 }
304 
305 BOOL8 STRING::operator==(const STRING& str) const {
306  FixHeader();
307  str.FixHeader();
308  const STRING_HEADER* str_header = str.GetHeader();
309  const STRING_HEADER* this_header = GetHeader();
310  int this_used = this_header->used_;
311  int str_used = str_header->used_;
312 
313  return (this_used == str_used)
314  && (memcmp(GetCStr(), str.GetCStr(), this_used) == 0);
315 }
316 
317 BOOL8 STRING::operator!=(const STRING& str) const {
318  FixHeader();
319  str.FixHeader();
320  const STRING_HEADER* str_header = str.GetHeader();
321  const STRING_HEADER* this_header = GetHeader();
322  int this_used = this_header->used_;
323  int str_used = str_header->used_;
324 
325  return (this_used != str_used)
326  || (memcmp(GetCStr(), str.GetCStr(), this_used) != 0);
327 }
328 
329 BOOL8 STRING::operator!=(const char* cstr) const {
330  FixHeader();
331  const STRING_HEADER* this_header = GetHeader();
332 
333  if (cstr == NULL)
334  return this_header->used_ > 1; // either '\0' or NULL
335  else {
336  inT32 length = strlen(cstr) + 1;
337  return (this_header->used_ != length)
338  || (memcmp(GetCStr(), cstr, length) != 0);
339  }
340 }
341 
343  str.FixHeader();
344  const STRING_HEADER* str_header = str.GetHeader();
345  int str_used = str_header->used_;
346 
347  GetHeader()->used_ = 0; // clear since ensure doesn't need to copy data
348  char* this_cstr = ensure_cstr(str_used);
349  STRING_HEADER* this_header = GetHeader();
350 
351  memcpy(this_cstr, str.GetCStr(), str_used);
352  this_header->used_ = str_used;
353 
354  assert(InvariantOk());
355  return *this;
356 }
357 
359  FixHeader();
360  str.FixHeader();
361  const STRING_HEADER* str_header = str.GetHeader();
362  const char* str_cstr = str.GetCStr();
363  int str_used = str_header->used_;
364  int this_used = GetHeader()->used_;
365  char* this_cstr = ensure_cstr(this_used + str_used);
366 
367  STRING_HEADER* this_header = GetHeader(); // after ensure for realloc
368 
369  if (this_used > 1) {
370  memcpy(this_cstr + this_used - 1, str_cstr, str_used);
371  this_header->used_ += str_used - 1; // overwrite '\0'
372  } else {
373  memcpy(this_cstr, str_cstr, str_used);
374  this_header->used_ = str_used;
375  }
376 
377  assert(InvariantOk());
378  return *this;
379 }
380 
381 void STRING::add_str_int(const char* str, int number) {
382  if (str != NULL)
383  *this += str;
384  // Allow space for the maximum possible length of inT64.
385  char num_buffer[kMaxIntSize];
386  snprintf(num_buffer, kMaxIntSize - 1, "%d", number);
387  num_buffer[kMaxIntSize - 1] = '\0';
388  *this += num_buffer;
389 }
390 // Appends the given string and double (as a %.8g) to this.
391 void STRING::add_str_double(const char* str, double number) {
392  if (str != NULL)
393  *this += str;
394  // Allow space for the maximum possible length of %8g.
395  char num_buffer[kMaxDoubleSize];
396  snprintf(num_buffer, kMaxDoubleSize - 1, "%.8g", number);
397  num_buffer[kMaxDoubleSize - 1] = '\0';
398  *this += num_buffer;
399 }
400 
401 STRING & STRING::operator=(const char* cstr) {
402  STRING_HEADER* this_header = GetHeader();
403  if (cstr) {
404  int len = strlen(cstr) + 1;
405 
406  this_header->used_ = 0; // don't bother copying data if need to realloc
407  char* this_cstr = ensure_cstr(len);
408  this_header = GetHeader(); // for realloc
409  memcpy(this_cstr, cstr, len);
410  this_header->used_ = len;
411  } else {
412  // Reallocate to same state as default constructor.
413  DiscardData();
414  // Empty STRINGs contain just the "\0".
415  memcpy(AllocData(1, kMinCapacity), "", 1);
416  }
417 
418  assert(InvariantOk());
419  return *this;
420 }
421 
422 void STRING::assign(const char *cstr, int len) {
423  STRING_HEADER* this_header = GetHeader();
424  this_header->used_ = 0; // don't bother copying data if need to realloc
425  char* this_cstr = ensure_cstr(len + 1); // +1 for '\0'
426 
427  this_header = GetHeader(); // for realloc
428  memcpy(this_cstr, cstr, len);
429  this_cstr[len] = '\0';
430  this_header->used_ = len + 1;
431 
432  assert(InvariantOk());
433 }
434 
435 STRING STRING::operator+(const STRING& str) const {
436  STRING result(*this);
437  result += str;
438 
439  assert(InvariantOk());
440  return result;
441 }
442 
443 
444 STRING STRING::operator+(const char ch) const {
445  STRING result;
446  FixHeader();
447  const STRING_HEADER* this_header = GetHeader();
448  int this_used = this_header->used_;
449  char* result_cstr = result.ensure_cstr(this_used + 1);
450  STRING_HEADER* result_header = result.GetHeader();
451  int result_used = result_header->used_;
452 
453  // copies '\0' but we'll overwrite that
454  memcpy(result_cstr, GetCStr(), this_used);
455  result_cstr[result_used] = ch; // overwrite old '\0'
456  result_cstr[result_used + 1] = '\0'; // append on '\0'
457  ++result_header->used_;
458 
459  assert(InvariantOk());
460  return result;
461 }
462 
463 
464 STRING& STRING::operator+=(const char *str) {
465  if (!str || !*str) // empty string has no effect
466  return *this;
467 
468  FixHeader();
469  int len = strlen(str) + 1;
470  int this_used = GetHeader()->used_;
471  char* this_cstr = ensure_cstr(this_used + len);
472  STRING_HEADER* this_header = GetHeader(); // after ensure for realloc
473 
474  // if we had non-empty string then append overwriting old '\0'
475  // otherwise replace
476  if (this_used > 0) {
477  memcpy(this_cstr + this_used - 1, str, len);
478  this_header->used_ += len - 1;
479  } else {
480  memcpy(this_cstr, str, len);
481  this_header->used_ = len;
482  }
483 
484  assert(InvariantOk());
485  return *this;
486 }
487 
488 
489 STRING& STRING::operator+=(const char ch) {
490  if (ch == '\0')
491  return *this;
492 
493  FixHeader();
494  int this_used = GetHeader()->used_;
495  char* this_cstr = ensure_cstr(this_used + 1);
496  STRING_HEADER* this_header = GetHeader();
497 
498  if (this_used > 0)
499  --this_used; // undo old empty null if there was one
500 
501  this_cstr[this_used++] = ch; // append ch to end
502  this_cstr[this_used++] = '\0'; // append '\0' after ch
503  this_header->used_ = this_used;
504 
505  assert(InvariantOk());
506  return *this;
507 }
STRING & operator+=(const char *string)
Definition: strngs.cpp:464
void add_str_int(const char *str, int number)
Definition: strngs.cpp:381
const int kMaxDoubleSize
Definition: strngs.cpp:36
char & operator[](inT32 index) const
Definition: strngs.cpp:278
void truncate_at(inT32 index)
Definition: strngs.cpp:269
int32_t inT32
Definition: host.h:38
STRING operator+(const STRING &string) const
Definition: strngs.cpp:435
bool DeSerialize(bool swap, FILE *fp)
Definition: strngs.cpp:163
~STRING()
Definition: strngs.cpp:142
int push_back(T object)
const char * string() const
Definition: strngs.cpp:198
int FReadEndian(void *buffer, int size, int count)
Definition: serialis.cpp:97
inT32 length() const
Definition: strngs.cpp:193
void assign(const char *cstr, int len)
Definition: strngs.cpp:422
#define ASSERT_HOST(x)
Definition: errcode.h:84
unsigned char BOOL8
Definition: host.h:44
Definition: strngs.h:45
BOOL8 contains(const char c) const
Definition: strngs.cpp:189
void add_str_double(const char *str, double number)
Definition: strngs.cpp:391
bool Serialize(FILE *fp) const
Definition: strngs.cpp:148
int FWrite(const void *buffer, int size, int count)
Definition: serialis.cpp:148
char * alloc_string(inT32 count)
Definition: memry.cpp:30
BOOL8 operator!=(const STRING &string) const
Definition: strngs.cpp:317
const int kMinCapacity
Definition: strngs.cpp:52
BOOL8 operator==(const STRING &string) const
Definition: strngs.cpp:305
void free_string(char *string)
Definition: memry.cpp:35
const char * c_str() const
Definition: strngs.cpp:209
const int kMaxIntSize
Definition: strngs.cpp:33
static bool SkipDeSerialize(tesseract::TFile *fp)
Definition: strngs.cpp:183
STRING & operator=(const char *string)
Definition: strngs.cpp:401
STRING()
Definition: strngs.cpp:105
void split(const char c, GenericVector< STRING > *splited)
Definition: strngs.cpp:286
void ReverseN(void *ptr, int num_bytes)
Definition: helpers.h:184
int FRead(void *buffer, int size, int count)
Definition: serialis.cpp:108