tesseract  4.00.00dev
scanutils.cpp
Go to the documentation of this file.
1 // Copyright 2006 Google Inc.
2 // All Rights Reserved.
3 // Author: renn
4 //
5 // The fscanf, vfscanf and creat functions are implemented so that their
6 // functionality is mostly like their stdio counterparts. However, currently
7 // these functions do not use any buffering, making them rather slow.
8 // File streams are thus processed one character at a time.
9 // Although the implementations of the scanf functions do lack a few minor
10 // features, they should be sufficient for their use in tesseract.
11 //
12 // Licensed under the Apache License, Version 2.0 (the "License");
13 // you may not use this file except in compliance with the License.
14 // You may obtain a copy of the License at
15 // http://www.apache.org/licenses/LICENSE-2.0
16 // Unless required by applicable law or agreed to in writing, software
17 // distributed under the License is distributed on an "AS IS" BASIS,
18 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
19 // See the License for the specific language governing permissions and
20 // limitations under the License.
21 
22 #ifdef HAVE_CONFIG_H
23 #include "config_auto.h"
24 #endif
25 
26 #include <ctype.h>
27 #include <math.h>
28 #include <stdarg.h>
29 #include <stddef.h>
30 #include <string.h>
31 #include <limits.h>
32 #include <stdio.h>
33 #include <sys/types.h>
34 #include <sys/stat.h>
35 #include <fcntl.h>
36 
37 #include "scanutils.h"
38 #include "tprintf.h"
39 
// Bit flags collected while parsing a single conversion specification.
enum Flags {
  FL_SPLAT = 1 << 0,  // '*' seen: parse the value but do not assign it
  FL_INV   = 1 << 1,  // Character set (%[) is inverted
  FL_WIDTH = 1 << 2,  // An explicit field width was given
  FL_MINUS = 1 << 3,  // Negative number
};
46 
// Size "rank" of an integer conversion. Each 'h' in a format lowers the rank
// by one and each 'l' raises it by one, starting from RANK_INT.
enum Ranks {
  RANK_CHAR = -2,
  RANK_SHORT = -1,
  RANK_INT = 0,
  RANK_LONG = 1,
  RANK_LONGLONG = 2,
  RANK_PTR = INT_MAX  // Special value used for pointers
};

// Bounds used to canonicalize a rank after the length modifiers are parsed.
const enum Ranks kMinRank = RANK_CHAR;
const enum Ranks kMaxRank = RANK_LONGLONG;

// Ranks selected by the 'j' (intmax_t), 'z' (size_t) and 't' (ptrdiff_t)
// length modifiers.
const enum Ranks kIntMaxRank = RANK_LONGLONG;
const enum Ranks kSizeTRank = RANK_LONG;
const enum Ranks kPtrDiffRank = RANK_LONG;
62 
// Reason the scanner stopped processing the format string early.
enum Bail {
  BAIL_NONE = 0,  // No error condition
  BAIL_EOF = 1,   // Hit EOF
  BAIL_ERR = 2    // Conversion mismatch
};
68 
69 // Helper functions ------------------------------------------------------------
// Returns the number of bits in a long.
inline size_t LongBit() {
  return sizeof(long) * CHAR_BIT;
}
73 
// Consumes whitespace from |s| and returns the first non-whitespace character
// (or EOF). That character is pushed back so it is still available to the
// next read.
static inline int
SkipSpace(FILE *s) {
  int next;
  do {
    next = fgetc(s);
  } while (isspace(next));
  ungetc(next, s);  // Make sure next char is available for reading
  return next;
}
81 
82 static inline void
83 SetBit(unsigned long *bitmap, unsigned int bit) {
84  bitmap[bit/LongBit()] |= 1UL << (bit%LongBit());
85 }
86 
87 static inline int
88 TestBit(unsigned long *bitmap, unsigned int bit) {
89  return static_cast<int>(bitmap[bit/LongBit()] >> (bit%LongBit())) & 1;
90 }
91 
// Returns the numeric value of the digit character |ch| in the given |base|,
// or -1 if |ch| is not a valid digit.
// Decimal digits are accepted for any base >= 10; for smaller bases only
// '0'..'7' are accepted. The letters 'A'..'F' / 'a'..'f' are accepted only
// for base 16, so characters such as 'G' no longer parse as hex digits.
static inline int DigitValue(int ch, int base) {
  if (ch >= '0' && ch <= '9') {
    if (base >= 10 || ch <= '7')
      return ch - '0';
  } else if (base == 16) {
    if (ch >= 'A' && ch <= 'F')
      return ch - 'A' + 10;
    if (ch >= 'a' && ch <= 'f')
      return ch - 'a' + 10;
  }
  return -1;
}
103 
104 // IO (re-)implementations -----------------------------------------------------
105 uintmax_t streamtoumax(FILE* s, int base) {
106  int minus = 0;
107  uintmax_t v = 0;
108  int d, c = 0;
109 
110  for (c = fgetc(s);
111  isspace(static_cast<unsigned char>(c)) && (c != EOF);
112  c = fgetc(s)) {}
113 
114  // Single optional + or -
115  if (c == '-' || c == '+') {
116  minus = (c == '-');
117  c = fgetc(s);
118  }
119 
120  // Assign correct base
121  if (base == 0) {
122  if (c == '0') {
123  c = fgetc(s);
124  if (c == 'x' || c == 'X') {
125  base = 16;
126  c = fgetc(s);
127  } else {
128  base = 8;
129  }
130  }
131  } else if (base == 16) {
132  if (c == '0') {
133  c = fgetc(s);
134  if (c == 'x' || c == 'X') c = fgetc(s);
135  }
136  }
137 
138  // Actual number parsing
139  for (; (c != EOF) && (d = DigitValue(c, base)) >= 0; c = fgetc(s))
140  v = v*base + d;
141 
142  ungetc(c, s);
143  return minus ? -v : v;
144 }
145 
// Reads a decimal floating point number from |s|: optional leading
// whitespace, an optional sign, integer digits, an optional '.' followed by
// fraction digits, and an optional exponent ('e' or 'E', optional sign,
// digits). The first character after the number is pushed back onto the
// stream.
//
// All parts are accumulated in doubles so that long digit strings cannot
// overflow an int. Returns 0.0 (or -0.0 after a '-') if no digits are found.
double streamtofloat(FILE* s) {
  int c = fgetc(s);
  while (c != EOF && isspace(static_cast<unsigned char>(c)))
    c = fgetc(s);

  // Single optional + or -
  bool minus = false;
  if (c == '-' || c == '+') {
    minus = (c == '-');
    c = fgetc(s);
  }

  // Integer part
  double int_part = 0.0;
  for (; c >= '0' && c <= '9'; c = fgetc(s))
    int_part = int_part * 10.0 + (c - '0');

  // Fractional part. Digits beyond the precision of a double are consumed
  // but ignored, which also keeps |scale| finite.
  double frac_part = 0.0;
  double scale = 1.0;
  if (c == '.') {
    for (c = fgetc(s); c >= '0' && c <= '9'; c = fgetc(s)) {
      if (scale < 1e18) {
        frac_part = frac_part * 10.0 + (c - '0');
        scale *= 10.0;
      }
    }
  }
  double f = int_part + frac_part / scale;

  // Optional exponent
  if (c == 'e' || c == 'E') {
    c = fgetc(s);
    int expsign = 1;
    if (c == '-' || c == '+') {
      expsign = (c == '-') ? -1 : 1;
      c = fgetc(s);
    }
    int exponent = 0;
    for (; c >= '0' && c <= '9'; c = fgetc(s)) {
      // Stop growing once the result is far outside the range of a double;
      // this prevents signed overflow on absurdly long exponents.
      if (exponent < 10000)
        exponent = exponent * 10 + (c - '0');
    }
    exponent *= expsign;
    f *= pow(10.0, static_cast<double>(exponent));
  }
  ungetc(c, s);  // No-op when c is EOF

  return minus ? -f : f;
}
192 
193 double strtofloat(const char* s) {
194  int minus = 0;
195  int v = 0;
196  int d;
197  int k = 1;
198  int w = 0;
199 
200  while(*s && isspace(static_cast<unsigned char>(*s))) s++;
201 
202  // Single optional + or -
203  if (*s == '-' || *s == '+') {
204  minus = (*s == '-');
205  s++;
206  }
207 
208  // Actual number parsing
209  for (; *s && (d = DigitValue(*s, 10)) >= 0; s++)
210  v = v*10 + d;
211  if (*s == '.') {
212  for (++s; *s && (d = DigitValue(*s, 10)) >= 0; s++) {
213  w = w*10 + d;
214  k *= 10;
215  }
216  }
217  if (*s == 'e' || *s == 'E')
218  tprintf("WARNING: Scientific Notation not supported!");
219 
220  double f = static_cast<double>(v)
221  + static_cast<double>(w) / static_cast<double>(k);
222 
223  return minus ? -f : f;
224 }
225 
static int tvfscanf(FILE* stream, const char *format, va_list ap);

// Variadic front end: packs the trailing arguments into a va_list, hands them
// to tvfscanf and returns its result.
int tfscanf(FILE* stream, const char *format, ...) {
  va_list args;
  va_start(args, format);
  const int result = tvfscanf(stream, format, args);
  va_end(args);
  return result;
}
238 
239 #ifdef EMBEDDED
240 
241 int fscanf(FILE* stream, const char *format, ...) {
242  va_list ap;
243  int rv;
244 
245  va_start(ap, format);
246  rv = tvfscanf(stream, format, ap);
247  va_end(ap);
248 
249  return rv;
250 }
251 
252 int vfscanf(FILE* stream, const char *format, ...) {
253  va_list ap;
254  int rv;
255 
256  va_start(ap, format);
257  rv = tvfscanf(stream, format, ap);
258  va_end(ap);
259 
260  return rv;
261 }
262 #endif
263 
264 static int tvfscanf(FILE* stream, const char *format, va_list ap) {
265  const char *p = format;
266  char ch;
267  int q = 0;
268  uintmax_t val = 0;
269  int rank = RANK_INT; // Default rank
270  unsigned int width = UINT_MAX;
271  int base;
272  int flags = 0;
273  enum {
274  ST_NORMAL, // Ground state
275  ST_FLAGS, // Special flags
276  ST_WIDTH, // Field width
277  ST_MODIFIERS, // Length or conversion modifiers
278  ST_MATCH_INIT, // Initial state of %[ sequence
279  ST_MATCH, // Main state of %[ sequence
280  ST_MATCH_RANGE, // After - in a %[ sequence
281  } state = ST_NORMAL;
282  char *sarg = NULL; // %s %c or %[ string argument
283  enum Bail bail = BAIL_NONE;
284  int converted = 0; // Successful conversions
285  unsigned long matchmap[((1 << CHAR_BIT)+(CHAR_BIT * sizeof(long) - 1)) /
286  (CHAR_BIT * sizeof(long))];
287  int matchinv = 0; // Is match map inverted?
288  unsigned char range_start = 0;
289  off_t start_off = ftell(stream);
290 
291  // Skip leading spaces
292  SkipSpace(stream);
293 
294  while ((ch = *p++) && !bail) {
295  switch (state) {
296  case ST_NORMAL:
297  if (ch == '%') {
298  state = ST_FLAGS;
299  flags = 0; rank = RANK_INT; width = UINT_MAX;
300  } else if (isspace(static_cast<unsigned char>(ch))) {
301  SkipSpace(stream);
302  } else {
303  if (fgetc(stream) != ch)
304  bail = BAIL_ERR; // Match failure
305  }
306  break;
307 
308  case ST_FLAGS:
309  if (ch == '*') {
310  flags |= FL_SPLAT;
311  } else if ('0' <= ch && ch <= '9') {
312  width = (ch-'0');
313  state = ST_WIDTH;
314  flags |= FL_WIDTH;
315  } else {
316  state = ST_MODIFIERS;
317  p--; // Process this character again
318  }
319  break;
320 
321  case ST_WIDTH:
322  if (ch >= '0' && ch <= '9') {
323  width = width*10+(ch-'0');
324  } else {
325  state = ST_MODIFIERS;
326  p--; // Process this character again
327  }
328  break;
329 
330  case ST_MODIFIERS:
331  switch (ch) {
332  // Length modifiers - nonterminal sequences
333  case 'h':
334  rank--; // Shorter rank
335  break;
336  case 'l':
337  rank++; // Longer rank
338  break;
339  case 'j':
340  rank = kIntMaxRank;
341  break;
342  case 'z':
343  rank = kSizeTRank;
344  break;
345  case 't':
346  rank = kPtrDiffRank;
347  break;
348  case 'L':
349  case 'q':
350  rank = RANK_LONGLONG; // long double/long long
351  break;
352 
353  default:
354  // Output modifiers - terminal sequences
355  state = ST_NORMAL; // Next state will be normal
356  if (rank < kMinRank) // Canonicalize rank
357  rank = kMinRank;
358  else if (rank > kMaxRank)
359  rank = kMaxRank;
360 
361  switch (ch) {
362  case 'P': // Upper case pointer
363  case 'p': // Pointer
364  rank = RANK_PTR;
365  base = 0;
366  goto scan_int;
367 
368  case 'i': // Base-independent integer
369  base = 0;
370  goto scan_int;
371 
372  case 'd': // Decimal integer
373  base = 10;
374  goto scan_int;
375 
376  case 'o': // Octal integer
377  base = 8;
378  goto scan_int;
379 
380  case 'u': // Unsigned decimal integer
381  base = 10;
382  goto scan_int;
383 
384  case 'x': // Hexadecimal integer
385  case 'X':
386  base = 16;
387  goto scan_int;
388 
389  case 'n': // Number of characters consumed
390  val = ftell(stream) - start_off;
391  goto set_integer;
392 
393  scan_int:
394  q = SkipSpace(stream);
395  if ( q <= 0 ) {
396  bail = BAIL_EOF;
397  break;
398  }
399  val = streamtoumax(stream, base);
400  // fall through
401 
402  set_integer:
403  if (!(flags & FL_SPLAT)) {
404  converted++;
405  switch(rank) {
406  case RANK_CHAR:
407  *va_arg(ap, unsigned char *)
408  = static_cast<unsigned char>(val);
409  break;
410  case RANK_SHORT:
411  *va_arg(ap, unsigned short *)
412  = static_cast<unsigned short>(val);
413  break;
414  case RANK_INT:
415  *va_arg(ap, unsigned int *)
416  = static_cast<unsigned int>(val);
417  break;
418  case RANK_LONG:
419  *va_arg(ap, unsigned long *)
420  = static_cast<unsigned long>(val);
421  break;
422  case RANK_LONGLONG:
423  *va_arg(ap, unsigned long long *)
424  = static_cast<unsigned long long>(val);
425  break;
426  case RANK_PTR:
427  *va_arg(ap, void **)
428  = reinterpret_cast<void *>(static_cast<uintptr_t>(val));
429  break;
430  }
431  }
432  break;
433 
434  case 'f': // Preliminary float value parsing
435  case 'g':
436  case 'G':
437  case 'e':
438  case 'E':
439  q = SkipSpace(stream);
440  if (q <= 0) {
441  bail = BAIL_EOF;
442  break;
443  }
444 
445  {
446  double fval = streamtofloat(stream);
447  if (!(flags & FL_SPLAT)) {
448  if (rank == RANK_INT)
449  *va_arg(ap, float *) = static_cast<float>(fval);
450  else if (rank == RANK_LONG)
451  *va_arg(ap, double *) = static_cast<double>(fval);
452  converted++;
453  }
454  }
455  break;
456 
457  case 'c': // Character
458  width = (flags & FL_WIDTH) ? width : 1; // Default width == 1
459  sarg = va_arg(ap, char *);
460  while (width--) {
461  if ((q = fgetc(stream)) <= 0) {
462  bail = BAIL_EOF;
463  break;
464  }
465  if (!(flags & FL_SPLAT)) {
466  *sarg++ = q;
467  converted++;
468  }
469  }
470  break;
471 
472  case 's': // String
473  {
474  char *sp;
475  sp = sarg = va_arg(ap, char *);
476  while (width--) {
477  q = fgetc(stream);
478  if (isspace(static_cast<unsigned char>(q)) || q <= 0) {
479  ungetc(q, stream);
480  break;
481  }
482  if (!(flags & FL_SPLAT)) *sp = q;
483  sp++;
484  }
485  if (sarg == sp) {
486  bail = BAIL_EOF;
487  } else if (!(flags & FL_SPLAT)) {
488  *sp = '\0'; // Terminate output
489  converted++;
490  } else {
491  }
492  }
493  break;
494 
495  case '[': // Character range
496  sarg = va_arg(ap, char *);
497  state = ST_MATCH_INIT;
498  matchinv = 0;
499  memset(matchmap, 0, sizeof matchmap);
500  break;
501 
502  case '%': // %% sequence
503  if (fgetc(stream) != '%' )
504  bail = BAIL_ERR;
505  break;
506 
507  default: // Anything else
508  bail = BAIL_ERR; // Unknown sequence
509  break;
510  }
511  }
512  break;
513 
514  case ST_MATCH_INIT: // Initial state for %[ match
515  if (ch == '^' && !(flags & FL_INV)) {
516  matchinv = 1;
517  } else {
518  SetBit(matchmap, static_cast<unsigned char>(ch));
519  state = ST_MATCH;
520  }
521  break;
522 
523  case ST_MATCH: // Main state for %[ match
524  if (ch == ']') {
525  goto match_run;
526  } else if (ch == '-') {
527  range_start = static_cast<unsigned char>(ch);
528  state = ST_MATCH_RANGE;
529  } else {
530  SetBit(matchmap, static_cast<unsigned char>(ch));
531  }
532  break;
533 
534  case ST_MATCH_RANGE: // %[ match after -
535  if (ch == ']') {
536  SetBit(matchmap, static_cast<unsigned char>('-'));
537  goto match_run;
538  } else {
539  int i;
540  for (i = range_start ; i < (static_cast<unsigned char>(ch)) ; i++)
541  SetBit(matchmap, i);
542  state = ST_MATCH;
543  }
544  break;
545 
546  match_run: // Match expression finished
547  char* oarg = sarg;
548  while (width) {
549  q = fgetc(stream);
550  unsigned char qc = static_cast<unsigned char>(q);
551  if (q <= 0 || !(TestBit(matchmap, qc)^matchinv)) {
552  ungetc(q, stream);
553  break;
554  }
555  if (!(flags & FL_SPLAT)) *sarg = q;
556  sarg++;
557  }
558  if (oarg == sarg) {
559  bail = (q <= 0) ? BAIL_EOF : BAIL_ERR;
560  } else if (!(flags & FL_SPLAT)) {
561  *sarg = '\0';
562  converted++;
563  }
564  break;
565  }
566  }
567 
568  if (bail == BAIL_EOF && !converted)
569  converted = -1; // Return EOF (-1)
570 
571  return converted;
572 }
573 
574 #ifdef EMBEDDED
// Replacement for creat() in EMBEDDED builds: opens |pathname| write-only
// with O_CREAT and O_TRUNC, passing |mode| through to open(), and returns
// whatever open() returns.
int creat(const char *pathname, mode_t mode) {
  const int open_flags = O_WRONLY | O_CREAT | O_TRUNC;
  return open(pathname, open_flags, mode);
}
578 
579 #endif // EMBEDDED
int tfscanf(FILE *stream, const char *format,...)
Definition: scanutils.cpp:228
size_t LongBit()
Definition: scanutils.cpp:70
voidpf stream
Definition: ioapi.h:39
Ranks
Definition: scanutils.cpp:47
enum Ranks kMinRank
Definition: scanutils.cpp:56
Bail
Definition: scanutils.cpp:63
#define tprintf(...)
Definition: tprintf.h:31
Flags
Definition: scanutils.cpp:40
enum Ranks kIntMaxRank
Definition: scanutils.cpp:59
enum Ranks kMaxRank
Definition: scanutils.cpp:57
const char int mode
Definition: ioapi.h:38
enum Ranks kPtrDiffRank
Definition: scanutils.cpp:61
enum Ranks kSizeTRank
Definition: scanutils.cpp:60
typedef long(ZCALLBACK *tell_file_func) OF((voidpf opaque
double streamtofloat(FILE *s)
Definition: scanutils.cpp:146
double v[max]
uintmax_t streamtoumax(FILE *s, int base)
Definition: scanutils.cpp:105
double strtofloat(const char *s)
Definition: scanutils.cpp:193