tesseract  4.00.00dev
pango_font_info.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: pango_font_info.cpp
3  * Description: Font-related objects and helper functions
4  * Author: Ranjith Unnikrishnan
5  * Created: Mon Nov 18 2013
6  *
7  * (C) Copyright 2013, Google Inc.
8  * Licensed under the Apache License, Version 2.0 (the "License");
9  * you may not use this file except in compliance with the License.
10  * You may obtain a copy of the License at
11  * http://www.apache.org/licenses/LICENSE-2.0
12  * Unless required by applicable law or agreed to in writing, software
13  * distributed under the License is distributed on an "AS IS" BASIS,
14  * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  * See the License for the specific language governing permissions and
16  * limitations under the License.
17  *
18  **********************************************************************/
19 
20 // Include automatically generated configuration file if running autoconf.
21 #ifdef HAVE_CONFIG_H
22 #include "config_auto.h"
23 #endif
24 
25 #if (defined __MINGW32__) || (defined __CYGWIN__)
26 // workaround for stdlib.h and putenv
27 #undef __STRICT_ANSI__
28 
29 #if (defined __MINGW32__)
30 #include "strcasestr.h"
31 #elif !defined(_GNU_SOURCE)
32 // needed for strcasestr in string.h
33 #define _GNU_SOURCE
34 #endif
35 
36 #elif defined(_MSC_VER)
37 #include "strcasestr.h"
38 #define strncasecmp _strnicmp
39 #define strcasecmp _stricmp
40 #endif
41 
42 #include <stdlib.h>
43 #include <stdio.h>
44 #include <string.h>
45 #ifndef _MSC_VER
46 #include <sys/param.h>
47 #endif
48 #include <algorithm>
49 
50 #include "pango_font_info.h"
51 #include "commandlineflags.h"
52 #include "fileio.h"
53 #include "normstrngs.h"
54 #include "tlog.h"
55 #include "unichar.h"
56 #include "util.h"
57 #include "pango/pango.h"
58 #include "pango/pangocairo.h"
59 #include "pango/pangofc-font.h"
60 
61 STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp",
62  "Overrides fontconfig default temporary dir");
63 
64 #ifndef USE_STD_NAMESPACE
65 #include "ocr/trainingdata/typesetting/legacy_fonts.h"
66 BOOL_PARAM_FLAG(use_only_legacy_fonts, false,
67  "Overrides --fonts_dir and sets the known universe of fonts to"
68  "the list in legacy_fonts.h");
69 
70 STRING_PARAM_FLAG(fonts_dir, "/auto/ocr-data/tesstraining/fonts",
71  "Overrides system default font location");
72 #else
73 using std::pair;
74 STRING_PARAM_FLAG(fonts_dir, "",
75  "If empty it use system default. Otherwise it overrides"
76  " system default font location");
77 #endif
78 
79 namespace tesseract {
80 
81 // Default assumed output resolution. Required only for providing font metrics
82 // in pixels.
83 const int kDefaultResolution = 300;
84 
85 string PangoFontInfo::fonts_dir_;
86 string PangoFontInfo::cache_dir_;
87 
89  : desc_(nullptr), resolution_(kDefaultResolution) {
90  Clear();
91 }
92 
93 PangoFontInfo::PangoFontInfo(const string& desc)
94  : desc_(nullptr), resolution_(kDefaultResolution) {
95  if (!ParseFontDescriptionName(desc)) {
96  tprintf("ERROR: Could not parse %s\n", desc.c_str());
97  Clear();
98  }
99 }
100 
101 void PangoFontInfo::Clear() {
102  font_size_ = 0;
103  is_bold_ = false;
104  is_italic_ = false;
105  is_smallcaps_ = false;
106  is_monospace_ = false;
107  family_name_.clear();
108  font_type_ = UNKNOWN;
109  if (desc_) {
110  pango_font_description_free(desc_);
111  desc_ = nullptr;
112  }
113 }
114 
115 PangoFontInfo::~PangoFontInfo() { pango_font_description_free(desc_); }
116 
118  if (!desc_) return "";
119  char* desc_str = pango_font_description_to_string(desc_);
120  string desc_name(desc_str);
121  g_free(desc_str);
122  return desc_name;
123 }
124 
125 // If not already initialized, initializes FontConfig by setting its
126 // environment variable and creating a fonts.conf file that points to the
127 // FLAGS_fonts_dir and the cache to FLAGS_fontconfig_tmpdir.
128 /* static */
130  if (fonts_dir_.empty()) {
131  HardInitFontConfig(FLAGS_fonts_dir.c_str(),
132  FLAGS_fontconfig_tmpdir.c_str());
133  }
134 }
135 
136 // Re-initializes font config, whether or not already initialized.
137 // If already initialized, any existing cache is deleted, just to be sure.
138 /* static */
139 void PangoFontInfo::HardInitFontConfig(const string& fonts_dir,
140  const string& cache_dir) {
141  if (!cache_dir_.empty()) {
143  File::JoinPath(cache_dir_.c_str(), "*cache-?").c_str());
144  }
145  const int MAX_FONTCONF_FILESIZE = 1024;
146  char fonts_conf_template[MAX_FONTCONF_FILESIZE];
147  cache_dir_ = cache_dir;
148  fonts_dir_ = fonts_dir;
149  snprintf(fonts_conf_template, MAX_FONTCONF_FILESIZE,
150  "<?xml version=\"1.0\"?>\n"
151  "<!DOCTYPE fontconfig SYSTEM \"fonts.dtd\">\n"
152  "<fontconfig>\n"
153  "<dir>%s</dir>\n"
154  "<cachedir>%s</cachedir>\n"
155  "<config></config>\n"
156  "</fontconfig>",
157  fonts_dir.c_str(), cache_dir_.c_str());
158  string fonts_conf_file = File::JoinPath(cache_dir_.c_str(), "fonts.conf");
159  File::WriteStringToFileOrDie(fonts_conf_template, fonts_conf_file);
160 #ifdef _WIN32
161  std::string env("FONTCONFIG_PATH=");
162  env.append(cache_dir_.c_str());
163  putenv(env.c_str());
164  putenv("LANG=en_US.utf8");
165 #else
166  setenv("FONTCONFIG_PATH", cache_dir_.c_str(), true);
167  // Fix the locale so that the reported font names are consistent.
168  setenv("LANG", "en_US.utf8", true);
169 #endif // _WIN32
170 
171  if (FcInitReinitialize() != FcTrue) {
172  tprintf("FcInitiReinitialize failed!!\n");
173  }
175  // Clear Pango's font cache too.
176  pango_cairo_font_map_set_default(nullptr);
177 }
178 
179 static void ListFontFamilies(PangoFontFamily*** families,
180  int* n_families) {
182  PangoFontMap* font_map = pango_cairo_font_map_get_default();
184  pango_font_map_list_families(font_map, families, n_families);
185 }
186 
187 // Inspects whether a given font family is monospace. If the font is not
188 // available, it cannot make a decision and returns false by default.
189 static bool IsMonospaceFontFamily(const char* family_name) {
190  PangoFontFamily** families = 0;
191  int n_families = 0;
192  bool is_monospace = false;
193  ListFontFamilies(&families, &n_families);
194  ASSERT_HOST(n_families > 0);
195  bool found = false;
196  for (int i = 0; i < n_families; ++i) {
197  if (!strcasecmp(family_name, pango_font_family_get_name(families[i]))) {
198  is_monospace = pango_font_family_is_monospace(families[i]);
199  found = true;
200  break;
201  }
202  }
203  if (!found) {
204  tlog(1, "Could not find monospace property of family %s\n", family_name);
205  }
206  g_free(families);
207  return is_monospace;
208 }
209 
210 bool PangoFontInfo::ParseFontDescription(const PangoFontDescription *desc) {
211  Clear();
212  const char* family = pango_font_description_get_family(desc);
213  if (!family) {
214  char* desc_str = pango_font_description_to_string(desc);
215  tprintf("WARNING: Could not parse family name from description: '%s'\n",
216  desc_str);
217  g_free(desc_str);
218  return false;
219  }
220  family_name_ = string(family);
221  desc_ = pango_font_description_copy(desc);
222  is_monospace_ = IsMonospaceFontFamily(family);
223 
224  // Set font size in points
225  font_size_ = pango_font_description_get_size(desc);
226  if (!pango_font_description_get_size_is_absolute(desc)) {
227  font_size_ /= PANGO_SCALE;
228  }
229 
230  PangoStyle style = pango_font_description_get_style(desc);
231  is_italic_ = (PANGO_STYLE_ITALIC == style ||
232  PANGO_STYLE_OBLIQUE == style);
233  is_smallcaps_ = (pango_font_description_get_variant(desc)
234  == PANGO_VARIANT_SMALL_CAPS);
235 
236  is_bold_ = (pango_font_description_get_weight(desc) >= PANGO_WEIGHT_BOLD);
237  // We don't have a way to detect whether a font is of type Fraktur. The fonts
238  // we currently use all have "Fraktur" in their family name, so we do a
239  // fragile but functional check for that here.
240  is_fraktur_ = (strcasestr(family, "Fraktur") != nullptr);
241  return true;
242 }
243 
244 bool PangoFontInfo::ParseFontDescriptionName(const string& name) {
245  PangoFontDescription *desc = pango_font_description_from_string(name.c_str());
246  bool success = ParseFontDescription(desc);
247  pango_font_description_free(desc);
248  return success;
249 }
250 
251 // Returns the PangoFont structure corresponding to the closest available font
252 // in the font map. Note that if the font is wholly missing, this could
253 // correspond to a completely different font family and face.
254 PangoFont* PangoFontInfo::ToPangoFont() const {
256  PangoFontMap* font_map = pango_cairo_font_map_get_default();
257  PangoContext* context = pango_context_new();
258  pango_cairo_context_set_resolution(context, resolution_);
259  pango_context_set_font_map(context, font_map);
260  PangoFont* font = nullptr;
261  {
263  font = pango_font_map_load_font(font_map, context, desc_);
264  }
265  g_object_unref(context);
266  return font;
267 }
268 
269 bool PangoFontInfo::CoversUTF8Text(const char* utf8_text, int byte_length) const {
270  PangoFont* font = ToPangoFont();
271  PangoCoverage* coverage = pango_font_get_coverage(font, nullptr);
272  for (UNICHAR::const_iterator it = UNICHAR::begin(utf8_text, byte_length);
273  it != UNICHAR::end(utf8_text, byte_length);
274  ++it) {
275  if (IsWhitespace(*it) || pango_is_zero_width(*it))
276  continue;
277  if (pango_coverage_get(coverage, *it) != PANGO_COVERAGE_EXACT) {
278  char tmp[5];
279  int len = it.get_utf8(tmp);
280  tmp[len] = '\0';
281  tlog(2, "'%s' (U+%x) not covered by font\n", tmp, *it);
282  return false;
283  }
284  }
285  return true;
286 }
287 
// This variant of strncpy permits src and dest to overlap, provided dest does
// not lie after src. Bytes are copied front to back, first byte first.
//
// Copies at most n bytes from src to dest, stopping at the first nul byte in
// src. If fewer than n bytes were copied, the remainder of the n bytes in
// dest is filled with nul bytes. With n == 0 nothing is read or written.
// Returns dest.
static char* my_strnmove(char* dest, const char* src, size_t n) {
  char* ret = dest;

  // Copy characters until n reaches zero or the src byte is a nul. Testing
  // before each copy keeps n == 0 from wrapping around and keeps an empty src
  // from being read past its terminator.
  while (n > 0 && *src != '\0') {
    *dest = *src;
    ++dest;
    ++src;
    --n;
  }

  // If we reached a nul byte and there are more 'n' left, zero them out.
  while (n > 0) {
    *dest = '\0';
    ++dest;
    --n;
  }
  return ret;
}
309 
310 int PangoFontInfo::DropUncoveredChars(string* utf8_text) const {
311  PangoFont* font = ToPangoFont();
312  PangoCoverage* coverage = pango_font_get_coverage(font, nullptr);
313  int num_dropped_chars = 0;
314  // Maintain two iterators that point into the string. For space efficiency, we
315  // will repeatedly copy one covered UTF8 character from one to the other, and
316  // at the end resize the string to the right length.
317  char* out = const_cast<char*>(utf8_text->c_str());
318  const UNICHAR::const_iterator it_begin =
319  UNICHAR::begin(utf8_text->c_str(), utf8_text->length());
320  const UNICHAR::const_iterator it_end =
321  UNICHAR::end(utf8_text->c_str(), utf8_text->length());
322  for (UNICHAR::const_iterator it = it_begin; it != it_end;) {
323  // Skip bad utf-8.
324  if (!it.is_legal()) {
325  ++it; // One suitable error message will still be issued.
326  continue;
327  }
328  int unicode = *it;
329  int utf8_len = it.utf8_len();
330  const char* utf8_char = it.utf8_data();
331  // Move it forward before the data gets modified.
332  ++it;
333  if (!IsWhitespace(unicode) && !pango_is_zero_width(unicode) &&
334  pango_coverage_get(coverage, unicode) != PANGO_COVERAGE_EXACT) {
335  if (TLOG_IS_ON(2)) {
336  UNICHAR unichar(unicode);
337  char* str = unichar.utf8_str();
338  tlog(2, "'%s' (U+%x) not covered by font\n", str, unicode);
339  delete[] str;
340  }
341  ++num_dropped_chars;
342  continue;
343  }
344  my_strnmove(out, utf8_char, utf8_len);
345  out += utf8_len;
346  }
347  utf8_text->resize(out - utf8_text->c_str());
348  return num_dropped_chars;
349 }
350 
351 bool PangoFontInfo::GetSpacingProperties(const string& utf8_char,
352  int* x_bearing, int* x_advance) const {
353  // Convert to equivalent PangoFont structure
354  PangoFont* font = ToPangoFont();
355  // Find the glyph index in the font for the supplied utf8 character.
356  int total_advance = 0;
357  int min_bearing = 0;
358  // Handle multi-unicode strings by reporting the left-most position of the
359  // x-bearing, and right-most position of the x-advance if the string were to
360  // be rendered.
361  const UNICHAR::const_iterator it_begin = UNICHAR::begin(utf8_char.c_str(),
362  utf8_char.length());
363  const UNICHAR::const_iterator it_end = UNICHAR::end(utf8_char.c_str(),
364  utf8_char.length());
365  for (UNICHAR::const_iterator it = it_begin; it != it_end; ++it) {
366  PangoGlyph glyph_index = pango_fc_font_get_glyph(
367  reinterpret_cast<PangoFcFont*>(font), *it);
368  if (!glyph_index) {
369  // Glyph for given unicode character doesn't exist in font.
370  return false;
371  }
372  // Find the ink glyph extents for the glyph
373  PangoRectangle ink_rect, logical_rect;
374  pango_font_get_glyph_extents(font, glyph_index, &ink_rect, &logical_rect);
375  pango_extents_to_pixels(&ink_rect, nullptr);
376  pango_extents_to_pixels(&logical_rect, nullptr);
377 
378  int bearing = total_advance + PANGO_LBEARING(ink_rect);
379  if (it == it_begin || bearing < min_bearing) {
380  min_bearing = bearing;
381  }
382  total_advance += PANGO_RBEARING(logical_rect);
383  }
384  *x_bearing = min_bearing;
385  *x_advance = total_advance;
386  return true;
387 }
388 
389 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len) const {
390  std::vector<string> graphemes;
391  return CanRenderString(utf8_word, len, &graphemes);
392 }
393 
394 bool PangoFontInfo::CanRenderString(const char* utf8_word, int len,
395  std::vector<string>* graphemes) const {
396  if (graphemes) graphemes->clear();
397  // We check for font coverage of the text first, as otherwise Pango could
398  // (undesirably) fall back to another font that does have the required
399  // coverage.
400  if (!CoversUTF8Text(utf8_word, len)) {
401  return false;
402  }
403  // U+25CC dotted circle character that often (but not always) gets rendered
404  // when there is an illegal grapheme sequence.
405  const char32 kDottedCircleGlyph = 9676;
406  bool bad_glyph = false;
407  PangoFontMap* font_map = pango_cairo_font_map_get_default();
408  PangoContext* context = pango_context_new();
409  pango_context_set_font_map(context, font_map);
410  PangoLayout* layout;
411  {
412  // Pango is not relasing the cached layout.
414  layout = pango_layout_new(context);
415  }
416  if (desc_) {
417  pango_layout_set_font_description(layout, desc_);
418  } else {
419  PangoFontDescription *desc = pango_font_description_from_string(
420  DescriptionName().c_str());
421  pango_layout_set_font_description(layout, desc);
422  pango_font_description_free(desc);
423  }
424  pango_layout_set_text(layout, utf8_word, len);
425  PangoLayoutIter* run_iter = nullptr;
426  { // Fontconfig caches some information here that is not freed before exit.
428  run_iter = pango_layout_get_iter(layout);
429  }
430  do {
431  PangoLayoutRun* run = pango_layout_iter_get_run_readonly(run_iter);
432  if (!run) {
433  tlog(2, "Found end of line nullptr run marker\n");
434  continue;
435  }
436  PangoGlyph dotted_circle_glyph;
437  PangoFont* font = run->item->analysis.font;
438 
439 #ifdef _WIN32 // Fixme! Leaks memory and breaks unittests.
440  PangoGlyphString* glyphs = pango_glyph_string_new();
441  char s[] = "\xc2\xa7";
442  pango_shape(s, sizeof(s), &(run->item->analysis), glyphs);
443  dotted_circle_glyph = glyphs->glyphs[0].glyph;
444 #else
445  dotted_circle_glyph = pango_fc_font_get_glyph(
446  reinterpret_cast<PangoFcFont*>(font), kDottedCircleGlyph);
447 #endif
448 
449  if (TLOG_IS_ON(2)) {
450  PangoFontDescription* desc = pango_font_describe(font);
451  char* desc_str = pango_font_description_to_string(desc);
452  tlog(2, "Desc of font in run: %s\n", desc_str);
453  g_free(desc_str);
454  pango_font_description_free(desc);
455  }
456 
457  PangoGlyphItemIter cluster_iter;
458  gboolean have_cluster;
459  for (have_cluster = pango_glyph_item_iter_init_start(&cluster_iter,
460  run, utf8_word);
461  have_cluster && !bad_glyph;
462  have_cluster = pango_glyph_item_iter_next_cluster(&cluster_iter)) {
463  const int start_byte_index = cluster_iter.start_index;
464  const int end_byte_index = cluster_iter.end_index;
465  int start_glyph_index = cluster_iter.start_glyph;
466  int end_glyph_index = cluster_iter.end_glyph;
467  string cluster_text = string(utf8_word + start_byte_index,
468  end_byte_index - start_byte_index);
469  if (graphemes) graphemes->push_back(cluster_text);
470  if (IsUTF8Whitespace(cluster_text.c_str())) {
471  tlog(2, "Skipping whitespace\n");
472  continue;
473  }
474  if (TLOG_IS_ON(2)) {
475  printf("start_byte=%d end_byte=%d start_glyph=%d end_glyph=%d ",
476  start_byte_index, end_byte_index,
477  start_glyph_index, end_glyph_index);
478  }
479  for (int i = start_glyph_index,
480  step = (end_glyph_index > start_glyph_index) ? 1 : -1;
481  !bad_glyph && i != end_glyph_index; i+= step) {
482  const bool unknown_glyph =
483  (cluster_iter.glyph_item->glyphs->glyphs[i].glyph &
484  PANGO_GLYPH_UNKNOWN_FLAG);
485  const bool illegal_glyph =
486  (cluster_iter.glyph_item->glyphs->glyphs[i].glyph ==
487  dotted_circle_glyph);
488  bad_glyph = unknown_glyph || illegal_glyph;
489  if (TLOG_IS_ON(2)) {
490  printf("(%d=%d)", cluster_iter.glyph_item->glyphs->glyphs[i].glyph,
491  bad_glyph ? 1 : 0);
492  }
493  }
494  if (TLOG_IS_ON(2)) {
495  printf(" '%s'\n", cluster_text.c_str());
496  }
497  if (bad_glyph)
498  tlog(1, "Found illegal glyph!\n");
499  }
500  } while (!bad_glyph && pango_layout_iter_next_run(run_iter));
501 
502  pango_layout_iter_free(run_iter);
503  g_object_unref(context);
504  g_object_unref(layout);
505  if (bad_glyph && graphemes) graphemes->clear();
506  return !bad_glyph;
507 }
508 
509 
510 // ------------------------ FontUtils ------------------------------------
511 std::vector<string> FontUtils::available_fonts_; // cache list
512 
513 // Returns whether the specified font description is available in the fonts
514 // directory.
515 //
516 // The generated list of font families and faces includes "synthesized" font
517 // faces that are not truly loadable. Pango versions >=1.18 have a
518 // pango_font_face_is_synthesized method that can be used to prune the list.
519 // Until then, we are restricted to using a hack where we try to load the font
520 // from the font_map, and then check what we loaded to see if it has the
521 // description we expected. If it is not, then the font is deemed unavailable.
522 /* static */
523 bool FontUtils::IsAvailableFont(const char* input_query_desc,
524  string* best_match) {
525  string query_desc(input_query_desc);
526 #if (PANGO_VERSION <= 12005)
527  // Strip commas and any ' Medium' substring in the name.
528  query_desc.erase(std::remove(query_desc.begin(), query_desc.end(), ','),
529  query_desc.end());
530  const string kMediumStr = " Medium";
531  std::size_t found = query_desc.find(kMediumStr);
532  if (found != std::string::npos) {
533  query_desc.erase(found, kMediumStr.length());
534  }
535 #endif
536  PangoFontDescription *desc = pango_font_description_from_string(
537  query_desc.c_str());
538  PangoFont* selected_font = nullptr;
539  {
541  PangoFontMap* font_map = pango_cairo_font_map_get_default();
542  PangoContext* context = pango_context_new();
543  pango_context_set_font_map(context, font_map);
544  {
546  selected_font = pango_font_map_load_font(font_map, context, desc);
547  }
548  g_object_unref(context);
549  }
550  if (selected_font == nullptr) {
551  pango_font_description_free(desc);
552  return false;
553  }
554  PangoFontDescription* selected_desc = pango_font_describe(selected_font);
555 
556  bool equal = pango_font_description_equal(desc, selected_desc);
557  tlog(3, "query weight = %d \t selected weight =%d\n",
558  pango_font_description_get_weight(desc),
559  pango_font_description_get_weight(selected_desc));
560 
561  char* selected_desc_str = pango_font_description_to_string(selected_desc);
562  tlog(2, "query_desc: '%s' Selected: '%s'\n", query_desc.c_str(),
563  selected_desc_str);
564  if (!equal && best_match != nullptr) {
565  *best_match = selected_desc_str;
566  // Clip the ending ' 0' if there is one. It seems that, if there is no
567  // point size on the end of the fontname, then Pango always appends ' 0'.
568  int len = best_match->size();
569  if (len > 2 && best_match->at(len - 1) == '0' &&
570  best_match->at(len - 2) == ' ') {
571  *best_match = best_match->substr(0, len - 2);
572  }
573  }
574  g_free(selected_desc_str);
575  pango_font_description_free(selected_desc);
576  g_object_unref(selected_font);
577  pango_font_description_free(desc);
578  return equal;
579 }
580 
// Returns true when query is exactly (case-sensitively) one of the family
// names "Sans", "Serif" or "Monospace", which are left out of font listings.
static bool ShouldIgnoreFontFamilyName(const char* query) {
  static const char* const kSkippedFamilies[] = {"Sans", "Serif", "Monospace"};
  for (const char* skipped : kSkippedFamilies) {
    if (strcmp(skipped, query) == 0) {
      return true;
    }
  }
  return false;
}
591 
592 // Outputs description names of available fonts.
593 /* static */
594 const std::vector<string>& FontUtils::ListAvailableFonts() {
595  if (!available_fonts_.empty()) {
596  return available_fonts_;
597  }
598 #ifndef USE_STD_NAMESPACE
599  if (FLAGS_use_only_legacy_fonts) {
600  // Restrict view to list of fonts in legacy_fonts.h
601  tprintf("Using list of legacy fonts only\n");
602  const int kNumFontLists = 4;
603  for (int i = 0; i < kNumFontLists; ++i) {
604  for (int j = 0; kFontlists[i][j] != nullptr; ++j) {
605  available_fonts_.push_back(kFontlists[i][j]);
606  }
607  }
608  return available_fonts_;
609  }
610 #endif
611 
612  PangoFontFamily** families = 0;
613  int n_families = 0;
614  ListFontFamilies(&families, &n_families);
615  for (int i = 0; i < n_families; ++i) {
616  const char* family_name = pango_font_family_get_name(families[i]);
617  tlog(2, "Listing family %s\n", family_name);
618  if (ShouldIgnoreFontFamilyName(family_name)) {
619  continue;
620  }
621 
622  int n_faces;
623  PangoFontFace** faces = nullptr;
624  pango_font_family_list_faces(families[i], &faces, &n_faces);
625  for (int j = 0; j < n_faces; ++j) {
626  PangoFontDescription* desc = pango_font_face_describe(faces[j]);
627  char* desc_str = pango_font_description_to_string(desc);
628  if (IsAvailableFont(desc_str)) {
629  available_fonts_.push_back(desc_str);
630  }
631  pango_font_description_free(desc);
632  g_free(desc_str);
633  }
634  g_free(faces);
635  }
636  g_free(families);
637  std::sort(available_fonts_.begin(), available_fonts_.end());
638  return available_fonts_;
639 }
640 
641 
642 static void CharCoverageMapToBitmap(PangoCoverage* coverage,
643  std::vector<bool>* unichar_bitmap) {
644  const int kMinUnicodeValue = 33;
645  const int kMaxUnicodeValue = 0x10FFFF;
646  unichar_bitmap->resize(kMaxUnicodeValue + 1, false);
647  // Mark off characters that the font can render.
648  for (int i = kMinUnicodeValue; i <= kMaxUnicodeValue; ++i) {
649  if (IsInterchangeValid(i)) {
650  (*unichar_bitmap)[i]
651  = (pango_coverage_get(coverage, i) == PANGO_COVERAGE_EXACT);
652  }
653  }
654 }
655 
656 /* static */
657 void FontUtils::GetAllRenderableCharacters(std::vector<bool>* unichar_bitmap) {
658  const std::vector<string>& all_fonts = ListAvailableFonts();
659  return GetAllRenderableCharacters(all_fonts, unichar_bitmap);
660 }
661 
662 /* static */
663 void FontUtils::GetAllRenderableCharacters(const string& font_name,
664  std::vector<bool>* unichar_bitmap) {
665  PangoFontInfo font_info(font_name);
666  PangoCoverage* coverage =
667  pango_font_get_coverage(font_info.ToPangoFont(), nullptr);
668  CharCoverageMapToBitmap(coverage, unichar_bitmap);
669 }
670 
671 /* static */
672 void FontUtils::GetAllRenderableCharacters(const std::vector<string>& fonts,
673  std::vector<bool>* unichar_bitmap) {
674  // Form the union of coverage maps from the fonts
675  PangoCoverage* all_coverage = pango_coverage_new();
676  tlog(1, "Processing %u fonts\n", static_cast<unsigned>(fonts.size()));
677  for (unsigned i = 0; i < fonts.size(); ++i) {
678  PangoFontInfo font_info(fonts[i]);
679  PangoCoverage* coverage =
680  pango_font_get_coverage(font_info.ToPangoFont(), nullptr);
681  // Mark off characters that any font can render.
682  pango_coverage_max(all_coverage, coverage);
683  }
684  CharCoverageMapToBitmap(all_coverage, unichar_bitmap);
685  pango_coverage_unref(all_coverage);
686 }
687 
688 
689 // Utilities written to be backward compatible with StringRender
690 
691 /* static */
692 int FontUtils::FontScore(const std::unordered_map<char32, inT64>& ch_map,
693  const string& fontname, int* raw_score,
694  std::vector<bool>* ch_flags) {
695  PangoFontInfo font_info;
696  if (!font_info.ParseFontDescriptionName(fontname)) {
697  tprintf("ERROR: Could not parse %s\n", fontname.c_str());
698  }
699  PangoFont* font = font_info.ToPangoFont();
700  PangoCoverage* coverage = pango_font_get_coverage(font, nullptr);
701 
702  if (ch_flags) {
703  ch_flags->clear();
704  ch_flags->reserve(ch_map.size());
705  }
706  *raw_score = 0;
707  int ok_chars = 0;
708  for (std::unordered_map<char32, inT64>::const_iterator it = ch_map.begin();
709  it != ch_map.end(); ++it) {
710  bool covered = (IsWhitespace(it->first) ||
711  (pango_coverage_get(coverage, it->first)
712  == PANGO_COVERAGE_EXACT));
713  if (covered) {
714  ++(*raw_score);
715  ok_chars += it->second;
716  }
717  if (ch_flags) {
718  ch_flags->push_back(covered);
719  }
720  }
721  return ok_chars;
722 }
723 
724 
725 /* static */
727  const std::unordered_map<char32, inT64>& ch_map,
728  std::vector<std::pair<const char*, std::vector<bool> > >* fonts) {
729  const double kMinOKFraction = 0.99;
730  // Weighted fraction of characters that must be renderable in a font to make
731  // it OK even if the raw count is not good.
732  const double kMinWeightedFraction = 0.99995;
733 
734  fonts->clear();
735  std::vector<std::vector<bool> > font_flags;
736  std::vector<int> font_scores;
737  std::vector<int> raw_scores;
738  int most_ok_chars = 0;
739  int best_raw_score = 0;
740  const std::vector<string>& font_names = FontUtils::ListAvailableFonts();
741  for (unsigned i = 0; i < font_names.size(); ++i) {
742  std::vector<bool> ch_flags;
743  int raw_score = 0;
744  int ok_chars = FontScore(ch_map, font_names[i], &raw_score, &ch_flags);
745  most_ok_chars = MAX(ok_chars, most_ok_chars);
746  best_raw_score = MAX(raw_score, best_raw_score);
747 
748  font_flags.push_back(ch_flags);
749  font_scores.push_back(ok_chars);
750  raw_scores.push_back(raw_score);
751  }
752 
753  // Now select the fonts with a score above a threshold fraction
754  // of both the raw and weighted best scores. To prevent bogus fonts being
755  // selected for CJK, we require a high fraction (kMinOKFraction = 0.99) of
756  // BOTH weighted and raw scores.
757  // In low character-count scripts, the issue is more getting enough fonts,
758  // when only 1 or 2 might have all those rare dingbats etc in them, so we
759  // allow a font with a very high weighted (coverage) score
760  // (kMinWeightedFraction = 0.99995) to be used even if its raw score is poor.
761  int least_good_enough = static_cast<int>(most_ok_chars * kMinOKFraction);
762  int least_raw_enough = static_cast<int>(best_raw_score * kMinOKFraction);
763  int override_enough = static_cast<int>(most_ok_chars * kMinWeightedFraction);
764 
765  string font_list;
766  for (unsigned i = 0; i < font_names.size(); ++i) {
767  int score = font_scores[i];
768  int raw_score = raw_scores[i];
769  if ((score >= least_good_enough && raw_score >= least_raw_enough) ||
770  score >= override_enough) {
771  fonts->push_back(std::make_pair(font_names[i].c_str(), font_flags[i]));
772  tlog(1, "OK font %s = %.4f%%, raw = %d = %.2f%%\n",
773  font_names[i].c_str(),
774  100.0 * score / most_ok_chars,
775  raw_score, 100.0 * raw_score / best_raw_score);
776  font_list += font_names[i];
777  font_list += "\n";
778  } else if (score >= least_good_enough || raw_score >= least_raw_enough) {
779  tlog(1, "Runner-up font %s = %.4f%%, raw = %d = %.2f%%\n",
780  font_names[i].c_str(),
781  100.0 * score / most_ok_chars,
782  raw_score, 100.0 * raw_score / best_raw_score);
783  }
784  }
785  return font_list;
786 }
787 
788 /* static */
789 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
790  string* font_name, std::vector<string>* graphemes) {
791  return SelectFont(utf8_word, utf8_len, ListAvailableFonts(), font_name,
792  graphemes);
793 }
794 
795 /* static */
796 bool FontUtils::SelectFont(const char* utf8_word, const int utf8_len,
797  const std::vector<string>& all_fonts,
798  string* font_name, std::vector<string>* graphemes) {
799  if (font_name) font_name->clear();
800  if (graphemes) graphemes->clear();
801  for (unsigned i = 0; i < all_fonts.size(); ++i) {
802  PangoFontInfo font;
803  std::vector<string> found_graphemes;
804  ASSERT_HOST_MSG(font.ParseFontDescriptionName(all_fonts[i]),
805  "Could not parse font desc name %s\n",
806  all_fonts[i].c_str());
807  if (font.CanRenderString(utf8_word, utf8_len, &found_graphemes)) {
808  if (graphemes) graphemes->swap(found_graphemes);
809  if (font_name) *font_name = all_fonts[i];
810  return true;
811  }
812  }
813  return false;
814 }
815 
816 // PangoFontInfo is reinitialized, so clear the static list of fonts.
817 /* static */
818 void FontUtils::ReInit() { available_fonts_.clear(); }
819 
820 } // namespace tesseract
static const_iterator begin(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:200
static void WriteStringToFileOrDie(const string &str, const string &filename)
Definition: fileio.cpp:52
#define tlog(level,...)
Definition: tlog.h:33
#define ASSERT_HOST_MSG(x,...)
Definition: errcode.h:90
int DropUncoveredChars(string *utf8_text) const
static bool DeleteMatchingFiles(const char *pattern)
Definition: fileio.cpp:111
static const std::vector< string > & ListAvailableFonts()
static int FontScore(const std::unordered_map< char32, inT64 > &ch_map, const string &fontname, int *raw_score, std::vector< bool > *ch_flags)
STRING_PARAM_FLAG(fontconfig_tmpdir, "/tmp", "Overrides fontconfig default temporary dir")
static const_iterator end(const char *utf8_str, const int byte_length)
Definition: unichar.cpp:204
signed int char32
Definition: normstrngs.h:27
bool CanRenderString(const char *utf8_word, int len, std::vector< string > *graphemes) const
static string BestFonts(const std::unordered_map< char32, inT64 > &ch_map, std::vector< std::pair< const char *, std::vector< bool > > > *font_flag)
#define TLOG_IS_ON(level)
Definition: tlog.h:39
static void HardInitFontConfig(const string &fonts_dir, const string &cache_dir)
char * strcasestr(const char *haystack, const char *needle)
Locate a substring in a string, ignoring case.
Definition: strcasestr.cpp:43
#define tprintf(...)
Definition: tprintf.h:31
int utf8_len() const
Definition: unichar.cpp:186
BOOL_PARAM_FLAG(use_only_legacy_fonts, false, "Overrides --fonts_dir and sets the known universe of fonts to" "the list in legacy_fonts.h")
#define DISABLE_HEAP_LEAK_CHECK
Definition: util.h:63
static string JoinPath(const string &prefix, const string &suffix)
Definition: fileio.cpp:81
bool GetSpacingProperties(const string &utf8_char, int *x_bearing, int *x_advance) const
#define ASSERT_HOST(x)
Definition: errcode.h:84
static bool SelectFont(const char *utf8_word, const int utf8_len, string *font_name, std::vector< string > *graphemes)
bool CoversUTF8Text(const char *utf8_text, int byte_length) const
char * utf8_str() const
Definition: unichar.cpp:125
static void GetAllRenderableCharacters(std::vector< bool > *unichar_bitmap)
static bool IsAvailableFont(const char *font_desc)
bool IsWhitespace(const char32 ch)
Definition: normstrngs.cpp:178
#define MAX(x, y)
Definition: ndminx.h:24
const string & family_name() const
bool IsUTF8Whitespace(const char *text)
Definition: normstrngs.cpp:184
bool ParseFontDescriptionName(const string &name)
const int kDefaultResolution
bool IsInterchangeValid(const char32 ch)
Definition: normstrngs.cpp:216