tesseract  4.1.1
unicharset.cpp
Go to the documentation of this file.
1 // File: unicharset.cpp
3 // Description: Unicode character/ligature set class.
4 // Author: Thomas Kielbus
5 //
6 // (C) Copyright 2006, Google Inc.
7 // Licensed under the Apache License, Version 2.0 (the "License");
8 // you may not use this file except in compliance with the License.
9 // You may obtain a copy of the License at
10 // http://www.apache.org/licenses/LICENSE-2.0
11 // Unless required by applicable law or agreed to in writing, software
12 // distributed under the License is distributed on an "AS IS" BASIS,
13 // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
14 // See the License for the specific language governing permissions and
15 // limitations under the License.
16 //
18 
19 #include "unicharset.h"
20 
21 #include <algorithm>
22 #include <cassert>
23 #include <cstdio>
24 #include <cstring>
25 #include <iomanip> // for std::setw
26 #include <locale> // for std::locale::classic
27 #include <sstream> // for std::istringstream, std::ostringstream
28 
29 #include "params.h"
30 #include "serialis.h"
31 #include "tesscallback.h"
32 #include "unichar.h"
33 
34 // TODO(rays) Move UNICHARSET to tesseract namespace.
35 using tesseract::char32;
36 using tesseract::UNICHAR;
37 
// Delimiter character used in representing character fragments.
static constexpr char kSeparator = '|';
// Flag character used in representing 'natural' character fragments.
static constexpr char kNaturalFlag = 'n';

// Bit masks used to pack the boolean character properties into one integer.
static constexpr int ISALPHA_MASK = 1 << 0;
static constexpr int ISLOWER_MASK = 1 << 1;
static constexpr int ISUPPER_MASK = 1 << 2;
static constexpr int ISDIGIT_MASK = 1 << 3;
static constexpr int ISPUNCTUATION_MASK = 1 << 4;

// Y coordinate threshold that separates cap-height from x-height.
// TODO(rays) Bring the global definition down to the ccutil library level,
// so this constant is relative to some other constants.
static constexpr int kMeanlineThreshold = 220;
// Let C be the number of alpha chars for which all tops exceed
// kMeanlineThreshold, and X the number of alpha chars for which all
// tops are below kMeanlineThreshold. The unicharset "has x-height" if
// X > C * kMinXHeightFraction and C > X * kMinCapHeightFraction, or if more
// than half the alpha characters have upper or lower case.
constexpr double kMinXHeightFraction = 0.25;
constexpr double kMinCapHeightFraction = 0.05;
61 
62 /*static */
63 const char* UNICHARSET::kCustomLigatures[][2] = {
64  {"ct", "\uE003"}, // c + t -> U+E003
65  {"ſh", "\uE006"}, // long-s + h -> U+E006
66  {"ſi", "\uE007"}, // long-s + i -> U+E007
67  {"ſl", "\uE008"}, // long-s + l -> U+E008
68  {"ſſ", "\uE009"}, // long-s + long-s -> U+E009
69  {nullptr, nullptr}
70 };
71 
72 // List of mappings to make when ingesting strings from the outside.
73 // The substitutions clean up text that should exist for rendering of
74 // synthetic data, but not in the recognition set.
75 const char* UNICHARSET::kCleanupMaps[][2] = {
76  {"\u0640", ""}, // TATWEEL is deleted.
77  {"\ufb01", "fi"}, // fi ligature->fi pair.
78  {"\ufb02", "fl"}, // fl ligature->fl pair.
79  {nullptr, nullptr}};
80 
81 // List of strings for the SpecialUnicharCodes. Keep in sync with the enum.
83  " ",
84  "Joined",
85  "|Broken|0|1"
86 };
87 
88 const char* UNICHARSET::null_script = "NULL";
89 
90 UNICHARSET::UNICHAR_PROPERTIES::UNICHAR_PROPERTIES() {
91  Init();
92 }
93 
94 // Initialize all properties to sensible default values.
95 void UNICHARSET::UNICHAR_PROPERTIES::Init() {
96  isalpha = false;
97  islower = false;
98  isupper = false;
99  isdigit = false;
100  ispunctuation = false;
101  isngram = false;
102  enabled = false;
103  SetRangesOpen();
104  script_id = 0;
105  other_case = 0;
106  mirror = 0;
107  normed = "";
108  direction = UNICHARSET::U_LEFT_TO_RIGHT;
109  fragment = nullptr;
110 }
111 
112 // Sets all ranges wide open. Initialization default in case there are
113 // no useful values available.
114 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesOpen() {
115  min_bottom = 0;
116  max_bottom = UINT8_MAX;
117  min_top = 0;
118  max_top = UINT8_MAX;
119  width = 0.0f;
120  width_sd = 0.0f;
121  bearing = 0.0f;
122  bearing_sd = 0.0f;
123  advance = 0.0f;
124  advance_sd = 0.0f;
125 }
126 
127 // Sets all ranges to empty. Used before expanding with font-based data.
128 void UNICHARSET::UNICHAR_PROPERTIES::SetRangesEmpty() {
129  min_bottom = UINT8_MAX;
130  max_bottom = 0;
131  min_top = UINT8_MAX;
132  max_top = 0;
133  width = 0.0f;
134  width_sd = 0.0f;
135  bearing = 0.0f;
136  bearing_sd = 0.0f;
137  advance = 0.0f;
138  advance_sd = 0.0f;
139 }
140 
141 // Returns true if any of the top/bottom/width/bearing/advance ranges/stats
142 // is empty.
143 bool UNICHARSET::UNICHAR_PROPERTIES::AnyRangeEmpty() const {
144  return width == 0.0f || advance == 0.0f;
145 }
146 
147 // Expands the ranges with the ranges from the src properties.
148 void UNICHARSET::UNICHAR_PROPERTIES::ExpandRangesFrom(
149  const UNICHAR_PROPERTIES& src) {
150  UpdateRange(src.min_bottom, &min_bottom, &max_bottom);
151  UpdateRange(src.max_bottom, &min_bottom, &max_bottom);
152  UpdateRange(src.min_top, &min_top, &max_top);
153  UpdateRange(src.max_top, &min_top, &max_top);
154  if (src.width_sd > width_sd) {
155  width = src.width;
156  width_sd = src.width_sd;
157  }
158  if (src.bearing_sd > bearing_sd) {
159  bearing = src.bearing;
160  bearing_sd = src.bearing_sd;
161  }
162  if (src.advance_sd > advance_sd) {
163  advance = src.advance;
164  advance_sd = src.advance_sd;
165  }
166 }
167 
168 // Copies the properties from src into this.
169 void UNICHARSET::UNICHAR_PROPERTIES::CopyFrom(const UNICHAR_PROPERTIES& src) {
170  // Apart from the fragment, everything else can be done with a default copy.
171  CHAR_FRAGMENT* saved_fragment = fragment;
172  *this = src; // Bitwise copy.
173  fragment = saved_fragment;
174 }
175 
177  unichars(nullptr),
178  ids(),
179  size_used(0),
180  size_reserved(0),
181  script_table(nullptr),
182  script_table_size_used(0) {
183  clear();
184  for (int i = 0; i < SPECIAL_UNICHAR_CODES_COUNT; ++i) {
186  if (i == UNICHAR_JOINED)
187  set_isngram(i, true);
188  }
189 }
190 
192  clear();
193 }
194 
195 void UNICHARSET::reserve(int unichars_number) {
196  if (unichars_number > size_reserved) {
197  auto* unichars_new = new UNICHAR_SLOT[unichars_number];
198  for (int i = 0; i < size_used; ++i)
199  unichars_new[i] = unichars[i];
200  for (int j = size_used; j < unichars_number; ++j) {
201  unichars_new[j].properties.script_id = add_script(null_script);
202  }
203  delete[] unichars;
204  unichars = unichars_new;
205  size_reserved = unichars_number;
206  }
207 }
208 
210 UNICHARSET::unichar_to_id(const char* const unichar_repr) const {
211  std::string cleaned =
212  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
213  return ids.contains(cleaned.data(), cleaned.size())
214  ? ids.unichar_to_id(cleaned.data(), cleaned.size())
215  : INVALID_UNICHAR_ID;
216 }
217 
218 UNICHAR_ID UNICHARSET::unichar_to_id(const char* const unichar_repr,
219  int length) const {
220  assert(length > 0 && length <= UNICHAR_LEN);
221  std::string cleaned(unichar_repr, length);
222  if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
223  return ids.contains(cleaned.data(), cleaned.size())
224  ? ids.unichar_to_id(cleaned.data(), cleaned.size())
225  : INVALID_UNICHAR_ID;
226 }
227 
228 // Return the minimum number of bytes that matches a legal UNICHAR_ID,
229 // while leaving the rest of the string encodable. Returns 0 if the
230 // beginning of the string is not encodable.
231 // WARNING: this function now encodes the whole string for precision.
232 // Use encode_string in preference to repeatedly calling step.
233 int UNICHARSET::step(const char* str) const {
234  GenericVector<UNICHAR_ID> encoding;
235  GenericVector<char> lengths;
236  encode_string(str, true, &encoding, &lengths, nullptr);
237  if (encoding.empty() || encoding[0] == INVALID_UNICHAR_ID) return 0;
238  return lengths[0];
239 }
240 
241 // Return whether the given UTF-8 string is encodable with this UNICHARSET.
242 // If not encodable, write the first byte offset which cannot be converted
243 // into the second (return) argument.
244 bool UNICHARSET::encodable_string(const char *str,
245  int *first_bad_position) const {
246  GenericVector<UNICHAR_ID> encoding;
247  return encode_string(str, true, &encoding, nullptr, first_bad_position);
248 }
249 
250 // Encodes the given UTF-8 string with this UNICHARSET.
251 // Returns true if the encoding succeeds completely, false if there is at
252 // least one INVALID_UNICHAR_ID in the returned encoding, but in this case
253 // the rest of the string is still encoded.
254 // If lengths is not nullptr, then it is filled with the corresponding
255 // byte length of each encoded UNICHAR_ID.
256 // WARNING: Caller must guarantee that str has already been cleaned of codes
257 // that do not belong in the unicharset, or encoding may fail.
258 // Use CleanupString to perform the cleaning.
259 bool UNICHARSET::encode_string(const char* str, bool give_up_on_failure,
260  GenericVector<UNICHAR_ID>* encoding,
261  GenericVector<char>* lengths,
262  int* encoded_length) const {
263  GenericVector<UNICHAR_ID> working_encoding;
264  GenericVector<char> working_lengths;
265  GenericVector<char> best_lengths;
266  encoding->truncate(0); // Just in case str is empty.
267  int str_length = strlen(str);
268  int str_pos = 0;
269  bool perfect = true;
270  while (str_pos < str_length) {
271  encode_string(str, str_pos, str_length, &working_encoding, &working_lengths,
272  &str_pos, encoding, &best_lengths);
273  if (str_pos < str_length) {
274  // This is a non-match. Skip one utf-8 character.
275  perfect = false;
276  if (give_up_on_failure) break;
277  int step = UNICHAR::utf8_step(str + str_pos);
278  if (step == 0) step = 1;
279  encoding->push_back(INVALID_UNICHAR_ID);
280  best_lengths.push_back(step);
281  str_pos += step;
282  working_encoding = *encoding;
283  working_lengths = best_lengths;
284  }
285  }
286  if (lengths != nullptr) *lengths = best_lengths;
287  if (encoded_length != nullptr) *encoded_length = str_pos;
288  return perfect;
289 }
290 
291 const char* UNICHARSET::id_to_unichar(UNICHAR_ID id) const {
292  if (id == INVALID_UNICHAR_ID) {
293  return INVALID_UNICHAR;
294  }
295  ASSERT_HOST(id < this->size());
296  return unichars[id].representation;
297 }
298 
300  if (id == INVALID_UNICHAR_ID) {
301  return INVALID_UNICHAR;
302  }
303  ASSERT_HOST(id < this->size());
304  // Resolve from the kCustomLigatures table if this is a private encoding.
305  if (get_isprivate(id)) {
306  const char* ch = id_to_unichar(id);
307  for (int i = 0; kCustomLigatures[i][0] != nullptr; ++i) {
308  if (!strcmp(ch, kCustomLigatures[i][1])) {
309  return kCustomLigatures[i][0];
310  }
311  }
312  }
313  // Otherwise return the stored representation.
314  return unichars[id].representation;
315 }
316 
317 // Return a STRING that reformats the utf8 str into the str followed
318 // by its hex unicodes.
320  STRING result = str;
321  result += " [";
322  int step = 1;
323  // Chop into unicodes and code each as hex.
324  for (int i = 0; str[i] != '\0'; i += step) {
325  char hex[sizeof(int) * 2 + 1];
326  step = UNICHAR::utf8_step(str + i);
327  if (step == 0) {
328  step = 1;
329  sprintf(hex, "%x", str[i]);
330  } else {
331  UNICHAR ch(str + i, step);
332  sprintf(hex, "%x", ch.first_uni());
333  }
334  result += hex;
335  result += " ";
336  }
337  result += "]";
338  return result;
339 }
340 
341 // Return a STRING containing debug information on the unichar, including
342 // the id_to_unichar, its hex unicodes and the properties.
344  if (id == INVALID_UNICHAR_ID) return STRING(id_to_unichar(id));
345  const CHAR_FRAGMENT *fragment = this->get_fragment(id);
346  if (fragment) {
347  return fragment->to_string();
348  }
349  const char* str = id_to_unichar(id);
350  STRING result = debug_utf8_str(str);
351  // Append a for lower alpha, A for upper alpha, and x if alpha but neither.
352  if (get_isalpha(id)) {
353  if (get_islower(id))
354  result += "a";
355  else if (get_isupper(id))
356  result += "A";
357  else
358  result += "x";
359  }
360  // Append 0 if a digit.
361  if (get_isdigit(id)) {
362  result += "0";
363  }
364  // Append p is a punctuation symbol.
365  if (get_ispunctuation(id)) {
366  result += "p";
367  }
368  return result;
369 }
370 
371 // Sets the normed_ids vector from the normed string. normed_ids is not
372 // stored in the file, and needs to be set when the UNICHARSET is loaded.
374  unichars[unichar_id].properties.normed_ids.truncate(0);
375  if (unichar_id == UNICHAR_SPACE && id_to_unichar(unichar_id)[0] == ' ') {
376  unichars[unichar_id].properties.normed_ids.push_back(UNICHAR_SPACE);
377  } else if (!encode_string(unichars[unichar_id].properties.normed.string(),
378  true, &unichars[unichar_id].properties.normed_ids,
379  nullptr, nullptr)) {
380  unichars[unichar_id].properties.normed_ids.truncate(0);
381  unichars[unichar_id].properties.normed_ids.push_back(unichar_id);
382  }
383 }
384 
385 // Returns whether the unichar id represents a unicode value in the private use
386 // area. We use this range only internally to represent uncommon ligatures
387 // (eg. 'ct') that do not have regular unicode values.
388 bool UNICHARSET::get_isprivate(UNICHAR_ID unichar_id) const {
389  UNICHAR uc(id_to_unichar(unichar_id), -1);
390  int uni = uc.first_uni();
391  return (uni >= 0xE000 && uni <= 0xF8FF);
392 }
393 
394 
395 // Sets all ranges to empty, so they can be expanded to set the values.
397  for (int id = 0; id < size_used; ++id) {
398  unichars[id].properties.SetRangesEmpty();
399  }
400 }
401 
402 // Sets all the properties for this unicharset given a src unicharset with
403 // everything set. The unicharsets don't have to be the same, and graphemes
404 // are correctly accounted for.
406  const UNICHARSET& src) {
407  for (int ch = start_index; ch < size_used; ++ch) {
408  const char* utf8 = id_to_unichar(ch);
409  UNICHAR_PROPERTIES properties;
410  if (src.GetStrProperties(utf8, &properties)) {
411  // Setup the script_id, other_case, and mirror properly.
412  const char* script = src.get_script_from_script_id(properties.script_id);
413  properties.script_id = add_script(script);
414  const char* other_case = src.id_to_unichar(properties.other_case);
415  if (contains_unichar(other_case)) {
416  properties.other_case = unichar_to_id(other_case);
417  } else {
418  properties.other_case = ch;
419  }
420  const char* mirror_str = src.id_to_unichar(properties.mirror);
421  if (contains_unichar(mirror_str)) {
422  properties.mirror = unichar_to_id(mirror_str);
423  } else {
424  properties.mirror = ch;
425  }
426  unichars[ch].properties.CopyFrom(properties);
427  set_normed_ids(ch);
428  }
429  }
430 }
431 
432 // Expands the tops and bottoms and widths for this unicharset given a
433 // src unicharset with ranges in it. The unicharsets don't have to be the
434 // same, and graphemes are correctly accounted for.
436  for (int ch = 0; ch < size_used; ++ch) {
437  const char* utf8 = id_to_unichar(ch);
438  UNICHAR_PROPERTIES properties;
439  if (src.GetStrProperties(utf8, &properties)) {
440  // Expand just the ranges from properties.
441  unichars[ch].properties.ExpandRangesFrom(properties);
442  }
443  }
444 }
445 
446 // Makes this a copy of src. Clears this completely first, so the automatic
447 // ids will not be present in this if not in src. Does NOT reorder the set!
449  clear();
450  for (int ch = 0; ch < src.size_used; ++ch) {
451  const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
452  const char* utf8 = src.id_to_unichar(ch);
454  unichars[ch].properties.ExpandRangesFrom(src_props);
455  }
456  // Set properties, including mirror and other_case, WITHOUT reordering
457  // the unicharset.
459 }
460 
461 // For each id in src, if it does not occur in this, add it, as in
462 // SetPropertiesFromOther, otherwise expand the ranges, as in
463 // ExpandRangesFromOther.
465  int initial_used = size_used;
466  for (int ch = 0; ch < src.size_used; ++ch) {
467  const UNICHAR_PROPERTIES& src_props = src.unichars[ch].properties;
468  const char* utf8 = src.id_to_unichar(ch);
469  int id = size_used;
470  if (contains_unichar(utf8)) {
471  id = unichar_to_id(utf8);
472  // Just expand current ranges.
473  unichars[id].properties.ExpandRangesFrom(src_props);
474  } else {
476  unichars[id].properties.SetRangesEmpty();
477  }
478  }
479  // Set properties, including mirror and other_case, WITHOUT reordering
480  // the unicharset.
481  PartialSetPropertiesFromOther(initial_used, src);
482 }
483 
484 // Returns true if the acceptable ranges of the tops of the characters do
485 // not overlap, making their x-height calculations distinct.
487  int overlap = std::min(unichars[id1].properties.max_top,
488  unichars[id2].properties.max_top) -
489  std::max(unichars[id1].properties.min_top,
490  unichars[id2].properties.min_top);
491  return overlap <= 0;
492 }
493 
494 // Internal recursive version of encode_string above.
495 // Seeks to encode the given string as a sequence of UNICHAR_IDs such that
496 // each UNICHAR_ID uses the least possible part of the utf8 str.
497 // It does this by depth-first tail recursion on increasing length matches
498 // to the UNICHARSET, saving the first encountered result that encodes the
499 // maximum total length of str. It stops on a failure to encode to make
500 // the overall process of encoding a partially failed string more efficient.
501 // See unicharset.h for definition of the args.
502 void UNICHARSET::encode_string(const char* str, int str_index, int str_length,
503  GenericVector<UNICHAR_ID>* encoding,
504  GenericVector<char>* lengths,
505  int* best_total_length,
506  GenericVector<UNICHAR_ID>* best_encoding,
507  GenericVector<char>* best_lengths) const {
508  if (str_index > *best_total_length) {
509  // This is the best result so far.
510  *best_total_length = str_index;
511  *best_encoding = *encoding;
512  if (best_lengths != nullptr)
513  *best_lengths = *lengths;
514  }
515  if (str_index == str_length) return;
516  int encoding_index = encoding->size();
517  // Find the length of the first matching unicharset member.
518  int length = ids.minmatch(str + str_index);
519  if (length == 0 || str_index + length > str_length) return;
520  do {
521  if (ids.contains(str + str_index, length)) {
522  // Successful encoding so far.
523  UNICHAR_ID id = ids.unichar_to_id(str + str_index, length);
524  encoding->push_back(id);
525  lengths->push_back(length);
526  encode_string(str, str_index + length, str_length, encoding, lengths,
527  best_total_length, best_encoding, best_lengths);
528  if (*best_total_length == str_length)
529  return; // Tail recursion success!
530  // Failed with that length, truncate back and try again.
531  encoding->truncate(encoding_index);
532  lengths->truncate(encoding_index);
533  }
534  int step = UNICHAR::utf8_step(str + str_index + length);
535  if (step == 0) step = 1;
536  length += step;
537  } while (length <= UNICHAR_LEN && str_index + length <= str_length);
538 }
539 
540 // Gets the properties for a grapheme string, combining properties for
541 // multiple characters in a meaningful way where possible.
542 // Returns false if no valid match was found in the unicharset.
543 // NOTE that script_id, mirror, and other_case refer to this unicharset on
544 // return and will need translation if the target unicharset is different.
545 bool UNICHARSET::GetStrProperties(const char* utf8_str,
546  UNICHAR_PROPERTIES* props) const {
547  props->Init();
548  props->SetRangesEmpty();
549  int total_unicodes = 0;
550  GenericVector<UNICHAR_ID> encoding;
551  if (!encode_string(utf8_str, true, &encoding, nullptr, nullptr))
552  return false; // Some part was invalid.
553  for (int i = 0; i < encoding.size(); ++i) {
554  int id = encoding[i];
555  const UNICHAR_PROPERTIES& src_props = unichars[id].properties;
556  // Logical OR all the bools.
557  if (src_props.isalpha) props->isalpha = true;
558  if (src_props.islower) props->islower = true;
559  if (src_props.isupper) props->isupper = true;
560  if (src_props.isdigit) props->isdigit = true;
561  if (src_props.ispunctuation) props->ispunctuation = true;
562  if (src_props.isngram) props->isngram = true;
563  if (src_props.enabled) props->enabled = true;
564  // Min/max the tops/bottoms.
565  UpdateRange(src_props.min_bottom, &props->min_bottom, &props->max_bottom);
566  UpdateRange(src_props.max_bottom, &props->min_bottom, &props->max_bottom);
567  UpdateRange(src_props.min_top, &props->min_top, &props->max_top);
568  UpdateRange(src_props.max_top, &props->min_top, &props->max_top);
569  float bearing = props->advance + src_props.bearing;
570  if (total_unicodes == 0 || bearing < props->bearing) {
571  props->bearing = bearing;
572  props->bearing_sd = props->advance_sd + src_props.bearing_sd;
573  }
574  props->advance += src_props.advance;
575  props->advance_sd += src_props.advance_sd;
576  // With a single width, just use the widths stored in the unicharset.
577  props->width = src_props.width;
578  props->width_sd = src_props.width_sd;
579  // Use the first script id, other_case, mirror, direction.
580  // Note that these will need translation, except direction.
581  if (total_unicodes == 0) {
582  props->script_id = src_props.script_id;
583  props->other_case = src_props.other_case;
584  props->mirror = src_props.mirror;
585  props->direction = src_props.direction;
586  }
587  // The normed string for the compound character is the concatenation of
588  // the normed versions of the individual characters.
589  props->normed += src_props.normed;
590  ++total_unicodes;
591  }
592  if (total_unicodes > 1) {
593  // Estimate the total widths from the advance - bearing.
594  props->width = props->advance - props->bearing;
595  props->width_sd = props->advance_sd + props->bearing_sd;
596  }
597  return total_unicodes > 0;
598 }
599 
600 // TODO(rays) clean-up the order of functions to match unicharset.h.
601 
602 unsigned int UNICHARSET::get_properties(UNICHAR_ID id) const {
603  unsigned int properties = 0;
604  if (this->get_isalpha(id))
605  properties |= ISALPHA_MASK;
606  if (this->get_islower(id))
607  properties |= ISLOWER_MASK;
608  if (this->get_isupper(id))
609  properties |= ISUPPER_MASK;
610  if (this->get_isdigit(id))
611  properties |= ISDIGIT_MASK;
612  if (this->get_ispunctuation(id))
613  properties |= ISPUNCTUATION_MASK;
614  return properties;
615 }
616 
618  if (this->get_isupper(id)) return 'A';
619  if (this->get_islower(id)) return 'a';
620  if (this->get_isalpha(id)) return 'x';
621  if (this->get_isdigit(id)) return '0';
622  if (this->get_ispunctuation(id)) return 'p';
623  return 0;
624 }
625 
626 void UNICHARSET::unichar_insert(const char* const unichar_repr,
627  OldUncleanUnichars old_style) {
628  if (old_style == OldUncleanUnichars::kTrue) old_style_included_ = true;
629  std::string cleaned =
630  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
631  if (!cleaned.empty() && !ids.contains(cleaned.data(), cleaned.size())) {
632  const char* str = cleaned.c_str();
633  GenericVector<int> encoding;
634  if (!old_style_included_ &&
635  encode_string(str, true, &encoding, nullptr, nullptr))
636  return;
637  if (size_used == size_reserved) {
638  if (size_used == 0)
639  reserve(8);
640  else
641  reserve(2 * size_used);
642  }
643  int index = 0;
644  do {
645  if (index >= UNICHAR_LEN) {
646  fprintf(stderr, "Utf8 buffer too big, size>%d for %s\n", UNICHAR_LEN,
647  unichar_repr);
648  return;
649  }
650  unichars[size_used].representation[index++] = *str++;
651  } while (*str != '\0');
652  unichars[size_used].representation[index] = '\0';
653  this->set_script(size_used, null_script);
654  // If the given unichar_repr represents a fragmented character, set
655  // fragment property to a pointer to CHAR_FRAGMENT class instance with
656  // information parsed from the unichar representation. Use the script
657  // of the base unichar for the fragmented character if possible.
658  CHAR_FRAGMENT* frag =
659  CHAR_FRAGMENT::parse_from_string(unichars[size_used].representation);
660  this->unichars[size_used].properties.fragment = frag;
661  if (frag != nullptr && this->contains_unichar(frag->get_unichar())) {
662  this->unichars[size_used].properties.script_id =
663  this->get_script(frag->get_unichar());
664  }
665  this->unichars[size_used].properties.enabled = true;
666  ids.insert(unichars[size_used].representation, size_used);
667  ++size_used;
668  }
669 }
670 
671 bool UNICHARSET::contains_unichar(const char* const unichar_repr) const {
672  std::string cleaned =
673  old_style_included_ ? unichar_repr : CleanupString(unichar_repr);
674  return ids.contains(cleaned.data(), cleaned.size());
675 }
676 
677 bool UNICHARSET::contains_unichar(const char* const unichar_repr,
678  int length) const {
679  if (length == 0) {
680  return false;
681  }
682  std::string cleaned(unichar_repr, length);
683  if (!old_style_included_) cleaned = CleanupString(unichar_repr, length);
684  return ids.contains(cleaned.data(), cleaned.size());
685 }
686 
687 bool UNICHARSET::eq(UNICHAR_ID unichar_id,
688  const char* const unichar_repr) const {
689  return strcmp(this->id_to_unichar(unichar_id), unichar_repr) == 0;
690 }
691 
693  const int kFileBufSize = 1024;
694  char buffer[kFileBufSize + 1];
695  snprintf(buffer, kFileBufSize, "%d\n", this->size());
696  *str = buffer;
697  for (UNICHAR_ID id = 0; id < this->size(); ++id) {
698  int min_bottom, max_bottom, min_top, max_top;
699  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
700  float width, width_sd;
701  get_width_stats(id, &width, &width_sd);
702  float bearing, bearing_sd;
703  get_bearing_stats(id, &bearing, &bearing_sd);
704  float advance, advance_sd;
705  get_advance_stats(id, &advance, &advance_sd);
706  unsigned int properties = this->get_properties(id);
707  if (strcmp(this->id_to_unichar(id), " ") == 0) {
708  snprintf(buffer, kFileBufSize, "%s %x %s %d\n", "NULL", properties,
709  this->get_script_from_script_id(this->get_script(id)),
710  this->get_other_case(id));
711  *str += buffer;
712  } else {
713  std::ostringstream stream;
714  stream.imbue(std::locale::classic());
715  stream << this->id_to_unichar(id) << ' ' << properties << ' ' <<
716  min_bottom << ',' << max_bottom << ',' <<
717  min_top << ',' << max_top << ',' <<
718  width << ',' << width_sd << ',' <<
719  bearing << ',' << bearing_sd << ',' <<
720  advance << ',' << advance_sd << ' ' <<
721  this->get_script_from_script_id(this->get_script(id)) << ' ' <<
722  this->get_other_case(id) << ' ' <<
723  this->get_direction(id) << ' ' <<
724  this->get_mirror(id) << ' ' <<
725  this->get_normed_unichar(id) << "\t# " <<
726  this->debug_str(id).string() << '\n';
727  *str += stream.str().c_str();
728  }
729  }
730  return true;
731 }
732 
733 // TODO(rays) Replace with TFile everywhere.
735  public:
736  InMemoryFilePointer(const char *memory, int mem_size)
737  : memory_(memory), fgets_ptr_(memory), mem_size_(mem_size) { }
738 
739  char *fgets(char *orig_dst, int size) {
740  const char *src_end = memory_ + mem_size_;
741  char *dst_end = orig_dst + size - 1;
742  if (size < 1) {
743  return fgets_ptr_ < src_end ? orig_dst : nullptr;
744  }
745 
746  char *dst = orig_dst;
747  char ch = '^';
748  while (fgets_ptr_ < src_end && dst < dst_end && ch != '\n') {
749  ch = *dst++ = *fgets_ptr_++;
750  }
751  *dst = 0;
752  return (dst == orig_dst) ? nullptr : orig_dst;
753  }
754 
755  private:
756  const char *memory_;
757  const char *fgets_ptr_;
758  const int mem_size_;
759 };
760 
762  const char *memory, int mem_size, bool skip_fragments) {
763  InMemoryFilePointer mem_fp(memory, mem_size);
766  bool success = load_via_fgets(fgets_cb, skip_fragments);
767  delete fgets_cb;
768  return success;
769 }
770 
772  public:
773  LocalFilePointer(FILE *stream) : fp_(stream) {}
774  char *fgets(char *dst, int size) {
775  return ::fgets(dst, size, fp_);
776  }
777  private:
778  FILE *fp_;
779 };
780 
781 bool UNICHARSET::load_from_file(FILE *file, bool skip_fragments) {
782  LocalFilePointer lfp(file);
785  bool success = load_via_fgets(fgets_cb, skip_fragments);
786  delete fgets_cb;
787  return success;
788 }
789 
790 bool UNICHARSET::load_from_file(tesseract::TFile *file, bool skip_fragments) {
793  bool success = load_via_fgets(fgets_cb, skip_fragments);
794  delete fgets_cb;
795  return success;
796 }
797 
798 bool UNICHARSET::load_via_fgets(
800  bool skip_fragments) {
801  int unicharset_size;
802  char buffer[256];
803 
804  this->clear();
805  if (fgets_cb->Run(buffer, sizeof(buffer)) == nullptr ||
806  sscanf(buffer, "%d", &unicharset_size) != 1) {
807  return false;
808  }
809  this->reserve(unicharset_size);
810  for (UNICHAR_ID id = 0; id < unicharset_size; ++id) {
811  char unichar[256];
812  unsigned int properties;
813  char script[64];
814 
815  strncpy(script, null_script, sizeof(script) - 1);
816  int min_bottom = 0;
817  int max_bottom = UINT8_MAX;
818  int min_top = 0;
819  int max_top = UINT8_MAX;
820  float width = 0.0f;
821  float width_sd = 0.0f;
822  float bearing = 0.0f;
823  float bearing_sd = 0.0f;
824  float advance = 0.0f;
825  float advance_sd = 0.0f;
826  // TODO(eger): check that this default it ok
827  // after enabling BiDi iterator for Arabic.
828  int direction = UNICHARSET::U_LEFT_TO_RIGHT;
829  UNICHAR_ID other_case = unicharset_size;
830  UNICHAR_ID mirror = unicharset_size;
831  if (fgets_cb->Run(buffer, sizeof (buffer)) == nullptr) {
832  return false;
833  }
834  char normed[64];
835  normed[0] = '\0';
836  std::istringstream stream(buffer);
837  stream.imbue(std::locale::classic());
838  // 标 1 0,255,0,255,0,0,0,0,0,0 Han 68 0 68 标 # 标 [6807 ]x
839  //stream.flags(std::ios::hex);
840  stream >> std::setw(255) >> unichar >> std::hex >> properties >> std::dec;
841  //stream.flags(std::ios::dec);
842  if (stream.fail()) {
843  fprintf(stderr, "%s:%u failed\n", __FILE__, __LINE__);
844  return false;
845  }
846  auto position = stream.tellg();
847  stream.seekg(position);
848  char c1, c2, c3, c4, c5, c6, c7, c8, c9;
849  stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >> c4 >>
850  width >> c5 >>width_sd >> c6 >> bearing >> c7 >> bearing_sd >> c8 >>
851  advance >> c9 >> advance_sd >> std::setw(63) >> script >>
852  other_case >> direction >> mirror >> std::setw(63) >> normed;
853  if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
854  c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
855  stream.clear();
856  stream.seekg(position);
857  stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >> c4 >>
858  width >> c5 >>width_sd >> c6 >> bearing >> c7 >> bearing_sd >> c8 >>
859  advance >> c9 >> advance_sd >> std::setw(63) >> script >>
860  other_case >> direction >> mirror;
861  if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',' || c4 != ',' ||
862  c5 != ',' || c6 != ',' || c7 != ',' || c8 != ',' || c9 != ',') {
863  stream.clear();
864  stream.seekg(position);
865  stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >>
866  std::setw(63) >> script >> other_case >> direction >> mirror;
867  if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
868  stream.clear();
869  stream.seekg(position);
870  stream >> min_bottom >> c1 >> max_bottom >> c2 >> min_top >> c3 >> max_top >>
871  std::setw(63) >> script >> other_case;
872  if (stream.fail() || c1 != ',' || c2 != ',' || c3 != ',') {
873  stream.clear();
874  stream.seekg(position);
875  stream >> std::setw(63) >> script >> other_case;
876  if (stream.fail()) {
877  stream.clear();
878  stream.seekg(position);
879  stream >> std::setw(63) >> script;
880  }
881  }
882  }
883  }
884  }
885 
886  // Skip fragments if needed.
887  CHAR_FRAGMENT *frag = nullptr;
888  if (skip_fragments && (frag = CHAR_FRAGMENT::parse_from_string(unichar))) {
889  int num_pieces = frag->get_total();
890  delete frag;
891  // Skip multi-element fragments, but keep singles like UNICHAR_BROKEN in.
892  if (num_pieces > 1)
893  continue;
894  }
895  // Insert unichar into unicharset and set its properties.
896  if (strcmp(unichar, "NULL") == 0)
897  this->unichar_insert(" ");
898  else
900 
901  this->set_isalpha(id, properties & ISALPHA_MASK);
902  this->set_islower(id, properties & ISLOWER_MASK);
903  this->set_isupper(id, properties & ISUPPER_MASK);
904  this->set_isdigit(id, properties & ISDIGIT_MASK);
905  this->set_ispunctuation(id, properties & ISPUNCTUATION_MASK);
906  this->set_isngram(id, false);
907  this->set_script(id, script);
908  this->unichars[id].properties.enabled = true;
909  this->set_top_bottom(id, min_bottom, max_bottom, min_top, max_top);
910  this->set_width_stats(id, width, width_sd);
911  this->set_bearing_stats(id, bearing, bearing_sd);
912  this->set_advance_stats(id, advance, advance_sd);
913  this->set_direction(id, static_cast<UNICHARSET::Direction>(direction));
914  this->set_other_case(
915  id, (other_case < unicharset_size) ? other_case : id);
916  this->set_mirror(id, (mirror < unicharset_size) ? mirror : id);
917  this->set_normed(id, normed[0] != '\0' ? normed : unichar);
918  }
919  post_load_setup();
920  return true;
921 }
922 
923 // Sets up internal data after loading the file, based on the char
924 // properties. Called from load_from_file, but also needs to be run
925 // during set_unicharset_properties.
927  // Number of alpha chars with the case property minus those without,
928  // in order to determine that half the alpha chars have case.
929  int net_case_alphas = 0;
930  int x_height_alphas = 0;
931  int cap_height_alphas = 0;
932  top_bottom_set_ = false;
933  for (UNICHAR_ID id = 0; id < size_used; ++id) {
934  int min_bottom = 0;
935  int max_bottom = UINT8_MAX;
936  int min_top = 0;
937  int max_top = UINT8_MAX;
938  get_top_bottom(id, &min_bottom, &max_bottom, &min_top, &max_top);
939  if (min_top > 0)
940  top_bottom_set_ = true;
941  if (get_isalpha(id)) {
942  if (get_islower(id) || get_isupper(id))
943  ++net_case_alphas;
944  else
945  --net_case_alphas;
946  if (min_top < kMeanlineThreshold && max_top < kMeanlineThreshold)
947  ++x_height_alphas;
948  else if (min_top > kMeanlineThreshold && max_top > kMeanlineThreshold)
949  ++cap_height_alphas;
950  }
951  set_normed_ids(id);
952  }
953 
954  script_has_upper_lower_ = net_case_alphas > 0;
955  script_has_xheight_ = script_has_upper_lower_ ||
956  (x_height_alphas > cap_height_alphas * kMinXHeightFraction &&
957  cap_height_alphas > x_height_alphas * kMinCapHeightFraction);
958 
959  null_sid_ = get_script_id_from_name(null_script);
960  ASSERT_HOST(null_sid_ == 0);
961  common_sid_ = get_script_id_from_name("Common");
962  latin_sid_ = get_script_id_from_name("Latin");
963  cyrillic_sid_ = get_script_id_from_name("Cyrillic");
964  greek_sid_ = get_script_id_from_name("Greek");
965  han_sid_ = get_script_id_from_name("Han");
966  hiragana_sid_ = get_script_id_from_name("Hiragana");
967  katakana_sid_ = get_script_id_from_name("Katakana");
968  thai_sid_ = get_script_id_from_name("Thai");
969  hangul_sid_ = get_script_id_from_name("Hangul");
970 
971  // Compute default script. Use the highest-counting alpha script, that is
972  // not the common script, as that still contains some "alphas".
973  int* script_counts = new int[script_table_size_used];
974  memset(script_counts, 0, sizeof(*script_counts) * script_table_size_used);
975  for (int id = 0; id < size_used; ++id) {
976  if (get_isalpha(id)) {
977  ++script_counts[get_script(id)];
978  }
979  }
980  default_sid_ = 0;
981  for (int s = 1; s < script_table_size_used; ++s) {
982  if (script_counts[s] > script_counts[default_sid_] && s != common_sid_)
983  default_sid_ = s;
984  }
985  delete [] script_counts;
986 }
987 
988 // Returns true if right_to_left scripts are significant in the unicharset,
989 // but without being so sensitive that "universal" unicharsets containing
990 // characters from many scripts, like orientation and script detection,
991 // look like they are right_to_left.
// NOTE(review): this listing omits source lines 992 and 999. The index at the
// end of the page gives the signature on line 992 as
// `bool major_right_to_left() const` (presumably a UNICHARSET member - TODO
// confirm). Line 999 sits inside the right-to-left condition below and its
// content is not visible here; restore both lines from the real
// unicharset.cpp before relying on this text.
993  int ltr_count = 0;
994  int rtl_count = 0;
// Tally every unichar id in use according to its direction property.
995  for (int id = 0; id < size_used; ++id) {
996  int dir = get_direction(id);
997  if (dir == UNICHARSET::U_LEFT_TO_RIGHT) ltr_count++;
998  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
1000  dir == UNICHARSET::U_ARABIC_NUMBER) rtl_count++;
1001  }
// Right-to-left wins only when it strictly outnumbers left-to-right.
1002  return rtl_count > ltr_count;
1003 }
1004 
1005 // Set a whitelist and/or blacklist of characters to recognize.
1006 // An empty or nullptr whitelist enables everything (minus any blacklist).
1007 // An empty or nullptr blacklist disables nothing.
1008 // An empty or nullptr blacklist has no effect.
1009 void UNICHARSET::set_black_and_whitelist(const char* blacklist,
1010  const char* whitelist,
1011  const char* unblacklist) {
1012  bool def_enabled = whitelist == nullptr || whitelist[0] == '\0';
1013  // Set everything to default
1014  for (int ch = 0; ch < size_used; ++ch)
1015  unichars[ch].properties.enabled = def_enabled;
1016  if (!def_enabled) {
1017  // Enable the whitelist.
1018  GenericVector<UNICHAR_ID> encoding;
1019  encode_string(whitelist, false, &encoding, nullptr, nullptr);
1020  for (int i = 0; i < encoding.size(); ++i) {
1021  if (encoding[i] != INVALID_UNICHAR_ID)
1022  unichars[encoding[i]].properties.enabled = true;
1023  }
1024  }
1025  if (blacklist != nullptr && blacklist[0] != '\0') {
1026  // Disable the blacklist.
1027  GenericVector<UNICHAR_ID> encoding;
1028  encode_string(blacklist, false, &encoding, nullptr, nullptr);
1029  for (int i = 0; i < encoding.size(); ++i) {
1030  if (encoding[i] != INVALID_UNICHAR_ID)
1031  unichars[encoding[i]].properties.enabled = false;
1032  }
1033  }
1034  if (unblacklist != nullptr && unblacklist[0] != '\0') {
1035  // Re-enable the unblacklist.
1036  GenericVector<UNICHAR_ID> encoding;
1037  encode_string(unblacklist, false, &encoding, nullptr, nullptr);
1038  for (int i = 0; i < encoding.size(); ++i) {
1039  if (encoding[i] != INVALID_UNICHAR_ID)
1040  unichars[encoding[i]].properties.enabled = true;
1041  }
1042  }
1043 }
1044 
1045 // Returns true if there are any repeated unicodes in the normalized
1046 // text of any unichar-id in the unicharset.
// NOTE(review): this listing omits source lines 1047 and 1049. The index at
// the end of the page gives the signature on line 1047 as
// `bool AnyRepeatedUnicodes() const` (presumably a UNICHARSET member - TODO
// confirm). Line 1049 falls between the start_id initialisation and the loop
// and may adjust start_id; its content is not visible here. Restore both
// lines from the real unicharset.cpp before relying on this text.
1048  int start_id = 0;
1050  for (int id = start_id; id < size_used; ++id) {
1051  // Convert to unicodes.
1052  std::vector<char32> unicodes = UNICHAR::UTF8ToUTF32(get_normed_unichar(id));
// Only neighbouring code points are compared, so "repeated" here means the
// same code point twice in a row within one normalized unichar.
1053  for (size_t u = 1; u < unicodes.size(); ++u) {
1054  if (unicodes[u - 1] == unicodes[u]) return true;
1055  }
1056  }
1057  return false;
1058 }
1059 
1060 int UNICHARSET::add_script(const char* script) {
1061  for (int i = 0; i < script_table_size_used; ++i) {
1062  if (strcmp(script, script_table[i]) == 0)
1063  return i;
1064  }
1065  if (script_table_size_reserved == 0) {
1066  script_table_size_reserved = 8;
1067  script_table = new char*[script_table_size_reserved];
1068  } else if (script_table_size_used >= script_table_size_reserved) {
1069  assert(script_table_size_used == script_table_size_reserved);
1070  script_table_size_reserved += script_table_size_reserved;
1071  char** new_script_table = new char*[script_table_size_reserved];
1072  memcpy(new_script_table, script_table,
1073  script_table_size_used * sizeof(char*));
1074  delete[] script_table;
1075  script_table = new_script_table;
1076  }
1077  script_table[script_table_size_used] = new char[strlen(script) + 1];
1078  strcpy(script_table[script_table_size_used], script);
1079  return script_table_size_used++;
1080 }
1081 
1082 // Returns the string that represents a fragment
1083 // with the given unichar, pos and total.
1084 STRING CHAR_FRAGMENT::to_string(const char *unichar, int pos, int total,
1085  bool natural) {
1086  if (total == 1) return STRING(unichar);
1087  STRING result = "";
1088  result += kSeparator;
1089  result += unichar;
1090  char buffer[kMaxLen];
1091  snprintf(buffer, kMaxLen, "%c%d%c%d", kSeparator, pos,
1092  natural ? kNaturalFlag : kSeparator, total);
1093  result += buffer;
1094  return result;
1095 }
1096 
1098  const char *ptr = string;
1099  int len = strlen(string);
1100  if (len < kMinLen || *ptr != kSeparator) {
1101  return nullptr; // this string can not represent a fragment
1102  }
1103  ptr++; // move to the next character
1104  int step = 0;
1105  while ((ptr + step) < (string + len) && *(ptr + step) != kSeparator) {
1106  step += UNICHAR::utf8_step(ptr + step);
1107  }
1108  if (step == 0 || step > UNICHAR_LEN) {
1109  return nullptr; // no character for unichar or the character is too long
1110  }
1111  char unichar[UNICHAR_LEN + 1];
1112  strncpy(unichar, ptr, step);
1113  unichar[step] = '\0'; // null terminate unichar
1114  ptr += step; // move to the next fragment separator
1115  int pos = 0;
1116  int total = 0;
1117  bool natural = false;
1118  char *end_ptr = nullptr;
1119  for (int i = 0; i < 2; i++) {
1120  if (ptr > string + len || *ptr != kSeparator) {
1121  if (i == 1 && *ptr == kNaturalFlag)
1122  natural = true;
1123  else
1124  return nullptr; // Failed to parse fragment representation.
1125  }
1126  ptr++; // move to the next character
1127  i == 0 ? pos = static_cast<int>(strtol(ptr, &end_ptr, 10))
1128  : total = static_cast<int>(strtol(ptr, &end_ptr, 10));
1129  ptr = end_ptr;
1130  }
1131  if (ptr != string + len) {
1132  return nullptr; // malformed fragment representation
1133  }
1134  auto *fragment = new CHAR_FRAGMENT();
1135  fragment->set_all(unichar, pos, total, natural);
1136  return fragment;
1137 }
1138 
1139 int UNICHARSET::get_script_id_from_name(const char* script_name) const {
1140  for (int i = 0; i < script_table_size_used; ++i) {
1141  if (strcmp(script_name, script_table[i]) == 0)
1142  return i;
1143  }
1144  return 0; // 0 is always the null_script
1145 }
1146 
1147 // Removes/replaces content that belongs in rendered text, but not in the
1148 // unicharset.
1149 /* static */
1150 std::string UNICHARSET::CleanupString(const char* utf8_str, size_t length) {
1151  std::string result;
1152  result.reserve(length);
1153  char ch;
1154  while ((ch = *utf8_str) != '\0' && length-- > 0) {
1155  int key_index = 0;
1156  const char* key;
1157  while ((key = kCleanupMaps[key_index][0]) != nullptr) {
1158  int match = 0;
1159  while (key[match] != '\0' && key[match] == utf8_str[match]) ++match;
1160  if (key[match] == '\0') {
1161  utf8_str += match;
1162  break;
1163  }
1164  ++key_index;
1165  }
1166  if (key == nullptr) {
1167  result.push_back(ch);
1168  ++utf8_str;
1169  } else {
1170  result.append(kCleanupMaps[key_index][1]);
1171  }
1172  }
1173  return result;
1174 }
const char * get_script_from_script_id(int id) const
Definition: unicharset.h:854
int UNICHAR_ID
Definition: unichar.h:34
unsigned int get_properties(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:602
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
Definition: unicharset.h:582
bool empty() const
Definition: genericvector.h:91
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
void AppendOtherUnicharset(const UNICHARSET &src)
Definition: unicharset.cpp:464
bool save_to_string(STRING *str) const
Definition: unicharset.cpp:692
int get_script_id_from_name(const char *script_name) const
void unichar_insert_backwards_compatible(const char *const unichar_repr)
Definition: unicharset.h:269
static const int kMinLen
Definition: unicharset.h:51
void CopyFrom(const UNICHARSET &src)
Definition: unicharset.cpp:448
int size() const
Definition: unicharset.h:341
const char * get_unichar() const
Definition: unicharset.h:70
bool get_ispunctuation(UNICHAR_ID unichar_id) const
Definition: unicharset.h:519
void set_isupper(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:441
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
Definition: unicharset.cpp:626
void ExpandRangesFromOther(const UNICHARSET &src)
Definition: unicharset.cpp:435
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:663
void clear()
Definition: unicharset.h:306
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:498
OldUncleanUnichars
Definition: unicharset.h:43
void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, float bearing_sd)
Definition: unicharset.h:623
char * fgets(char *dst, int size)
Definition: unicharset.cpp:774
void set_isngram(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:456
bool get_isalpha(UNICHAR_ID unichar_id) const
Definition: unicharset.h:491
const double kMinXHeightFraction
Definition: unicharset.cpp:59
const char * c_str() const
Definition: strngs.cpp:205
static CHAR_FRAGMENT * parse_from_string(const char *str)
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
Definition: unicharset.h:467
const double kMinCapHeightFraction
Definition: unicharset.cpp:60
virtual R Run(A1, A2)=0
const char * string() const
Definition: strngs.cpp:194
char get_chartype(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:617
signed int char32
Definition: unichar.h:51
void set_ranges_empty()
Definition: unicharset.cpp:396
UNICHAR_ID unichar_to_id(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:34
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
Definition: unicharset.h:683
int get_total() const
Definition: unicharset.h:72
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
Definition: unicharset.cpp:687
void truncate(int size)
int minmatch(const char *const unichar_repr) const
Definition: unicharmap.cpp:100
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
Definition: unicharset.h:613
static const int kMaxLen
Definition: unicharset.h:53
void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd)
Definition: unicharset.h:607
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
Definition: unicharset.cpp:405
bool get_isprivate(UNICHAR_ID unichar_id) const
Definition: unicharset.cpp:388
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
static TESS_API const char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
Definition: unicharset.h:153
void set_isalpha(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:431
#define UNICHAR_LEN
Definition: unichar.h:30
void set_advance_stats(UNICHAR_ID unichar_id, float advance, float advance_sd)
Definition: unicharset.h:640
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:451
void post_load_setup()
Definition: unicharset.cpp:926
static STRING debug_utf8_str(const char *str)
Definition: unicharset.cpp:319
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
char * fgets(char *orig_dst, int size)
Definition: unicharset.cpp:739
void UpdateRange(const T1 &x, T2 *lower_bound, T2 *upper_bound)
Definition: helpers.h:120
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
STRING debug_str(UNICHAR_ID id) const
Definition: unicharset.cpp:343
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:299
STRING to_string() const
Definition: unicharset.h:79
bool has_special_codes() const
Definition: unicharset.h:722
bool contains_unichar(const char *const unichar_repr) const
Definition: unicharset.cpp:671
void reserve(int unichars_number)
Definition: unicharset.cpp:195
void set_normed(UNICHAR_ID unichar_id, const char *normed)
Definition: unicharset.h:482
int first_uni() const
Definition: unichar.cpp:98
char * FGets(char *buffer, int buffer_size)
Definition: serialis.cpp:249
bool AnyRepeatedUnicodes() const
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:690
Definition: strngs.h:45
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
bool contains(const char *const unichar_repr, int length) const
Definition: unicharmap.cpp:79
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:568
bool encodable_string(const char *str, int *first_bad_position) const
Definition: unicharset.cpp:244
int add_script(const char *script)
void set_normed_ids(UNICHAR_ID unichar_id)
Definition: unicharset.cpp:373
LocalFilePointer(FILE *stream)
Definition: unicharset.cpp:773
static TESS_API const char * kCustomLigatures[][2]
Definition: unicharset.h:150
void set_script(UNICHAR_ID unichar_id, const char *value)
Definition: unicharset.h:462
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
Definition: unicharset.h:630
void set_islower(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:436
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
Definition: unicharset.h:596
int push_back(T object)
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:246
static STRING to_string(const char *unichar, int pos, int total, bool natural)
int size() const
Definition: genericvector.h:72
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:259
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
Definition: unicharset.h:477
bool load_from_inmemory_file(const char *const memory, int mem_size, bool skip_fragments)
Definition: unicharset.cpp:761
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
Definition: unicharset.h:472
#define ASSERT_HOST(x)
Definition: errcode.h:88
int step(const char *str) const
Definition: unicharset.cpp:233
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:697
void insert(const char *const unichar_repr, UNICHAR_ID id)
Definition: unicharmap.cpp:56
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
Definition: unicharset.h:828
void set_isdigit(UNICHAR_ID unichar_id, bool value)
Definition: unicharset.h:446
bool load_from_file(const char *const filename, bool skip_fragments)
Definition: unicharset.h:388
bool major_right_to_left() const
Definition: unicharset.cpp:992
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
Definition: unicharset.cpp:486
InMemoryFilePointer(const char *memory, int mem_size)
Definition: unicharset.cpp:736