tesseract  4.1.1
ratngs.cpp
Go to the documentation of this file.
1 /**********************************************************************
2  * File: ratngs.cpp (Formerly ratings.c)
3  * Description: Code to manipulate the BLOB_CHOICE and WERD_CHOICE classes.
4  * Author: Ray Smith
5  * Created: Thu Apr 23 13:23:29 BST 1992
6  *
7  * (C) Copyright 1992, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 
21 #ifdef HAVE_CONFIG_H
22 #include "config_auto.h"
23 #endif
24 
25 #include "ratngs.h"
26 
27 #include <algorithm>
28 #include <string>
29 #include "blobs.h"
30 #include "callcpp.h"
31 #include "genericvector.h"
32 #include "matrix.h"
33 #include "normalis.h" // kBlnBaselineOffset.
34 #include "unicharset.h"
35 
37 
40 
// NOTE(review): no use of kBadRating is visible in this file; presumably it is
// the rating given to a choice by make_bad() - confirm in ratngs.h.
41 const float WERD_CHOICE::kBadRating = 100000.0;
42 // Min offset in baseline-normalized coords to make a character a subscript.
43 const int kMinSubscriptOffset = 20;
44 // Min offset in baseline-normalized coords to make a character a superscript.
45 const int kMinSuperscriptOffset = 20;
46 // Max y of bottom of a drop-cap blob.
47 const int kMaxDropCapBottom = -128;
48 // Max fraction of x-height to use as denominator in measuring x-height overlap.
49 const double kMaxOverlapDenominator = 0.125;
50 // Min fraction of x-height range that should be in agreement for matching
51 // x-heights.
52 const double kMinXHeightMatch = 0.5;
53 // Max tolerance on baseline position as a fraction of x-height for matching
54 // baselines.
55 const double kMaxBaselineDrift = 0.0625;
56 
// Display names for the permuter types, one string per type.
57 static const char kPermuterTypeNoPerm[] = "None";
58 static const char kPermuterTypePuncPerm[] = "Punctuation";
59 static const char kPermuterTypeTopPerm[] = "Top Choice";
60 static const char kPermuterTypeLowerPerm[] = "Top Lower Case";
61 static const char kPermuterTypeUpperPerm[] = "Top Upper Case";
62 static const char kPermuterTypeNgramPerm[] = "Ngram";
63 static const char kPermuterTypeNumberPerm[] = "Number";
64 static const char kPermuterTypeUserPatPerm[] = "User Pattern";
65 static const char kPermuterTypeSysDawgPerm[] = "System Dictionary";
66 static const char kPermuterTypeDocDawgPerm[] = "Document Dictionary";
67 static const char kPermuterTypeUserDawgPerm[] = "User Dictionary";
68 static const char kPermuterTypeFreqDawgPerm[] = "Frequent Words Dictionary";
69 static const char kPermuterTypeCompoundPerm[] = "Compound";
70 
// Lookup table used by WERD_CHOICE::permuter_name(): it is indexed directly by
// the numeric permuter value, so the position of each entry (shown in the
// trailing comments) must not change without updating the callers.
71 static const char * const kPermuterTypeNames[] = {
72  kPermuterTypeNoPerm, // 0
73  kPermuterTypePuncPerm, // 1
74  kPermuterTypeTopPerm, // 2
75  kPermuterTypeLowerPerm, // 3
76  kPermuterTypeUpperPerm, // 4
77  kPermuterTypeNgramPerm, // 5
78  kPermuterTypeNumberPerm, // 6
79  kPermuterTypeUserPatPerm, // 7
80  kPermuterTypeSysDawgPerm, // 8
81  kPermuterTypeDocDawgPerm, // 9
82  kPermuterTypeUserDawgPerm, // 10
83  kPermuterTypeFreqDawgPerm, // 11
84  kPermuterTypeCompoundPerm // 12
85 };
86 
92 BLOB_CHOICE::BLOB_CHOICE(UNICHAR_ID src_unichar_id, // character id
93  float src_rating, // rating
94  float src_cert, // certainty
95  int src_script_id, // script
96  float min_xheight, // min xheight allowed
97  float max_xheight, // max xheight by this char
98  float yshift, // yshift out of position
99  BlobChoiceClassifier c) { // adapted match or other
100  unichar_id_ = src_unichar_id;
101  rating_ = src_rating;
102  certainty_ = src_cert;
103  fontinfo_id_ = -1;
104  fontinfo_id2_ = -1;
105  script_id_ = src_script_id;
106  min_xheight_ = min_xheight;
107  max_xheight_ = max_xheight;
108  yshift_ = yshift;
109  classifier_ = c;
110 }
111 
118  unichar_id_ = other.unichar_id();
119  rating_ = other.rating();
120  certainty_ = other.certainty();
121  fontinfo_id_ = other.fontinfo_id();
122  fontinfo_id2_ = other.fontinfo_id2();
123  script_id_ = other.script_id();
124  matrix_cell_ = other.matrix_cell_;
125  min_xheight_ = other.min_xheight_;
126  max_xheight_ = other.max_xheight_;
127  yshift_ = other.yshift();
128  classifier_ = other.classifier_;
129 #ifndef DISABLED_LEGACY_ENGINE
130  fonts_ = other.fonts_;
131 #endif // ndef DISABLED_LEGACY_ENGINE
132 }
133 
134 // Copy assignment operator.
135 BLOB_CHOICE& BLOB_CHOICE::operator=(const BLOB_CHOICE& other) {
136  ELIST_LINK::operator=(other);
137  unichar_id_ = other.unichar_id();
138  rating_ = other.rating();
139  certainty_ = other.certainty();
140  fontinfo_id_ = other.fontinfo_id();
141  fontinfo_id2_ = other.fontinfo_id2();
142  script_id_ = other.script_id();
143  matrix_cell_ = other.matrix_cell_;
144  min_xheight_ = other.min_xheight_;
145  max_xheight_ = other.max_xheight_;
146  yshift_ = other.yshift();
147  classifier_ = other.classifier_;
148 #ifndef DISABLED_LEGACY_ENGINE
149  fonts_ = other.fonts_;
150 #endif // ndef DISABLED_LEGACY_ENGINE
151  return *this;
152 }
153 
154 // Returns true if *this and other agree on the baseline and x-height
155 // to within some tolerance based on a given estimate of the x-height.
156 bool BLOB_CHOICE::PosAndSizeAgree(const BLOB_CHOICE& other, float x_height,
157  bool debug) const {
158  double baseline_diff = fabs(yshift() - other.yshift());
159  if (baseline_diff > kMaxBaselineDrift * x_height) {
160  if (debug) {
161  tprintf("Baseline diff %g for %d v %d\n",
162  baseline_diff, unichar_id_, other.unichar_id_);
163  }
164  return false;
165  }
166  double this_range = max_xheight() - min_xheight();
167  double other_range = other.max_xheight() - other.min_xheight();
168  double denominator = ClipToRange(std::min(this_range, other_range),
169  1.0, kMaxOverlapDenominator * x_height);
170  double overlap = std::min(max_xheight(), other.max_xheight()) -
171  std::max(min_xheight(), other.min_xheight());
172  overlap /= denominator;
173  if (debug) {
174  tprintf("PosAndSize for %d v %d: bl diff = %g, ranges %g, %g / %g ->%g\n",
175  unichar_id_, other.unichar_id_, baseline_diff,
176  this_range, other_range, denominator, overlap);
177  }
178 
179  return overlap >= kMinXHeightMatch;
180 }
181 
182 // Helper to find the BLOB_CHOICE in the bc_list that matches the given
183 // unichar_id, or nullptr if there is no match.
185  BLOB_CHOICE_LIST* bc_list) {
186  // Find the corresponding best BLOB_CHOICE.
187  BLOB_CHOICE_IT choice_it(bc_list);
188  for (choice_it.mark_cycle_pt(); !choice_it.cycled_list();
189  choice_it.forward()) {
190  BLOB_CHOICE* choice = choice_it.data();
191  if (choice->unichar_id() == char_id) {
192  return choice;
193  }
194  }
195  return nullptr;
196 }
197 
198 const char *WERD_CHOICE::permuter_name(uint8_t permuter) {
199  return kPermuterTypeNames[permuter];
200 }
201 
202 namespace tesseract {
203 
204 const char *ScriptPosToString(enum ScriptPos script_pos) {
205  switch (script_pos) {
206  case SP_NORMAL: return "NORM";
207  case SP_SUBSCRIPT: return "SUB";
208  case SP_SUPERSCRIPT: return "SUPER";
209  case SP_DROPCAP: return "DROPC";
210  }
211  return "SP_UNKNOWN";
212 }
213 
214 } // namespace tesseract.
215 
222 WERD_CHOICE::WERD_CHOICE(const char *src_string,
223  const UNICHARSET &unicharset)
224  : unicharset_(&unicharset){
225  GenericVector<UNICHAR_ID> encoding;
226  GenericVector<char> lengths;
227  std::string cleaned = unicharset.CleanupString(src_string);
228  if (unicharset.encode_string(cleaned.c_str(), true, &encoding, &lengths,
229  nullptr)) {
230  lengths.push_back('\0');
231  STRING src_lengths = &lengths[0];
232  this->init(cleaned.c_str(), src_lengths.string(), 0.0, 0.0, NO_PERM);
233  } else { // There must have been an invalid unichar in the string.
234  this->init(8);
235  this->make_bad();
236  }
237 }
238 
249 void WERD_CHOICE::init(const char *src_string,
250  const char *src_lengths,
251  float src_rating,
252  float src_certainty,
253  uint8_t src_permuter) {
254  int src_string_len = strlen(src_string);
255  if (src_string_len == 0) {
256  this->init(8);
257  } else {
258  this->init(src_lengths ? strlen(src_lengths): src_string_len);
259  length_ = reserved_;
260  int offset = 0;
261  for (int i = 0; i < length_; ++i) {
262  int unichar_length = src_lengths ? src_lengths[i] : 1;
263  unichar_ids_[i] =
264  unicharset_->unichar_to_id(src_string+offset, unichar_length);
265  state_[i] = 1;
266  certainties_[i] = src_certainty;
267  offset += unichar_length;
268  }
269  }
270  adjust_factor_ = 1.0f;
271  rating_ = src_rating;
272  certainty_ = src_certainty;
273  permuter_ = src_permuter;
274  dangerous_ambig_found_ = false;
275 }
276 
281  delete[] unichar_ids_;
282  delete[] script_pos_;
283  delete[] state_;
284  delete[] certainties_;
285 }
286 
287 const char *WERD_CHOICE::permuter_name() const {
288  return kPermuterTypeNames[permuter_];
289 }
290 
291 // Returns the BLOB_CHOICE_LIST corresponding to the given index in the word,
292 // taken from the appropriate cell in the ratings MATRIX.
293 // Borrowed pointer, so do not delete.
294 BLOB_CHOICE_LIST* WERD_CHOICE::blob_choices(int index, MATRIX* ratings) const {
295  MATRIX_COORD coord = MatrixCoord(index);
296  BLOB_CHOICE_LIST* result = ratings->get(coord.col, coord.row);
297  if (result == nullptr) {
298  result = new BLOB_CHOICE_LIST;
299  ratings->put(coord.col, coord.row, result);
300  }
301  return result;
302 }
303 
304 // Returns the MATRIX_COORD corresponding to the location in the ratings
305 // MATRIX for the given index into the word.
307  int col = 0;
308  for (int i = 0; i < index; ++i)
309  col += state_[i];
310  int row = col + state_[index] - 1;
311  return MATRIX_COORD(col, row);
312 }
313 
314 // Sets the entries for the given index from the BLOB_CHOICE, assuming
315 // unit fragment lengths, but setting the state for this index to blob_count.
316 void WERD_CHOICE::set_blob_choice(int index, int blob_count,
317  const BLOB_CHOICE* blob_choice) {
318  unichar_ids_[index] = blob_choice->unichar_id();
319  script_pos_[index] = tesseract::SP_NORMAL;
320  state_[index] = blob_count;
321  certainties_[index] = blob_choice->certainty();
322 }
323 
324 
331  for (int i = 0; i < length_; ++i) {
332  if (unichar_ids_[i] == unichar_id) {
333  return true;
334  }
335  }
336  return false;
337 }
338 
346 void WERD_CHOICE::remove_unichar_ids(int start, int num) {
347  ASSERT_HOST(start >= 0 && start + num <= length_);
348  // Accumulate the states to account for the merged blobs.
349  for (int i = 0; i < num; ++i) {
350  if (start > 0)
351  state_[start - 1] += state_[start + i];
352  else if (start + num < length_)
353  state_[start + num] += state_[start + i];
354  }
355  for (int i = start; i + num < length_; ++i) {
356  unichar_ids_[i] = unichar_ids_[i + num];
357  script_pos_[i] = script_pos_[i + num];
358  state_[i] = state_[i + num];
359  certainties_[i] = certainties_[i + num];
360  }
361  length_ -= num;
362 }
363 
370  for (int i = 0; i < length_ / 2; ++i) {
371  UNICHAR_ID tmp_id = unichar_ids_[i];
372  unichar_ids_[i] = unicharset_->get_mirror(unichar_ids_[length_-1-i]);
373  unichar_ids_[length_-1-i] = unicharset_->get_mirror(tmp_id);
374  }
375  if (length_ % 2 != 0) {
376  unichar_ids_[length_/2] = unicharset_->get_mirror(unichar_ids_[length_/2]);
377  }
378 }
379 
387 void WERD_CHOICE::punct_stripped(int *start, int *end) const {
388  *start = 0;
389  *end = length() - 1;
390  while (*start < length() &&
391  unicharset()->get_ispunctuation(unichar_id(*start))) {
392  (*start)++;
393  }
394  while (*end > -1 &&
395  unicharset()->get_ispunctuation(unichar_id(*end))) {
396  (*end)--;
397  }
398  (*end)++;
399 }
400 
401 void WERD_CHOICE::GetNonSuperscriptSpan(int *pstart, int *pend) const {
402  int end = length();
403  while (end > 0 &&
404  unicharset_->get_isdigit(unichar_ids_[end - 1]) &&
406  end--;
407  }
408  int start = 0;
409  while (start < end &&
410  unicharset_->get_isdigit(unichar_ids_[start]) &&
412  start++;
413  }
414  *pstart = start;
415  *pend = end;
416 }
417 
418 WERD_CHOICE WERD_CHOICE::shallow_copy(int start, int end) const {
419  ASSERT_HOST(start >= 0 && start <= length_);
420  ASSERT_HOST(end >= 0 && end <= length_);
421  if (end < start) { end = start; }
422  WERD_CHOICE retval(unicharset_, end - start);
423  for (int i = start; i < end; i++) {
425  unichar_ids_[i], state_[i], 0.0f, certainties_[i]);
426  }
427  return retval;
428 }
429 
436  int i;
437  for (i = 0; i < length_; ++i) {
438  UNICHARSET::Direction dir = unicharset_->get_direction(unichar_ids_[i]);
439  if (dir == UNICHARSET::U_RIGHT_TO_LEFT ||
441  return true;
442  }
443  }
444  return false;
445 }
446 
454  STRING *word_lengths_str) const {
455  *word_str = "";
456  if (word_lengths_str != nullptr) *word_lengths_str = "";
457  for (int i = 0; i < length_; ++i) {
458  const char *ch = unicharset_->id_to_unichar_ext(unichar_ids_[i]);
459  *word_str += ch;
460  if (word_lengths_str != nullptr) {
461  *word_lengths_str += strlen(ch);
462  }
463  }
464 }
465 
473  UNICHAR_ID unichar_id, int blob_count,
474  float rating, float certainty) {
475  if (length_ == reserved_) {
476  this->double_the_size();
477  }
478  this->append_unichar_id_space_allocated(unichar_id, blob_count,
479  rating, certainty);
480 }
481 
490  ASSERT_HOST(unicharset_ == second.unicharset_);
491  while (reserved_ < length_ + second.length()) {
492  this->double_the_size();
493  }
494  const UNICHAR_ID *other_unichar_ids = second.unichar_ids();
495  for (int i = 0; i < second.length(); ++i) {
496  unichar_ids_[length_ + i] = other_unichar_ids[i];
497  state_[length_ + i] = second.state_[i];
498  certainties_[length_ + i] = second.certainties_[i];
499  script_pos_[length_ + i] = second.BlobPosition(i);
500  }
501  length_ += second.length();
502  if (second.adjust_factor_ > adjust_factor_)
503  adjust_factor_ = second.adjust_factor_;
504  rating_ += second.rating(); // add ratings
505  if (second.certainty() < certainty_) // take min
506  certainty_ = second.certainty();
507  if (second.dangerous_ambig_found_)
508  dangerous_ambig_found_ = true;
509  if (permuter_ == NO_PERM) {
510  permuter_ = second.permuter();
511  } else if (second.permuter() != NO_PERM &&
512  second.permuter() != permuter_) {
513  permuter_ = COMPOUND_PERM;
514  }
515  return *this;
516 }
517 
518 
526  while (reserved_ < source.length()) {
527  this->double_the_size();
528  }
529 
530  unicharset_ = source.unicharset_;
531  const UNICHAR_ID *other_unichar_ids = source.unichar_ids();
532  for (int i = 0; i < source.length(); ++i) {
533  unichar_ids_[i] = other_unichar_ids[i];
534  state_[i] = source.state_[i];
535  certainties_[i] = source.certainties_[i];
536  script_pos_[i] = source.BlobPosition(i);
537  }
538  length_ = source.length();
539  adjust_factor_ = source.adjust_factor_;
540  rating_ = source.rating();
541  certainty_ = source.certainty();
542  min_x_height_ = source.min_x_height();
543  max_x_height_ = source.max_x_height();
544  permuter_ = source.permuter();
545  dangerous_ambig_found_ = source.dangerous_ambig_found_;
546  return *this;
547 }
548 
549 // Sets up the script_pos_ member using the blobs_list to get the bln
550 // bounding boxes, *this to get the unichars, and this->unicharset
551 // to get the target positions. If small_caps is true, sub/super are not
552 // considered, but dropcaps are.
553 // NOTE: blobs_list should be the chopped_word blobs. (Fully segmented.)
554 void WERD_CHOICE::SetScriptPositions(bool small_caps, TWERD* word, int debug) {
555  // Initialize to normal.
556  for (int i = 0; i < length_; ++i)
557  script_pos_[i] = tesseract::SP_NORMAL;
558  if (word->blobs.empty() || word->NumBlobs() != TotalOfStates()) {
559  return;
560  }
561 
562  int position_counts[4] = { 0, 0, 0, 0 };
563 
564  int chunk_index = 0;
565  for (int blob_index = 0; blob_index < length_; ++blob_index, ++chunk_index) {
566  TBLOB* tblob = word->blobs[chunk_index];
567  int uni_id = unichar_id(blob_index);
568  TBOX blob_box = tblob->bounding_box();
569  if (state_ != nullptr) {
570  for (int i = 1; i < state_[blob_index]; ++i) {
571  ++chunk_index;
572  tblob = word->blobs[chunk_index];
573  blob_box += tblob->bounding_box();
574  }
575  }
576  script_pos_[blob_index] = ScriptPositionOf(false, *unicharset_, blob_box,
577  uni_id);
578  if (small_caps && script_pos_[blob_index] != tesseract::SP_DROPCAP) {
579  script_pos_[blob_index] = tesseract::SP_NORMAL;
580  }
581  position_counts[script_pos_[blob_index]]++;
582  }
583  // If almost everything looks like a superscript or subscript,
584  // we most likely just got the baseline wrong.
585  if (position_counts[tesseract::SP_SUBSCRIPT] > 0.75 * length_ ||
586  position_counts[tesseract::SP_SUPERSCRIPT] > 0.75 * length_) {
587  if (debug >= 2) {
588  tprintf("Most characters of %s are subscript or superscript.\n"
589  "That seems wrong, so I'll assume we got the baseline wrong\n",
590  unichar_string().string());
591  }
592  for (int i = 0; i < length_; i++) {
593  ScriptPos sp = script_pos_[i];
595  position_counts[sp]--;
596  position_counts[tesseract::SP_NORMAL]++;
597  script_pos_[i] = tesseract::SP_NORMAL;
598  }
599  }
600  }
601 
602  if ((debug >= 1 && position_counts[tesseract::SP_NORMAL] < length_) ||
603  debug >= 2) {
604  tprintf("SetScriptPosition on %s\n", unichar_string().string());
605  int chunk_index = 0;
606  for (int blob_index = 0; blob_index < length_; ++blob_index) {
607  if (debug >= 2 || script_pos_[blob_index] != tesseract::SP_NORMAL) {
608  TBLOB* tblob = word->blobs[chunk_index];
609  ScriptPositionOf(true, *unicharset_, tblob->bounding_box(),
610  unichar_id(blob_index));
611  }
612  chunk_index += state_ != nullptr ? state_[blob_index] : 1;
613  }
614  }
615 }
616 // Sets the script_pos_ member from some source positions with a given length.
618  int length) {
619  ASSERT_HOST(length == length_);
620  if (positions != script_pos_) {
621  delete [] script_pos_;
622  script_pos_ = new ScriptPos[length];
623  memcpy(script_pos_, positions, sizeof(positions[0]) * length);
624  }
625 }
626 // Sets all the script_pos_ positions to the given position.
628  for (int i = 0; i < length_; ++i)
629  script_pos_[i] = position;
630 }
631 
632 /* static */
634  const UNICHARSET& unicharset,
635  const TBOX& blob_box,
636  UNICHAR_ID unichar_id) {
638  int top = blob_box.top();
639  int bottom = blob_box.bottom();
640  int min_bottom, max_bottom, min_top, max_top;
642  &min_bottom, &max_bottom,
643  &min_top, &max_top);
644 
645  int sub_thresh_top = min_top - kMinSubscriptOffset;
646  int sub_thresh_bot = kBlnBaselineOffset - kMinSubscriptOffset;
647  int sup_thresh_bot = max_bottom + kMinSuperscriptOffset;
648  if (bottom <= kMaxDropCapBottom) {
649  retval = tesseract::SP_DROPCAP;
650  } else if (top < sub_thresh_top && bottom < sub_thresh_bot) {
651  retval = tesseract::SP_SUBSCRIPT;
652  } else if (bottom > sup_thresh_bot) {
653  retval = tesseract::SP_SUPERSCRIPT;
654  }
655 
656  if (print_debug) {
657  const char *pos = ScriptPosToString(retval);
658  tprintf("%s Character %s[bot:%d top: %d] "
659  "bot_range[%d,%d] top_range[%d, %d] "
660  "sub_thresh[bot:%d top:%d] sup_thresh_bot %d\n",
662  bottom, top,
663  min_bottom, max_bottom, min_top, max_top,
664  sub_thresh_bot, sub_thresh_top,
665  sup_thresh_bot);
666  }
667  return retval;
668 }
669 
670 // Returns the script-id (eg Han) of the dominant script in the word.
672  int max_script = unicharset_->get_script_table_size();
673  int *sid = new int[max_script];
674  int x;
675  for (x = 0; x < max_script; x++) sid[x] = 0;
676  for (x = 0; x < length_; ++x) {
677  int script_id = unicharset_->get_script(unichar_id(x));
678  sid[script_id]++;
679  }
680  if (unicharset_->han_sid() != unicharset_->null_sid()) {
681  // Add the Hiragana & Katakana counts to Han and zero them out.
682  if (unicharset_->hiragana_sid() != unicharset_->null_sid()) {
683  sid[unicharset_->han_sid()] += sid[unicharset_->hiragana_sid()];
684  sid[unicharset_->hiragana_sid()] = 0;
685  }
686  if (unicharset_->katakana_sid() != unicharset_->null_sid()) {
687  sid[unicharset_->han_sid()] += sid[unicharset_->katakana_sid()];
688  sid[unicharset_->katakana_sid()] = 0;
689  }
690  }
691  // Note that high script ID overrides lower one on a tie, thus biasing
692  // towards non-Common script (if sorted that way in unicharset file).
693  int max_sid = 0;
694  for (x = 1; x < max_script; x++)
695  if (sid[x] >= sid[max_sid]) max_sid = x;
696  if (sid[max_sid] < length_ / 2)
697  max_sid = unicharset_->null_sid();
698  delete[] sid;
699  return max_sid;
700 }
701 
702 // Fixes the state_ for a chop at the given blob_posiiton.
703 void WERD_CHOICE::UpdateStateForSplit(int blob_position) {
704  int total_chunks = 0;
705  for (int i = 0; i < length_; ++i) {
706  total_chunks += state_[i];
707  if (total_chunks > blob_position) {
708  ++state_[i];
709  return;
710  }
711  }
712 }
713 
714 // Returns the sum of all the state elements, being the total number of blobs.
716  int total_chunks = 0;
717  for (int i = 0; i < length_; ++i) {
718  total_chunks += state_[i];
719  }
720  return total_chunks;
721 }
722 
728 void WERD_CHOICE::print(const char *msg) const {
729  tprintf("%s : ", msg);
730  for (int i = 0; i < length_; ++i) {
731  tprintf("%s", unicharset_->id_to_unichar(unichar_ids_[i]));
732  }
733  tprintf(" : R=%g, C=%g, F=%g, Perm=%d, xht=[%g,%g], ambig=%d\n",
734  rating_, certainty_, adjust_factor_, permuter_,
735  min_x_height_, max_x_height_, dangerous_ambig_found_);
736  tprintf("pos");
737  for (int i = 0; i < length_; ++i) {
738  tprintf("\t%s", ScriptPosToString(script_pos_[i]));
739  }
740  tprintf("\nstr");
741  for (int i = 0; i < length_; ++i) {
742  tprintf("\t%s", unicharset_->id_to_unichar(unichar_ids_[i]));
743  }
744  tprintf("\nstate:");
745  for (int i = 0; i < length_; ++i) {
746  tprintf("\t%d ", state_[i]);
747  }
748  tprintf("\nC");
749  for (int i = 0; i < length_; ++i) {
750  tprintf("\t%.3f", certainties_[i]);
751  }
752  tprintf("\n");
753 }
754 
755 // Prints the segmentation state with an introductory message.
756 void WERD_CHOICE::print_state(const char *msg) const {
757  tprintf("%s", msg);
758  for (int i = 0; i < length_; ++i)
759  tprintf(" %d", state_[i]);
760  tprintf("\n");
761 }
762 
763 // Displays the segmentation state of *this (if not the same as the last
764 // one displayed) and waits for a click in the window.
766 #ifndef GRAPHICS_DISABLED
767  // Number of different colors to draw with.
768  const int kNumColors = 6;
769  static ScrollView *segm_window = nullptr;
770  // Check the state against the static prev_drawn_state.
771  static GenericVector<int> prev_drawn_state;
772  bool already_done = prev_drawn_state.size() == length_;
773  if (!already_done) prev_drawn_state.init_to_size(length_, 0);
774  for (int i = 0; i < length_; ++i) {
775  if (prev_drawn_state[i] != state_[i]) {
776  already_done = false;
777  }
778  prev_drawn_state[i] = state_[i];
779  }
780  if (already_done || word->blobs.empty()) return;
781 
782  // Create the window if needed.
783  if (segm_window == nullptr) {
784  segm_window = new ScrollView("Segmentation", 5, 10, 500, 256,
785  2000.0, 256.0, true);
786  } else {
787  segm_window->Clear();
788  }
789 
790  TBOX bbox;
791  int blob_index = 0;
792  for (int c = 0; c < length_; ++c) {
793  auto color =
794  static_cast<ScrollView::Color>(c % kNumColors + 3);
795  for (int i = 0; i < state_[c]; ++i, ++blob_index) {
796  TBLOB* blob = word->blobs[blob_index];
797  bbox += blob->bounding_box();
798  blob->plot(segm_window, color, color);
799  }
800  }
801  segm_window->ZoomToRectangle(bbox.left(), bbox.top(),
802  bbox.right(), bbox.bottom());
803  segm_window->Update();
804  window_wait(segm_window);
805 #endif
806 }
807 
808 
810  const WERD_CHOICE &word2) {
811  const UNICHARSET *uchset = word1.unicharset();
812  if (word2.unicharset() != uchset) return false;
813  int w1start, w1end;
814  word1.punct_stripped(&w1start, &w1end);
815  int w2start, w2end;
816  word2.punct_stripped(&w2start, &w2end);
817  if (w1end - w1start != w2end - w2start) return false;
818  for (int i = 0; i < w1end - w1start; i++) {
819  if (uchset->to_lower(word1.unichar_id(w1start + i)) !=
820  uchset->to_lower(word2.unichar_id(w2start + i))) {
821  return false;
822  }
823  }
824  return true;
825 }
826 
837 void print_ratings_list(const char *msg,
838  BLOB_CHOICE_LIST *ratings,
839  const UNICHARSET &current_unicharset) {
840  if (ratings->length() == 0) {
841  tprintf("%s:<none>\n", msg);
842  return;
843  }
844  if (*msg != '\0') {
845  tprintf("%s\n", msg);
846  }
847  BLOB_CHOICE_IT c_it;
848  c_it.set_to_list(ratings);
849  for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
850  c_it.data()->print(&current_unicharset);
851  if (!c_it.at_last()) tprintf("\n");
852  }
853  tprintf("\n");
854  fflush(stdout);
855 }
int UNICHAR_ID
Definition: unichar.h:34
#define ELISTIZE(CLASSNAME)
Definition: elst.h:946
bool empty() const
Definition: genericvector.h:91
void DisplaySegmentation(TWERD *word)
Definition: ratngs.cpp:765
const char * ScriptPosToString(enum ScriptPos script_pos)
Definition: ratngs.cpp:204
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int NumBlobs() const
Definition: blobs.h:448
static void Update()
Definition: scrollview.cpp:709
Definition: blobs.h:418
WERD_CHOICE & operator+=(const WERD_CHOICE &second)
Definition: ratngs.cpp:489
MATRIX_COORD MatrixCoord(int index) const
Definition: ratngs.cpp:306
void init_to_size(int size, const T &t)
int script_id() const
Definition: ratngs.h:114
void reverse_and_mirror_unichar_ids()
Definition: ratngs.cpp:369
int length() const
Definition: ratngs.h:293
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
void ZoomToRectangle(int x1, int y1, int x2, int y2)
Definition: scrollview.cpp:757
void operator=(const ELIST_LINK &)
Definition: elst.h:94
float rating() const
Definition: ratngs.h:317
const int kBlnBaselineOffset
Definition: normalis.h:25
void punct_stripped(int *start_core, int *end_core) const
Definition: ratngs.cpp:387
const int kMaxDropCapBottom
Definition: ratngs.cpp:47
float yshift() const
Definition: ratngs.h:126
const char * permuter_name() const
Definition: ratngs.cpp:287
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
Definition: ratngs.cpp:554
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
int get_script(UNICHAR_ID unichar_id) const
Definition: unicharset.h:663
bool contains_unichar_id(UNICHAR_ID unichar_id) const
Definition: ratngs.cpp:330
Definition: matrix.h:578
static tesseract::ScriptPos ScriptPositionOf(bool print_debug, const UNICHARSET &unicharset, const TBOX &blob_box, UNICHAR_ID unichar_id)
Definition: ratngs.cpp:633
Definition: blobs.h:284
void print() const
Definition: ratngs.h:570
int16_t fontinfo_id2() const
Definition: ratngs.h:89
BLOB_CHOICE_LIST * blob_choices(int index, MATRIX *ratings) const
Definition: ratngs.cpp:294
tesseract::ScriptPos BlobPosition(int index) const
Definition: ratngs.h:312
void print_state(const char *msg) const
Definition: ratngs.cpp:756
const char * string() const
Definition: strngs.cpp:194
void GetNonSuperscriptSpan(int *start, int *end) const
Definition: ratngs.cpp:401
void print_ratings_list(const char *msg, BLOB_CHOICE_LIST *ratings, const UNICHARSET &current_unicharset)
Definition: ratngs.cpp:837
const UNICHARSET * unicharset() const
Definition: ratngs.h:290
uint8_t permuter() const
Definition: ratngs.h:336
T ClipToRange(const T &x, const T &lower_bound, const T &upper_bound)
Definition: helpers.h:108
BlobChoiceClassifier
Definition: ratngs.h:43
T get(ICOORD pos) const
Definition: matrix.h:231
int16_t left() const
Definition: rect.h:72
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
const int kMinSubscriptOffset
Definition: ratngs.cpp:43
int hiragana_sid() const
Definition: unicharset.h:890
WERD_CHOICE & operator=(const WERD_CHOICE &source)
Definition: ratngs.cpp:525
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:472
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:442
int katakana_sid() const
Definition: unicharset.h:891
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:704
int GetTopScriptID() const
Definition: ratngs.cpp:671
int null_sid() const
Definition: unicharset.h:884
BLOB_CHOICE()
Definition: ratngs.h:54
const double kMaxOverlapDenominator
Definition: ratngs.cpp:49
float min_x_height() const
Definition: ratngs.h:326
float rating() const
Definition: ratngs.h:80
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:453
int han_sid() const
Definition: unicharset.h:889
int TotalOfStates() const
Definition: ratngs.cpp:715
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
int16_t bottom() const
Definition: rect.h:65
void UpdateStateForSplit(int blob_position)
Definition: ratngs.cpp:703
const char * id_to_unichar_ext(UNICHAR_ID id) const
Definition: unicharset.cpp:299
float max_xheight() const
Definition: ratngs.h:123
const UNICHAR_ID * unichar_ids() const
Definition: ratngs.h:302
BLOB_CHOICE * FindMatchingChoice(UNICHAR_ID char_id, BLOB_CHOICE_LIST *bc_list)
Definition: ratngs.cpp:184
void set_blob_choice(int index, int blob_count, const BLOB_CHOICE *blob_choice)
Definition: ratngs.cpp:316
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
const STRING & unichar_string() const
Definition: ratngs.h:531
void init(int reserved)
Definition: ratngs.h:399
Direction get_direction(UNICHAR_ID unichar_id) const
Definition: unicharset.h:690
Definition: strngs.h:45
float max_x_height() const
Definition: ratngs.h:329
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
const int kMinSuperscriptOffset
Definition: ratngs.cpp:45
static const float kBadRating
Definition: ratngs.h:265
Definition: rect.h:34
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
Definition: unicharset.h:568
bool EqualIgnoringCaseAndTerminalPunct(const WERD_CHOICE &word1, const WERD_CHOICE &word2)
Definition: ratngs.cpp:809
void double_the_size()
Make more space in unichar_id_ and fragment_lengths_ arrays.
Definition: ratngs.h:377
void Clear()
Definition: scrollview.cpp:589
char window_wait(ScrollView *win)
Definition: callcpp.cpp:103
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
Definition: ratngs.cpp:156
int get_script_table_size() const
Definition: unicharset.h:849
float certainty() const
Definition: ratngs.h:320
static std::string CleanupString(const char *utf8_str)
Definition: unicharset.h:246
void SetAllScriptPositions(tesseract::ScriptPos position)
Definition: ratngs.cpp:627
int16_t fontinfo_id() const
Definition: ratngs.h:86
WERD_CHOICE shallow_copy(int start, int end) const
Definition: ratngs.cpp:418
int size() const
Definition: genericvector.h:72
TBOX bounding_box() const
Definition: blobs.cpp:468
void put(ICOORD pos, const T &thing)
Definition: matrix.h:223
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
Definition: unicharset.cpp:259
float min_xheight() const
Definition: ratngs.h:120
const double kMaxBaselineDrift
Definition: ratngs.cpp:55
WERD_CHOICE(const UNICHARSET *unicharset)
Definition: ratngs.h:268
void remove_unichar_ids(int index, int num)
Definition: ratngs.cpp:346
int16_t right() const
Definition: rect.h:79
#define ASSERT_HOST(x)
Definition: errcode.h:88
int16_t top() const
Definition: rect.h:58
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
Definition: unicharset.h:697
bool has_rtl_unichar_id() const
Definition: ratngs.cpp:435
void plot(ScrollView *window, ScrollView::Color color, ScrollView::Color child_color)
Definition: blobs.cpp:510
void make_bad()
Set the fields in this choice to be default (bad) values.
Definition: ratngs.h:433
const double kMinXHeightMatch
Definition: ratngs.cpp:52
float certainty() const
Definition: ratngs.h:83
~WERD_CHOICE()
Definition: ratngs.cpp:280