tesseract  4.1.1
docqual.cpp
Go to the documentation of this file.
1 /******************************************************************
2  * File: docqual.cpp (Formerly docqual.c)
3  * Description: Document Quality Metrics
4  * Author: Phil Cheatle
5  * Created: Mon May 9 11:27:28 BST 1994
6  *
7  * (C) Copyright 1994, Hewlett-Packard Ltd.
8  ** Licensed under the Apache License, Version 2.0 (the "License");
9  ** you may not use this file except in compliance with the License.
10  ** You may obtain a copy of the License at
11  ** http://www.apache.org/licenses/LICENSE-2.0
12  ** Unless required by applicable law or agreed to in writing, software
13  ** distributed under the License is distributed on an "AS IS" BASIS,
14  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15  ** See the License for the specific language governing permissions and
16  ** limitations under the License.
17  *
18  **********************************************************************/
19 
20 #include <cctype>
21 #include "docqual.h"
22 #include "reject.h"
23 #include "tesscallback.h"
24 #include "tessvars.h"
25 #include "tesseractclass.h"
26 
27 namespace tesseract{
28 
29 // A little class to provide the callbacks as we have no pre-bound args.
31  explicit DocQualCallbacks(WERD_RES* word0)
32  : word(word0), match_count(0), accepted_match_count(0) {}
33 
34  void CountMatchingBlobs(int index) {
35  ++match_count;
36  }
37 
38  void CountAcceptedBlobs(int index) {
39  if (word->reject_map[index].accepted())
41  ++match_count;
42  }
43 
44  void AcceptIfGoodQuality(int index) {
45  if (word->reject_map[index].accept_if_good_quality())
46  word->reject_map[index].setrej_quality_accept();
47  }
48 
50  int16_t match_count;
52 };
53 
54 /*************************************************************************
55  * word_blob_quality()
56  * How many blobs in the box_word are identical to those of the inword?
57  * ASSUME blobs in both initial word and box_word are in ascending order of
58  * left hand blob edge.
59  *************************************************************************/
61  if (word->bln_boxes == nullptr ||
62  word->rebuild_word == nullptr || word->rebuild_word->blobs.empty())
63  return 0;
64 
65  DocQualCallbacks cb(word);
67  *word->rebuild_word,
69  return cb.match_count;
70 }
71 
73  int16_t i = 0;
74  int16_t err_count = 0;
75 
76  if (word->rebuild_word != nullptr) {
77  for (int b = 0; b < word->rebuild_word->NumBlobs(); ++b) {
78  TBLOB* blob = word->rebuild_word->blobs[b];
79  err_count += count_outline_errs(word->best_choice->unichar_string()[i],
80  blob->NumOutlines());
81  i++;
82  }
83  }
84  return err_count;
85 }
86 
87 /*************************************************************************
88  * word_char_quality()
89  * Combination of blob quality and outline quality - how many good chars are
90  * there? - I.e chars which pass the blob AND outline tests.
91  *************************************************************************/
93  ROW *row,
94  int16_t *match_count,
95  int16_t *accepted_match_count) {
96  if (word->bln_boxes == nullptr || word->rebuild_word == nullptr ||
97  word->rebuild_word->blobs.empty()) {
98  *match_count = 0;
99  *accepted_match_count = 0;
100  return;
101  }
102 
103  DocQualCallbacks cb(word);
105  *word->rebuild_word,
107  *match_count = cb.match_count;
108  *accepted_match_count = cb.accepted_match_count;
109 }
110 
111 /*************************************************************************
112  * unrej_good_chs()
113  * Unreject POTENTIAL rejects if the blob passes the blob and outline checks
114  *************************************************************************/
116  if (word->bln_boxes == nullptr ||
117  word->rebuild_word == nullptr || word->rebuild_word->blobs.empty())
118  return;
119 
120  DocQualCallbacks cb(word);
122  *word->rebuild_word,
124 }
125 
126 int16_t Tesseract::count_outline_errs(char c, int16_t outline_count) {
127  int expected_outline_count;
128 
129  if (STRING (outlines_odd).contains (c))
130  return 0; // Don't use this char
131  else if (STRING (outlines_2).contains (c))
132  expected_outline_count = 2;
133  else
134  expected_outline_count = 1;
135  return abs (outline_count - expected_outline_count);
136 }
137 
139  bool good_quality_doc) {
140  if ((tessedit_good_quality_unrej && good_quality_doc))
141  unrej_good_quality_words(page_res_it);
142  doc_and_block_rejection(page_res_it, good_quality_doc);
143  if (unlv_tilde_crunching) {
144  tilde_crunch(page_res_it);
145  tilde_delete(page_res_it);
146  }
147 }
148 
149 /*************************************************************************
150  * unrej_good_quality_words()
151  * Accept potential rejects in words which pass the following checks:
152  * - Contains a potential reject
153  * - Word looks like a sensible alpha word.
154  * - Word segmentation is the same as the original image
155  * - All characters have the expected number of outlines
156  * NOTE - the rejection counts are recalculated after unrejection
157  * - CAN'T do it in a single pass without a bit of fiddling
158  * - keep it simple but inefficient
159  *************************************************************************/
160 void Tesseract::unrej_good_quality_words( //unreject potential
161  PAGE_RES_IT &page_res_it) {
162  WERD_RES *word;
163  ROW_RES *current_row;
164  BLOCK_RES *current_block;
165  int i;
166 
167  page_res_it.restart_page ();
168  while (page_res_it.word () != nullptr) {
169  check_debug_pt (page_res_it.word (), 100);
170  if (bland_unrej) {
171  word = page_res_it.word ();
172  for (i = 0; i < word->reject_map.length (); i++) {
173  if (word->reject_map[i].accept_if_good_quality ())
174  word->reject_map[i].setrej_quality_accept ();
175  }
176  page_res_it.forward ();
177  }
178  else if ((page_res_it.row ()->char_count > 0) &&
179  ((page_res_it.row ()->rej_count /
180  static_cast<float>(page_res_it.row ()->char_count)) <=
182  word = page_res_it.word ();
186  word->best_choice->unichar_string().string(),
188  != AC_UNACCEPTABLE)) {
189  unrej_good_chs(word, page_res_it.row ()->row);
190  }
191  page_res_it.forward ();
192  }
193  else {
194  /* Skip to end of dodgy row */
195  current_row = page_res_it.row ();
196  while ((page_res_it.word () != nullptr) &&
197  (page_res_it.row () == current_row))
198  page_res_it.forward ();
199  }
200  check_debug_pt (page_res_it.word (), 110);
201  }
202  page_res_it.restart_page ();
203  page_res_it.page_res->char_count = 0;
204  page_res_it.page_res->rej_count = 0;
205  current_block = nullptr;
206  current_row = nullptr;
207  while (page_res_it.word () != nullptr) {
208  if (current_block != page_res_it.block ()) {
209  current_block = page_res_it.block ();
210  current_block->char_count = 0;
211  current_block->rej_count = 0;
212  }
213  if (current_row != page_res_it.row ()) {
214  current_row = page_res_it.row ();
215  current_row->char_count = 0;
216  current_row->rej_count = 0;
217  current_row->whole_word_rej_count = 0;
218  }
219  page_res_it.rej_stat_word ();
220  page_res_it.forward ();
221  }
222 }
223 
224 
225 /*************************************************************************
226  * doc_and_block_rejection()
227  *
228  * If the page has too many rejects - reject all of it.
229  * If any block has too many rejects - reject all words in the block
230  *************************************************************************/
231 
232 void Tesseract::doc_and_block_rejection( //reject big chunks
233  PAGE_RES_IT &page_res_it,
234  bool good_quality_doc) {
235  int16_t block_no = 0;
236  int16_t row_no = 0;
237  BLOCK_RES *current_block;
238  ROW_RES *current_row;
239 
240  bool rej_word;
241  bool prev_word_rejected;
242  int16_t char_quality = 0;
243  int16_t accepted_char_quality;
244 
245  if (page_res_it.page_res->rej_count * 100.0 /
247  reject_whole_page(page_res_it);
249  tprintf("REJECT ALL #chars: %d #Rejects: %d; \n",
250  page_res_it.page_res->char_count,
251  page_res_it.page_res->rej_count);
252  }
253  } else {
255  tprintf("NO PAGE REJECTION #chars: %d # Rejects: %d; \n",
256  page_res_it.page_res->char_count,
257  page_res_it.page_res->rej_count);
258  }
259 
260  /* Walk blocks testing for block rejection */
261 
262  page_res_it.restart_page();
263  WERD_RES* word;
264  while ((word = page_res_it.word()) != nullptr) {
265  current_block = page_res_it.block();
266  block_no = current_block->block->pdblk.index();
267  if (current_block->char_count > 0 &&
268  (current_block->rej_count * 100.0 / current_block->char_count) >
271  tprintf("REJECTING BLOCK %d #chars: %d; #Rejects: %d\n",
272  block_no, current_block->char_count,
273  current_block->rej_count);
274  }
275  prev_word_rejected = false;
276  while ((word = page_res_it.word()) != nullptr &&
277  (page_res_it.block() == current_block)) {
279  rej_word = word->reject_map.reject_count() > 0 ||
281  if (rej_word && tessedit_dont_blkrej_good_wds &&
284  *word->uch_set,
285  word->best_choice->unichar_string().string(),
286  word->best_choice->unichar_lengths().string()) !=
287  AC_UNACCEPTABLE) {
288  word_char_quality(word, page_res_it.row()->row,
289  &char_quality,
290  &accepted_char_quality);
291  rej_word = char_quality != word->reject_map.length();
292  }
293  } else {
294  rej_word = true;
295  }
296  if (rej_word) {
297  /*
298  Reject spacing if both current and prev words are rejected.
299  NOTE - this is NOT restricted to FUZZY spaces. - When tried this
300  generated more space errors.
301  */
303  prev_word_rejected &&
304  page_res_it.prev_row() == page_res_it.row() &&
305  word->word->space() == 1)
306  word->reject_spaces = true;
308  }
309  prev_word_rejected = rej_word;
310  page_res_it.forward();
311  }
312  } else {
314  tprintf("NOT REJECTING BLOCK %d #chars: %d # Rejects: %d; \n",
315  block_no, page_res_it.block()->char_count,
316  page_res_it.block()->rej_count);
317  }
318 
319  /* Walk rows in block testing for row rejection */
320  row_no = 0;
321  while (page_res_it.word() != nullptr &&
322  page_res_it.block() == current_block) {
323  current_row = page_res_it.row();
324  row_no++;
325  /* Reject whole row if:
326  fraction of chars on row which are rejected exceed a limit AND
327  fraction rejects which occur in WHOLE WERD rejects is LESS THAN a
328  limit
329  */
330  if (current_row->char_count > 0 &&
331  (current_row->rej_count * 100.0 / current_row->char_count) >
333  (current_row->whole_word_rej_count * 100.0 /
334  current_row->rej_count) <
337  tprintf("REJECTING ROW %d #chars: %d; #Rejects: %d\n",
338  row_no, current_row->char_count,
339  current_row->rej_count);
340  }
341  prev_word_rejected = false;
342  while ((word = page_res_it.word()) != nullptr &&
343  page_res_it.row () == current_row) {
344  /* Preserve words on good docs unless they are mostly rejected*/
345  if (!tessedit_row_rej_good_docs && good_quality_doc) {
346  rej_word = word->reject_map.reject_count() /
347  static_cast<float>(word->reject_map.length()) >
350  /* Preserve perfect words anyway */
351  rej_word = word->reject_map.reject_count() > 0 ||
353  if (rej_word && tessedit_dont_rowrej_good_wds &&
356  word->best_choice->unichar_string().string(),
357  word->best_choice->unichar_lengths().string()) !=
358  AC_UNACCEPTABLE) {
359  word_char_quality(word, page_res_it.row()->row,
360  &char_quality,
361  &accepted_char_quality);
362  rej_word = char_quality != word->reject_map.length();
363  }
364  } else {
365  rej_word = true;
366  }
367  if (rej_word) {
368  /*
369  Reject spacing if both current and prev words are rejected.
370  NOTE - this is NOT restricted to FUZZY spaces. - When tried
371  this generated more space errors.
372  */
374  prev_word_rejected &&
375  page_res_it.prev_row() == page_res_it.row() &&
376  word->word->space () == 1)
377  word->reject_spaces = true;
379  }
380  prev_word_rejected = rej_word;
381  page_res_it.forward();
382  }
383  } else {
385  tprintf("NOT REJECTING ROW %d #chars: %d # Rejects: %d; \n",
386  row_no, current_row->char_count, current_row->rej_count);
387  }
388  while (page_res_it.word() != nullptr &&
389  page_res_it.row() == current_row)
390  page_res_it.forward();
391  }
392  }
393  }
394  }
395  }
396 }
397 
398 } // namespace tesseract
399 
400 /*************************************************************************
401  * reject_whole_page()
402  * Don't believe any of it - set the reject map to 00..00 in all words
403  *
404  *************************************************************************/
405 
406 void reject_whole_page(PAGE_RES_IT &page_res_it) {
407  page_res_it.restart_page ();
408  while (page_res_it.word () != nullptr) {
409  page_res_it.word ()->reject_map.rej_word_doc_rej ();
410  page_res_it.forward ();
411  }
412  //whole page is rejected
413  page_res_it.page_res->rejected = true;
414 }
415 
416 namespace tesseract {
418  WERD_RES *word;
419  GARBAGE_LEVEL garbage_level;
420  PAGE_RES_IT copy_it;
421  bool prev_potential_marked = false;
422  bool found_terrible_word = false;
423  bool ok_dict_word;
424 
425  page_res_it.restart_page();
426  while (page_res_it.word() != nullptr) {
427  POLY_BLOCK* pb = page_res_it.block()->block->pdblk.poly_block();
428  if (pb != nullptr && !pb->IsText()) {
429  page_res_it.forward();
430  continue;
431  }
432  word = page_res_it.word();
433 
435  convert_bad_unlv_chs(word);
436 
438  word->merge_tess_fails();
439 
440  if (word->reject_map.accept_count () != 0) {
441  found_terrible_word = false;
442  //Forget earlier potential crunches
443  prev_potential_marked = false;
444  }
445  else {
446  ok_dict_word = safe_dict_word(word);
447  garbage_level = garbage_word(word, ok_dict_word);
448 
449  if ((garbage_level != G_NEVER_CRUNCH) &&
450  (terrible_word_crunch (word, garbage_level))) {
451  if (crunch_debug > 0) {
452  tprintf ("T CRUNCHING: \"%s\"\n",
453  word->best_choice->unichar_string().string());
454  }
456  if (prev_potential_marked) {
457  while (copy_it.word () != word) {
458  if (crunch_debug > 0) {
459  tprintf ("P1 CRUNCHING: \"%s\"\n",
460  copy_it.word()->best_choice->unichar_string().string());
461  }
462  copy_it.word ()->unlv_crunch_mode = CR_KEEP_SPACE;
463  copy_it.forward ();
464  }
465  prev_potential_marked = false;
466  }
467  found_terrible_word = true;
468  }
469  else if ((garbage_level != G_NEVER_CRUNCH) &&
470  (potential_word_crunch (word,
471  garbage_level, ok_dict_word))) {
472  if (found_terrible_word) {
473  if (crunch_debug > 0) {
474  tprintf ("P2 CRUNCHING: \"%s\"\n",
475  word->best_choice->unichar_string().string());
476  }
478  }
479  else if (!prev_potential_marked) {
480  copy_it = page_res_it;
481  prev_potential_marked = true;
482  if (crunch_debug > 1) {
483  tprintf ("P3 CRUNCHING: \"%s\"\n",
484  word->best_choice->unichar_string().string());
485  }
486  }
487  }
488  else {
489  found_terrible_word = false;
490  //Forget earlier potential crunches
491  prev_potential_marked = false;
492  if (crunch_debug > 2) {
493  tprintf ("NO CRUNCH: \"%s\"\n",
494  word->best_choice->unichar_string().string());
495  }
496  }
497  }
498  page_res_it.forward ();
499  }
500 }
501 
502 
504  GARBAGE_LEVEL garbage_level) {
505  float rating_per_ch;
506  int adjusted_len;
507  int crunch_mode = 0;
508 
509  if ((word->best_choice->unichar_string().length() == 0) ||
510  (strspn(word->best_choice->unichar_string().string(), " ") ==
512  crunch_mode = 1;
513  else {
514  adjusted_len = word->reject_map.length ();
515  if (adjusted_len > crunch_rating_max)
516  adjusted_len = crunch_rating_max;
517  rating_per_ch = word->best_choice->rating () / adjusted_len;
518 
519  if (rating_per_ch > crunch_terrible_rating)
520  crunch_mode = 2;
521  else if (crunch_terrible_garbage && (garbage_level == G_TERRIBLE))
522  crunch_mode = 3;
523  else if ((word->best_choice->certainty () < crunch_poor_garbage_cert) &&
524  (garbage_level != G_OK))
525  crunch_mode = 4;
526  else if ((rating_per_ch > crunch_poor_garbage_rate) &&
527  (garbage_level != G_OK))
528  crunch_mode = 5;
529  }
530  if (crunch_mode > 0) {
531  if (crunch_debug > 2) {
532  tprintf ("Terrible_word_crunch (%d) on \"%s\"\n",
533  crunch_mode, word->best_choice->unichar_string().string());
534  }
535  return true;
536  }
537  else
538  return false;
539 }
540 
542  GARBAGE_LEVEL garbage_level,
543  bool ok_dict_word) {
544  float rating_per_ch;
545  int adjusted_len;
546  const char *str = word->best_choice->unichar_string().string();
547  const char *lengths = word->best_choice->unichar_lengths().string();
548  bool word_crunchable;
549  int poor_indicator_count = 0;
550 
551  word_crunchable = !crunch_leave_accept_strings ||
552  word->reject_map.length() < 3 ||
554  str, lengths) == AC_UNACCEPTABLE &&
555  !ok_dict_word);
556 
557  adjusted_len = word->reject_map.length();
558  if (adjusted_len > 10)
559  adjusted_len = 10;
560  rating_per_ch = word->best_choice->rating() / adjusted_len;
561 
562  if (rating_per_ch > crunch_pot_poor_rate) {
563  if (crunch_debug > 2) {
564  tprintf("Potential poor rating on \"%s\"\n",
565  word->best_choice->unichar_string().string());
566  }
567  poor_indicator_count++;
568  }
569 
570  if (word_crunchable &&
572  if (crunch_debug > 2) {
573  tprintf("Potential poor cert on \"%s\"\n",
574  word->best_choice->unichar_string().string());
575  }
576  poor_indicator_count++;
577  }
578 
579  if (garbage_level != G_OK) {
580  if (crunch_debug > 2) {
581  tprintf("Potential garbage on \"%s\"\n",
582  word->best_choice->unichar_string().string());
583  }
584  poor_indicator_count++;
585  }
586  return poor_indicator_count >= crunch_pot_indicators;
587 }
588 
590  WERD_RES *word;
591  PAGE_RES_IT copy_it;
592  bool deleting_from_bol = false;
593  bool marked_delete_point = false;
594  int16_t debug_delete_mode;
595  CRUNCH_MODE delete_mode;
596  int16_t x_debug_delete_mode;
597  CRUNCH_MODE x_delete_mode;
598 
599  page_res_it.restart_page();
600  while (page_res_it.word() != nullptr) {
601  word = page_res_it.word();
602 
603  delete_mode = word_deletable (word, debug_delete_mode);
604  if (delete_mode != CR_NONE) {
605  if (word->word->flag (W_BOL) || deleting_from_bol) {
606  if (crunch_debug > 0) {
607  tprintf ("BOL CRUNCH DELETING(%d): \"%s\"\n",
608  debug_delete_mode,
609  word->best_choice->unichar_string().string());
610  }
611  word->unlv_crunch_mode = delete_mode;
612  deleting_from_bol = true;
613  } else if (word->word->flag(W_EOL)) {
614  if (marked_delete_point) {
615  while (copy_it.word() != word) {
616  x_delete_mode = word_deletable (copy_it.word (),
617  x_debug_delete_mode);
618  if (crunch_debug > 0) {
619  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
620  x_debug_delete_mode,
621  copy_it.word()->best_choice->unichar_string().string());
622  }
623  copy_it.word ()->unlv_crunch_mode = x_delete_mode;
624  copy_it.forward ();
625  }
626  }
627  if (crunch_debug > 0) {
628  tprintf ("EOL CRUNCH DELETING(%d): \"%s\"\n",
629  debug_delete_mode,
630  word->best_choice->unichar_string().string());
631  }
632  word->unlv_crunch_mode = delete_mode;
633  deleting_from_bol = false;
634  marked_delete_point = false;
635  }
636  else {
637  if (!marked_delete_point) {
638  copy_it = page_res_it;
639  marked_delete_point = true;
640  }
641  }
642  }
643  else {
644  deleting_from_bol = false;
645  //Forget earlier potential crunches
646  marked_delete_point = false;
647  }
648  /*
649  The following step has been left till now as the tess fails are used to
650  determine if the word is deletable.
651  */
653  word->merge_tess_fails();
654  page_res_it.forward ();
655  }
656 }
657 
658 
660  int i;
661  UNICHAR_ID unichar_dash = word_res->uch_set->unichar_to_id("-");
662  UNICHAR_ID unichar_space = word_res->uch_set->unichar_to_id(" ");
663  UNICHAR_ID unichar_tilde = word_res->uch_set->unichar_to_id("~");
664  UNICHAR_ID unichar_pow = word_res->uch_set->unichar_to_id("^");
665  for (i = 0; i < word_res->reject_map.length(); ++i) {
666  if (word_res->best_choice->unichar_id(i) == unichar_tilde) {
667  word_res->best_choice->set_unichar_id(unichar_dash, i);
668  if (word_res->reject_map[i].accepted ())
669  word_res->reject_map[i].setrej_unlv_rej ();
670  }
671  if (word_res->best_choice->unichar_id(i) == unichar_pow) {
672  word_res->best_choice->set_unichar_id(unichar_space, i);
673  if (word_res->reject_map[i].accepted ())
674  word_res->reject_map[i].setrej_unlv_rej ();
675  }
676  }
677 }
678 
679 GARBAGE_LEVEL Tesseract::garbage_word(WERD_RES *word, bool ok_dict_word) {
680  enum STATES
681  {
682  JUNK,
683  FIRST_UPPER,
684  FIRST_LOWER,
685  FIRST_NUM,
686  SUBSEQUENT_UPPER,
687  SUBSEQUENT_LOWER,
688  SUBSEQUENT_NUM
689  };
690  const char *str = word->best_choice->unichar_string().string();
691  const char *lengths = word->best_choice->unichar_lengths().string();
692  STATES state = JUNK;
693  int len = 0;
694  int isolated_digits = 0;
695  int isolated_alphas = 0;
696  int bad_char_count = 0;
697  int tess_rejs = 0;
698  int dodgy_chars = 0;
699  int ok_chars;
700  UNICHAR_ID last_char = -1;
701  int alpha_repetition_count = 0;
702  int longest_alpha_repetition_count = 0;
703  int longest_lower_run_len = 0;
704  int lower_string_count = 0;
705  int longest_upper_run_len = 0;
706  int upper_string_count = 0;
707  int total_alpha_count = 0;
708  int total_digit_count = 0;
709 
710  for (; *str != '\0'; str += *(lengths++)) {
711  len++;
712  if (word->uch_set->get_isupper (str, *lengths)) {
713  total_alpha_count++;
714  switch (state) {
715  case SUBSEQUENT_UPPER:
716  case FIRST_UPPER:
717  state = SUBSEQUENT_UPPER;
718  upper_string_count++;
719  if (longest_upper_run_len < upper_string_count)
720  longest_upper_run_len = upper_string_count;
721  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
722  alpha_repetition_count++;
723  if (longest_alpha_repetition_count < alpha_repetition_count) {
724  longest_alpha_repetition_count = alpha_repetition_count;
725  }
726  }
727  else {
728  last_char = word->uch_set->unichar_to_id(str, *lengths);
729  alpha_repetition_count = 1;
730  }
731  break;
732  case FIRST_NUM:
733  isolated_digits++;
734  // Fall through.
735  default:
736  state = FIRST_UPPER;
737  last_char = word->uch_set->unichar_to_id(str, *lengths);
738  alpha_repetition_count = 1;
739  upper_string_count = 1;
740  break;
741  }
742  }
743  else if (word->uch_set->get_islower (str, *lengths)) {
744  total_alpha_count++;
745  switch (state) {
746  case SUBSEQUENT_LOWER:
747  case FIRST_LOWER:
748  state = SUBSEQUENT_LOWER;
749  lower_string_count++;
750  if (longest_lower_run_len < lower_string_count)
751  longest_lower_run_len = lower_string_count;
752  if (last_char == word->uch_set->unichar_to_id(str, *lengths)) {
753  alpha_repetition_count++;
754  if (longest_alpha_repetition_count < alpha_repetition_count) {
755  longest_alpha_repetition_count = alpha_repetition_count;
756  }
757  }
758  else {
759  last_char = word->uch_set->unichar_to_id(str, *lengths);
760  alpha_repetition_count = 1;
761  }
762  break;
763  case FIRST_NUM:
764  isolated_digits++;
765  // Fall through.
766  default:
767  state = FIRST_LOWER;
768  last_char = word->uch_set->unichar_to_id(str, *lengths);
769  alpha_repetition_count = 1;
770  lower_string_count = 1;
771  break;
772  }
773  }
774  else if (word->uch_set->get_isdigit (str, *lengths)) {
775  total_digit_count++;
776  switch (state) {
777  case FIRST_NUM:
778  state = SUBSEQUENT_NUM;
779  case SUBSEQUENT_NUM:
780  break;
781  case FIRST_UPPER:
782  case FIRST_LOWER:
783  isolated_alphas++;
784  // Fall through.
785  default:
786  state = FIRST_NUM;
787  break;
788  }
789  }
790  else {
791  if (*lengths == 1 && *str == ' ')
792  tess_rejs++;
793  else
794  bad_char_count++;
795  switch (state) {
796  case FIRST_NUM:
797  isolated_digits++;
798  break;
799  case FIRST_UPPER:
800  case FIRST_LOWER:
801  isolated_alphas++;
802  default:
803  break;
804  }
805  state = JUNK;
806  }
807  }
808 
809  switch (state) {
810  case FIRST_NUM:
811  isolated_digits++;
812  break;
813  case FIRST_UPPER:
814  case FIRST_LOWER:
815  isolated_alphas++;
816  default:
817  break;
818  }
819 
821  total_alpha_count += total_digit_count - isolated_digits;
822  }
823 
824  if (crunch_leave_ok_strings && len >= 4 &&
825  2 * (total_alpha_count - isolated_alphas) > len &&
826  longest_alpha_repetition_count < crunch_long_repetitions) {
827  if ((crunch_accept_ok &&
828  acceptable_word_string(*word->uch_set, str, lengths) !=
829  AC_UNACCEPTABLE) ||
830  longest_lower_run_len > crunch_leave_lc_strings ||
831  longest_upper_run_len > crunch_leave_uc_strings)
832  return G_NEVER_CRUNCH;
833  }
834  if (word->reject_map.length() > 1 &&
835  strpbrk(str, " ") == nullptr &&
836  (word->best_choice->permuter() == SYSTEM_DAWG_PERM ||
837  word->best_choice->permuter() == FREQ_DAWG_PERM ||
838  word->best_choice->permuter() == USER_DAWG_PERM ||
839  word->best_choice->permuter() == NUMBER_PERM ||
840  acceptable_word_string(*word->uch_set, str, lengths) !=
841  AC_UNACCEPTABLE || ok_dict_word))
842  return G_OK;
843 
844  ok_chars = len - bad_char_count - isolated_digits -
845  isolated_alphas - tess_rejs;
846 
847  if (crunch_debug > 3) {
848  tprintf("garbage_word: \"%s\"\n",
849  word->best_choice->unichar_string().string());
850  tprintf("LEN: %d bad: %d iso_N: %d iso_A: %d rej: %d\n",
851  len,
852  bad_char_count, isolated_digits, isolated_alphas, tess_rejs);
853  }
854  if (bad_char_count == 0 &&
855  tess_rejs == 0 &&
856  (len > isolated_digits + isolated_alphas || len <= 2))
857  return G_OK;
858 
859  if (tess_rejs > ok_chars ||
860  (tess_rejs > 0 && (bad_char_count + tess_rejs) * 2 > len))
861  return G_TERRIBLE;
862 
863  if (len > 4) {
864  dodgy_chars = 2 * tess_rejs + bad_char_count + isolated_digits +
865  isolated_alphas;
866  if (dodgy_chars > 5 || (dodgy_chars / static_cast<float>(len)) > 0.5)
867  return G_DODGY;
868  else
869  return G_OK;
870  } else {
871  dodgy_chars = 2 * tess_rejs + bad_char_count;
872  if ((len == 4 && dodgy_chars > 2) ||
873  (len == 3 && dodgy_chars > 2) || dodgy_chars >= len)
874  return G_DODGY;
875  else
876  return G_OK;
877  }
878 }
879 
880 
881 /*************************************************************************
882  * word_deletable()
883  * DELETE WERDS AT ENDS OF ROWS IF
884  * Word is crunched &&
885  * ( string length = 0 OR
886  * > 50% of chars are "|" (before merging) OR
887  * certainty < -10 OR
888  * rating /char > 60 OR
889  * TOP of word is more than 0.5 xht BELOW baseline OR
890  * BOTTOM of word is more than 0.5 xht ABOVE xht OR
891  * length of word < 3xht OR
892  * height of word < 0.7 xht OR
893  * height of word > 3.0 xht OR
894  * >75% of the outline BBs have longest dimension < 0.5xht
895  *************************************************************************/
896 
897 CRUNCH_MODE Tesseract::word_deletable(WERD_RES *word, int16_t &delete_mode) {
898  int word_len = word->reject_map.length ();
899  float rating_per_ch;
900  TBOX box; //BB of word
901 
902  if (word->unlv_crunch_mode == CR_NONE) {
903  delete_mode = 0;
904  return CR_NONE;
905  }
906 
907  if (word_len == 0) {
908  delete_mode = 1;
909  return CR_DELETE;
910  }
911 
912  if (word->rebuild_word != nullptr) {
913  // Cube leaves rebuild_word nullptr.
914  box = word->rebuild_word->bounding_box();
915  if (box.height () < crunch_del_min_ht * kBlnXHeight) {
916  delete_mode = 4;
917  return CR_DELETE;
918  }
919 
920  if (noise_outlines(word->rebuild_word)) {
921  delete_mode = 5;
922  return CR_DELETE;
923  }
924  }
925 
926  if ((failure_count (word) * 1.5) > word_len) {
927  delete_mode = 2;
928  return CR_LOOSE_SPACE;
929  }
930 
931  if (word->best_choice->certainty () < crunch_del_cert) {
932  delete_mode = 7;
933  return CR_LOOSE_SPACE;
934  }
935 
936  rating_per_ch = word->best_choice->rating () / word_len;
937 
938  if (rating_per_ch > crunch_del_rating) {
939  delete_mode = 8;
940  return CR_LOOSE_SPACE;
941  }
942 
944  delete_mode = 9;
945  return CR_LOOSE_SPACE;
946  }
947 
948  if (box.bottom () >
950  delete_mode = 10;
951  return CR_LOOSE_SPACE;
952  }
953 
954  if (box.height () > crunch_del_max_ht * kBlnXHeight) {
955  delete_mode = 11;
956  return CR_LOOSE_SPACE;
957  }
958 
959  if (box.width () < crunch_del_min_width * kBlnXHeight) {
960  delete_mode = 3;
961  return CR_LOOSE_SPACE;
962  }
963 
964  delete_mode = 0;
965  return CR_NONE;
966 }
967 
969  const char *str = word->best_choice->unichar_string().string();
970  int tess_rejs = 0;
971 
972  for (; *str != '\0'; str++) {
973  if (*str == ' ')
974  tess_rejs++;
975  }
976  return tess_rejs;
977 }
978 
979 
981  TBOX box; // BB of outline
982  int16_t outline_count = 0;
983  int16_t small_outline_count = 0;
984  int16_t max_dimension;
985  float small_limit = kBlnXHeight * crunch_small_outlines_size;
986 
987  for (int b = 0; b < word->NumBlobs(); ++b) {
988  TBLOB* blob = word->blobs[b];
989  for (TESSLINE* ol = blob->outlines; ol != nullptr; ol = ol->next) {
990  outline_count++;
991  box = ol->bounding_box();
992  if (box.height() > box.width())
993  max_dimension = box.height();
994  else
995  max_dimension = box.width();
996  if (max_dimension < small_limit)
997  small_outline_count++;
998  }
999  }
1000  return small_outline_count >= outline_count;
1001 }
1002 
1003 } // namespace tesseract
int UNICHAR_ID
Definition: unichar.h:34
int16_t width() const
Definition: rect.h:115
void unrej_good_quality_words(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:160
bool empty() const
Definition: genericvector.h:91
bool flag(WERD_FLAGS mask) const
Definition: werd.h:117
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
int NumBlobs() const
Definition: blobs.h:448
void rej_word_doc_rej()
Definition: rejctmap.cpp:424
bool noise_outlines(TWERD *word)
Definition: docqual.cpp:980
void quality_based_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:138
Definition: blobs.h:418
void tilde_crunch(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:417
bool IsText() const
Definition: polyblk.h:49
CRUNCH_MODE word_deletable(WERD_RES *word, int16_t &delete_mode)
Definition: docqual.cpp:897
void word_char_quality(WERD_RES *word, ROW *row, int16_t *match_count, int16_t *accepted_match_count)
Definition: docqual.cpp:92
CRUNCH_MODE unlv_crunch_mode
Definition: pageres.h:315
void convert_bad_unlv_chs(WERD_RES *word_res)
Definition: docqual.cpp:659
GenericVector< TBLOB * > blobs
Definition: blobs.h:459
float rating() const
Definition: ratngs.h:317
void doc_and_block_rejection(PAGE_RES_IT &page_res_it, bool good_quality_doc)
Definition: docqual.cpp:232
bool potential_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level, bool ok_dict_word)
Definition: docqual.cpp:541
REJMAP reject_map
Definition: pageres.h:294
void set_unichar_id(UNICHAR_ID unichar_id, int index)
Definition: ratngs.h:349
const int kBlnBaselineOffset
Definition: normalis.h:25
int16_t word_blob_quality(WERD_RES *word, ROW *row)
Definition: docqual.cpp:60
bool get_isdigit(UNICHAR_ID unichar_id) const
Definition: unicharset.h:512
bool get_islower(UNICHAR_ID unichar_id) const
Definition: unicharset.h:498
void tilde_delete(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:589
ROW * row
Definition: pageres.h:140
int16_t word_outline_errs(WERD_RES *word)
Definition: docqual.cpp:72
bool check_debug_pt(WERD_RES *word, int location)
Definition: control.cpp:1849
ACCEPTABLE_WERD_TYPE acceptable_word_string(const UNICHARSET &char_set, const char *s, const char *lengths)
Definition: control.cpp:1745
Definition: blobs.h:284
TWERD * rebuild_word
Definition: pageres.h:266
ROW_RES * row() const
Definition: pageres.h:757
int16_t safe_dict_word(const WERD_RES *werd_res)
Definition: reject.cpp:608
double tessedit_whole_wd_rej_row_percent
int32_t char_count
Definition: pageres.h:141
void CountMatchingBlobs(int index)
Definition: docqual.cpp:34
uint32_t unsigned_size() const
Definition: strngs.h:72
const int kBlnXHeight
Definition: normalis.h:24
const STRING & unichar_lengths() const
Definition: ratngs.h:538
const char * string() const
Definition: strngs.cpp:194
TESSLINE * outlines
Definition: blobs.h:400
uint8_t permuter() const
Definition: ratngs.h:336
int16_t count_outline_errs(char c, int16_t outline_count)
Definition: docqual.cpp:126
PDBLK pdblk
Page Description Block.
Definition: ocrblock.h:190
int32_t whole_word_rej_count
Definition: pageres.h:143
void reject_whole_page(PAGE_RES_IT &page_res_it)
Definition: docqual.cpp:406
int NumOutlines() const
Definition: blobs.cpp:454
end of line
Definition: werd.h:33
Definition: ocrrow.h:36
BLOCK_RES * block() const
Definition: pageres.h:760
int16_t height() const
Definition: rect.h:108
int16_t failure_count(WERD_RES *word)
Definition: docqual.cpp:968
int32_t length() const
Definition: strngs.cpp:189
TESSLINE * next
Definition: blobs.h:281
void rej_word_block_rej()
Definition: rejctmap.cpp:433
void CountAcceptedBlobs(int index)
Definition: docqual.cpp:38
bool get_isupper(UNICHAR_ID unichar_id) const
Definition: unicharset.h:505
void merge_tess_fails()
Definition: pageres.cpp:1067
tesseract::BoxWord * bln_boxes
Definition: pageres.h:195
double tessedit_reject_block_percent
bool crunch_early_convert_bad_unlv_chs
WERD_RES * restart_page()
Definition: pageres.h:701
uint8_t space()
Definition: werd.h:99
void ProcessMatchedBlobs(const TWERD &other, TessCallback1< int > *cb) const
Definition: boxword.cpp:190
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
int16_t bottom() const
Definition: rect.h:65
CRUNCH_MODE
Definition: pageres.h:156
bool terrible_word_crunch(WERD_RES *word, GARBAGE_LEVEL garbage_level)
Definition: docqual.cpp:503
int32_t length() const
Definition: rejctmap.h:223
start of line
Definition: werd.h:32
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
const STRING & unichar_string() const
Definition: ratngs.h:531
int index() const
Definition: pdblock.h:67
double tessedit_reject_doc_percent
int32_t rej_count
Definition: pageres.h:142
ROW_RES * prev_row() const
Definition: pageres.h:748
Definition: docqual.h:32
Definition: strngs.h:45
int32_t char_count
Definition: pageres.h:78
Definition: rect.h:34
_ConstTessMemberResultCallback_5_0< false, R, T1, P1, P2, P3, P4, P5 >::base * NewPermanentTessCallback(const T1 *obj, R(T2::*member)(P1, P2, P3, P4, P5) const, typename Identity< P1 >::type p1, typename Identity< P2 >::type p2, typename Identity< P3 >::type p3, typename Identity< P4 >::type p4, typename Identity< P5 >::type p5)
Definition: tesscallback.h:258
int16_t accept_count()
Definition: rejctmap.cpp:279
void unrej_good_chs(WERD_RES *word, ROW *row)
Definition: docqual.cpp:115
WERD * word
Definition: pageres.h:186
void rej_stat_word()
Definition: pageres.cpp:1667
void rej_word_row_rej()
Definition: rejctmap.cpp:442
bool rejected
Definition: pageres.h:81
float certainty() const
Definition: ratngs.h:320
double tessedit_good_doc_still_rowrej_wd
bool tessedit_preserve_row_rej_perfect_wds
int32_t rej_count
Definition: pageres.h:118
int32_t char_count
Definition: pageres.h:117
int32_t rej_count
Definition: pageres.h:79
Unacceptable word.
Definition: control.h:30
BLOCK * block
Definition: pageres.h:116
const UNICHARSET * uch_set
Definition: pageres.h:203
GARBAGE_LEVEL
Definition: docqual.h:29
WERD_CHOICE * best_choice
Definition: pageres.h:241
WERD_RES * word() const
Definition: pageres.h:754
WERD_RES * forward()
Definition: pageres.h:734
int16_t reject_count()
Definition: rejctmap.h:229
DocQualCallbacks(WERD_RES *word0)
Definition: docqual.cpp:31
double tessedit_reject_row_percent
GARBAGE_LEVEL garbage_word(WERD_RES *word, bool ok_dict_word)
Definition: docqual.cpp:679
int16_t top() const
Definition: rect.h:58
PAGE_RES * page_res
Definition: pageres.h:677
TBOX bounding_box() const
Definition: blobs.cpp:861
bool tessedit_preserve_blk_rej_perfect_wds
bool reject_spaces
Definition: pageres.h:341
POLY_BLOCK * poly_block() const
Definition: pdblock.h:55
bool quality_recoverable_rejects()
Definition: rejctmap.cpp:300
void AcceptIfGoodQuality(int index)
Definition: docqual.cpp:44