tesseract  4.1.1
permdawg.cpp
Go to the documentation of this file.
1 /* -*-C-*-
2  ********************************************************************************
3  *
4  * File: permdawg.cpp (Formerly permdawg.c)
5  * Description: Scale word choices by a dictionary
6  * Author: Mark Seaman, OCR Technology
7  *
8  * (c) Copyright 1987, Hewlett-Packard Company.
9  ** Licensed under the Apache License, Version 2.0 (the "License");
10  ** you may not use this file except in compliance with the License.
11  ** You may obtain a copy of the License at
12  ** http://www.apache.org/licenses/LICENSE-2.0
13  ** Unless required by applicable law or agreed to in writing, software
14  ** distributed under the License is distributed on an "AS IS" BASIS,
15  ** WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
16  ** See the License for the specific language governing permissions and
17  ** limitations under the License.
18  *
19  *********************************************************************************/
20 /*----------------------------------------------------------------------
21  I n c l u d e s
22 ----------------------------------------------------------------------*/
23 
24 #include "dawg.h"
25 #include "stopper.h"
26 #include "tprintf.h"
27 #include "params.h"
28 
29 #include <algorithm>
30 #include <cctype>
31 #include "dict.h"
32 
33 /*----------------------------------------------------------------------
34  F u n c t i o n s
35 ----------------------------------------------------------------------*/
36 namespace tesseract {
37 
45  const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices,
46  int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info,
47  bool word_ending, WERD_CHOICE *word, float certainties[], float *limit,
48  WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args) {
49  auto *more_args = static_cast<DawgArgs *>(void_more_args);
50  word_ending = (char_choice_index == char_choices.size()-1);
51  int word_index = word->length() - 1;
52  if (best_choice->rating() < *limit) return;
53  // Look up char in DAWG
54 
55  // If the current unichar is an ngram first try calling
56  // letter_is_okay() for each unigram it contains separately.
57  UNICHAR_ID orig_uch_id = word->unichar_id(word_index);
58  bool checked_unigrams = false;
59  if (getUnicharset().get_isngram(orig_uch_id)) {
60  if (dawg_debug_level) {
61  tprintf("checking unigrams in an ngram %s\n",
62  getUnicharset().debug_str(orig_uch_id).string());
63  }
64  int num_unigrams = 0;
65  word->remove_last_unichar_id();
67  const char *ngram_str = getUnicharset().id_to_unichar(orig_uch_id);
68  // Since the string came out of the unicharset, failure is impossible.
69  ASSERT_HOST(getUnicharset().encode_string(ngram_str, true, &encoding, nullptr,
70  nullptr));
71  bool unigrams_ok = true;
72  // Construct DawgArgs that reflect the current state.
73  DawgPositionVector unigram_active_dawgs = *(more_args->active_dawgs);
74  DawgPositionVector unigram_updated_dawgs;
75  DawgArgs unigram_dawg_args(&unigram_active_dawgs,
76  &unigram_updated_dawgs,
77  more_args->permuter);
78  // Check unigrams in the ngram with letter_is_okay().
79  for (int i = 0; unigrams_ok && i < encoding.size(); ++i) {
80  UNICHAR_ID uch_id = encoding[i];
81  ASSERT_HOST(uch_id != INVALID_UNICHAR_ID);
82  ++num_unigrams;
83  word->append_unichar_id(uch_id, 1, 0.0, 0.0);
84  unigrams_ok = (this->*letter_is_okay_)(
85  &unigram_dawg_args, *word->unicharset(),
86  word->unichar_id(word_index+num_unigrams-1),
87  word_ending && i == encoding.size() - 1);
88  (*unigram_dawg_args.active_dawgs) = *(unigram_dawg_args.updated_dawgs);
89  if (dawg_debug_level) {
90  tprintf("unigram %s is %s\n",
91  getUnicharset().debug_str(uch_id).string(),
92  unigrams_ok ? "OK" : "not OK");
93  }
94  }
95  // Restore the word and copy the updated dawg state if needed.
96  while (num_unigrams-- > 0) word->remove_last_unichar_id();
97  word->append_unichar_id_space_allocated(orig_uch_id, 1, 0.0, 0.0);
98  if (unigrams_ok) {
99  checked_unigrams = true;
100  more_args->permuter = unigram_dawg_args.permuter;
101  *(more_args->updated_dawgs) = *(unigram_dawg_args.updated_dawgs);
102  }
103  }
104 
105  // Check which dawgs from the dawgs_ vector contain the word
106  // up to and including the current unichar.
107  if (checked_unigrams || (this->*letter_is_okay_)(
108  more_args, *word->unicharset(), word->unichar_id(word_index),
109  word_ending)) {
110  // Add a new word choice
111  if (word_ending) {
112  if (dawg_debug_level) {
113  tprintf("found word = %s\n", word->debug_string().string());
114  }
115  if (strcmp(output_ambig_words_file.string(), "") != 0) {
116  if (output_ambig_words_file_ == nullptr) {
117  output_ambig_words_file_ =
118  fopen(output_ambig_words_file.string(), "wb+");
119  if (output_ambig_words_file_ == nullptr) {
120  tprintf("Failed to open output_ambig_words_file %s\n",
121  output_ambig_words_file.string());
122  exit(1);
123  }
124  STRING word_str;
125  word->string_and_lengths(&word_str, nullptr);
126  word_str += " ";
127  fprintf(output_ambig_words_file_, "%s", word_str.string());
128  }
129  STRING word_str;
130  word->string_and_lengths(&word_str, nullptr);
131  word_str += " ";
132  fprintf(output_ambig_words_file_, "%s", word_str.string());
133  }
134  WERD_CHOICE *adjusted_word = word;
135  adjusted_word->set_permuter(more_args->permuter);
136  update_best_choice(*adjusted_word, best_choice);
137  } else { // search the next letter
138  // Make updated_* point to the next entries in the DawgPositionVector
139  // arrays (that were originally created in dawg_permute_and_select)
140  ++(more_args->updated_dawgs);
141  // Make active_dawgs and constraints point to the updated ones.
142  ++(more_args->active_dawgs);
143  permute_choices(debug, char_choices, char_choice_index + 1,
144  prev_char_frag_info, word, certainties, limit,
145  best_choice, attempts_left, more_args);
146  // Restore previous state to explore another letter in this position.
147  --(more_args->updated_dawgs);
148  --(more_args->active_dawgs);
149  }
150  } else {
151  if (dawg_debug_level) {
152  tprintf("last unichar not OK at index %d in %s\n",
153  word_index, word->debug_string().string());
154  }
155  }
156 }
157 
158 
169  const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit) {
170  auto *best_choice = new WERD_CHOICE(&getUnicharset());
171  best_choice->make_bad();
172  best_choice->set_rating(rating_limit);
173  if (char_choices.length() == 0 || char_choices.length() > MAX_WERD_LENGTH)
174  return best_choice;
175  auto *active_dawgs =
176  new DawgPositionVector[char_choices.length() + 1];
177  init_active_dawgs(&(active_dawgs[0]), true);
178  DawgArgs dawg_args(&(active_dawgs[0]), &(active_dawgs[1]), NO_PERM);
180 
181  float certainties[MAX_WERD_LENGTH];
183  int attempts_left = max_permuter_attempts;
184  permute_choices((dawg_debug_level) ? "permute_dawg_debug" : nullptr,
185  char_choices, 0, nullptr, &word, certainties, &rating_limit, best_choice,
186  &attempts_left, &dawg_args);
187  delete[] active_dawgs;
188  return best_choice;
189 }
190 
198  const char *debug,
199  const BLOB_CHOICE_LIST_VECTOR &char_choices,
200  int char_choice_index,
201  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
202  WERD_CHOICE *word,
203  float certainties[],
204  float *limit,
205  WERD_CHOICE *best_choice,
206  int *attempts_left,
207  void *more_args) {
208  if (debug) {
209  tprintf("%s permute_choices: char_choice_index=%d"
210  " limit=%g rating=%g, certainty=%g word=%s\n",
211  debug, char_choice_index, *limit, word->rating(),
212  word->certainty(), word->debug_string().string());
213  }
214  if (char_choice_index < char_choices.length()) {
215  BLOB_CHOICE_IT blob_choice_it;
216  blob_choice_it.set_to_list(char_choices.get(char_choice_index));
217  for (blob_choice_it.mark_cycle_pt(); !blob_choice_it.cycled_list();
218  blob_choice_it.forward()) {
219  (*attempts_left)--;
220  append_choices(debug, char_choices, *(blob_choice_it.data()),
221  char_choice_index, prev_char_frag_info, word,
222  certainties, limit, best_choice, attempts_left, more_args);
223  if (*attempts_left <= 0) {
224  if (debug) tprintf("permute_choices(): attempts_left is 0\n");
225  break;
226  }
227  }
228  }
229 }
230 
240  const char *debug,
241  const BLOB_CHOICE_LIST_VECTOR &char_choices,
242  const BLOB_CHOICE &blob_choice,
243  int char_choice_index,
244  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
245  WERD_CHOICE *word,
246  float certainties[],
247  float *limit,
248  WERD_CHOICE *best_choice,
249  int *attempts_left,
250  void *more_args) {
251  int word_ending = (char_choice_index == char_choices.length() - 1);
252 
253  // Deal with fragments.
254  CHAR_FRAGMENT_INFO char_frag_info;
255  if (!fragment_state_okay(blob_choice.unichar_id(), blob_choice.rating(),
256  blob_choice.certainty(), prev_char_frag_info, debug,
257  word_ending, &char_frag_info)) {
258  return; // blob_choice must be an invalid fragment
259  }
260  // Search the next letter if this character is a fragment.
261  if (char_frag_info.unichar_id == INVALID_UNICHAR_ID) {
262  permute_choices(debug, char_choices, char_choice_index + 1,
263  &char_frag_info, word, certainties, limit,
264  best_choice, attempts_left, more_args);
265  return;
266  }
267 
268  // Add the next unichar.
269  float old_rating = word->rating();
270  float old_certainty = word->certainty();
271  uint8_t old_permuter = word->permuter();
272  certainties[word->length()] = char_frag_info.certainty;
274  char_frag_info.unichar_id, char_frag_info.num_fragments,
275  char_frag_info.rating, char_frag_info.certainty);
276 
277  // Explore the next unichar.
278  (this->*go_deeper_fxn_)(debug, char_choices, char_choice_index,
279  &char_frag_info, word_ending, word, certainties,
280  limit, best_choice, attempts_left, more_args);
281 
282  // Remove the unichar we added to explore other choices in it's place.
283  word->remove_last_unichar_id();
284  word->set_rating(old_rating);
285  word->set_certainty(old_certainty);
286  word->set_permuter(old_permuter);
287 }
288 
315  float curr_rating, float curr_certainty,
316  const CHAR_FRAGMENT_INFO *prev_char_frag_info,
317  const char *debug, int word_ending,
318  CHAR_FRAGMENT_INFO *char_frag_info) {
319  const CHAR_FRAGMENT *this_fragment =
320  getUnicharset().get_fragment(curr_unichar_id);
321  const CHAR_FRAGMENT *prev_fragment =
322  prev_char_frag_info != nullptr ? prev_char_frag_info->fragment : nullptr;
323 
324  // Print debug info for fragments.
325  if (debug && (prev_fragment || this_fragment)) {
326  tprintf("%s check fragments: choice=%s word_ending=%d\n", debug,
327  getUnicharset().debug_str(curr_unichar_id).string(),
328  word_ending);
329  if (prev_fragment) {
330  tprintf("prev_fragment %s\n", prev_fragment->to_string().string());
331  }
332  if (this_fragment) {
333  tprintf("this_fragment %s\n", this_fragment->to_string().string());
334  }
335  }
336 
337  char_frag_info->unichar_id = curr_unichar_id;
338  char_frag_info->fragment = this_fragment;
339  char_frag_info->rating = curr_rating;
340  char_frag_info->certainty = curr_certainty;
341  char_frag_info->num_fragments = 1;
342  if (prev_fragment && !this_fragment) {
343  if (debug) tprintf("Skip choice with incomplete fragment\n");
344  return false;
345  }
346  if (this_fragment) {
347  // We are dealing with a fragment.
348  char_frag_info->unichar_id = INVALID_UNICHAR_ID;
349  if (prev_fragment) {
350  if (!this_fragment->is_continuation_of(prev_fragment)) {
351  if (debug) tprintf("Non-matching fragment piece\n");
352  return false;
353  }
354  if (this_fragment->is_ending()) {
355  char_frag_info->unichar_id =
356  getUnicharset().unichar_to_id(this_fragment->get_unichar());
357  char_frag_info->fragment = nullptr;
358  if (debug) {
359  tprintf("Built character %s from fragments\n",
360  getUnicharset().debug_str(
361  char_frag_info->unichar_id).string());
362  }
363  } else {
364  if (debug) tprintf("Record fragment continuation\n");
365  char_frag_info->fragment = this_fragment;
366  }
367  // Update certainty and rating.
368  char_frag_info->rating =
369  prev_char_frag_info->rating + curr_rating;
370  char_frag_info->num_fragments = prev_char_frag_info->num_fragments + 1;
371  char_frag_info->certainty =
372  std::min(curr_certainty, prev_char_frag_info->certainty);
373  } else {
374  if (this_fragment->is_beginning()) {
375  if (debug) tprintf("Record fragment beginning\n");
376  } else {
377  if (debug) {
378  tprintf("Non-starting fragment piece with no prev_fragment\n");
379  }
380  return false;
381  }
382  }
383  }
384  if (word_ending && char_frag_info->fragment) {
385  if (debug) tprintf("Word can not end with a fragment\n");
386  return false;
387  }
388  return true;
389 }
390 
391 } // namespace tesseract
int UNICHAR_ID
Definition: unichar.h:34
DLLSYM void tprintf(const char *format,...)
Definition: tprintf.cpp:35
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
Definition: dict.cpp:600
bool is_ending() const
Definition: unicharset.h:108
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
Definition: unicharset.h:98
int max_permuter_attempts
Definition: dict.h:658
const char * get_unichar() const
Definition: unicharset.h:70
float rating
Definition: dict.h:47
int length() const
Definition: ratngs.h:293
DawgPositionVector * updated_dawgs
Definition: dict.h:85
UNICHAR_ID unichar_id
Definition: dict.h:44
float rating() const
Definition: ratngs.h:317
void permute_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:197
char * output_ambig_words_file
Definition: dict.h:620
T & get(int index) const
PermuterType permuter
Definition: dict.h:86
#define MAX_WERD_LENGTH
Definition: dict.h:39
const char * string() const
Definition: strngs.cpp:194
int num_fragments
Definition: dict.h:46
const UNICHARSET * unicharset() const
Definition: ratngs.h:290
uint8_t permuter() const
Definition: ratngs.h:336
int length() const
Definition: genericvector.h:86
UNICHAR_ID unichar_id() const
Definition: ratngs.h:77
void append_choices(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, const BLOB_CHOICE &blob_choice, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *more_args)
Definition: permdawg.cpp:239
void set_certainty(float new_val)
Definition: ratngs.h:362
void append_unichar_id(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.cpp:472
void append_unichar_id_space_allocated(UNICHAR_ID unichar_id, int blob_count, float rating, float certainty)
Definition: ratngs.h:442
float certainty
Definition: dict.h:48
void update_best_choice(const WERD_CHOICE &word, WERD_CHOICE *best_choice)
Definition: dict.h:182
float rating() const
Definition: ratngs.h:80
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
Definition: ratngs.cpp:453
void remove_last_unichar_id()
Definition: ratngs.h:473
int(Dict::* letter_is_okay_)(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Definition: dict.h:372
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
Definition: unicharset.cpp:210
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
Definition: unicharset.h:734
UNICHAR_ID unichar_id(int index) const
Definition: ratngs.h:305
const UNICHARSET & getUnicharset() const
Definition: dict.h:101
void set_rating(float new_val)
Definition: ratngs.h:359
void set_permuter(uint8_t perm)
Definition: ratngs.h:365
int dawg_debug_level
Definition: dict.h:622
Definition: strngs.h:45
const char * id_to_unichar(UNICHAR_ID id) const
Definition: unicharset.cpp:291
bool is_beginning() const
Definition: unicharset.h:105
DawgPositionVector * active_dawgs
Definition: dict.h:84
void(Dict::* go_deeper_fxn_)(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Pointer to go_deeper function.
Definition: dict.h:216
float certainty() const
Definition: ratngs.h:320
static STRING to_string(const char *unichar, int pos, int total, bool natural)
int size() const
Definition: genericvector.h:72
WERD_CHOICE * dawg_permute_and_select(const BLOB_CHOICE_LIST_VECTOR &char_choices, float rating_limit)
Definition: permdawg.cpp:168
void go_deeper_dawg_fxn(const char *debug, const BLOB_CHOICE_LIST_VECTOR &char_choices, int char_choice_index, const CHAR_FRAGMENT_INFO *prev_char_frag_info, bool word_ending, WERD_CHOICE *word, float certainties[], float *limit, WERD_CHOICE *best_choice, int *attempts_left, void *void_more_args)
Definition: permdawg.cpp:44
bool fragment_state_okay(UNICHAR_ID curr_unichar_id, float curr_rating, float curr_certainty, const CHAR_FRAGMENT_INFO *prev_char_frag_info, const char *debug, int word_ending, CHAR_FRAGMENT_INFO *char_frag_info)
Definition: permdawg.cpp:314
#define ASSERT_HOST(x)
Definition: errcode.h:88
const CHAR_FRAGMENT * fragment
Definition: dict.h:45
const STRING debug_string() const
Definition: ratngs.h:495
float certainty() const
Definition: ratngs.h:83