46 static inline double log2(
double n) {
55 :
INT_MEMBER(language_model_debug_level, 0,
"Language model debug level",
56 dict->getCCUtil()->params()),
58 "Turn on/off the use of character ngram model",
59 dict->getCCUtil()->params()),
61 "Maximum order of the character ngram model",
62 dict->getCCUtil()->params()),
63 INT_MEMBER(language_model_viterbi_list_max_num_prunable, 10,
64 "Maximum number of prunable (those for which" 65 " PrunablePath() is true) entries in each viterbi list" 66 " recorded in BLOB_CHOICEs",
67 dict->getCCUtil()->params()),
68 INT_MEMBER(language_model_viterbi_list_max_size, 500,
69 "Maximum size of viterbi lists recorded in BLOB_CHOICEs",
70 dict->getCCUtil()->params()),
72 "To avoid overly small denominators use this as the " 73 "floor of the probability returned by the ngram model.",
74 dict->getCCUtil()->params()),
76 "Average classifier score of a non-matching unichar.",
77 dict->getCCUtil()->params()),
78 BOOL_MEMBER(language_model_ngram_use_only_first_uft8_step, false,
79 "Use only the first UTF8 step of the given string" 80 " when computing log probabilities.",
81 dict->getCCUtil()->params()),
83 "Strength of the character ngram model relative to the" 84 " character classifier ",
85 dict->getCCUtil()->params()),
87 "Factor to bring log-probs into the same range as ratings" 88 " when multiplied by outline length ",
89 dict->getCCUtil()->params()),
90 BOOL_MEMBER(language_model_ngram_space_delimited_language, true,
91 "Words are delimited by space", dict->getCCUtil()->params()),
92 INT_MEMBER(language_model_min_compound_length, 3,
93 "Minimum length of compound words",
94 dict->getCCUtil()->params()),
96 "Penalty for words not in the frequent word dictionary",
97 dict->getCCUtil()->params()),
99 "Penalty for non-dictionary words",
100 dict->getCCUtil()->params()),
102 "Penalty for inconsistent punctuation",
103 dict->getCCUtil()->params()),
105 "Penalty for inconsistent case",
106 dict->getCCUtil()->params()),
108 "Penalty for inconsistent script",
109 dict->getCCUtil()->params()),
111 "Penalty for inconsistent character type",
112 dict->getCCUtil()->params()),
116 "Penalty for inconsistent font",
117 dict->getCCUtil()->params()),
119 "Penalty for inconsistent spacing",
120 dict->getCCUtil()->params()),
121 double_MEMBER(language_model_penalty_increment, 0.01,
"Penalty increment",
122 dict->getCCUtil()->params()),
123 INT_MEMBER(wordrec_display_segmentations, 0,
"Display Segmentations",
124 dict->getCCUtil()->params()),
126 "Use sigmoidal score for certainty",
127 dict->getCCUtil()->params()),
129 fontinfo_table_(fontinfo_table),
137 bool fixed_pitch,
float max_char_wh_ratio,
138 float rating_cert_scale) {
154 if (prev_word !=
nullptr && prev_word->
unichar_string() !=
nullptr) {
176 static void ScanParentsForCaseMix(
const UNICHARSET& unicharset,
178 if (parent_node ==
nullptr)
return;
180 for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
187 if (other_case == unichar_id)
continue;
193 for (vit2.mark_cycle_pt(); !vit2.cycled_list() &&
194 vit2.data()->curr_b->unichar_id() != other_case;
196 if (!vit2.cycled_list()) {
207 static bool HasBetterCaseVariant(
const UNICHARSET& unicharset,
209 BLOB_CHOICE_LIST* choices) {
212 if (other_case == choice_id || other_case == INVALID_UNICHAR_ID)
216 BLOB_CHOICE_IT bc_it(choices);
217 for (bc_it.mark_cycle_pt(); !bc_it.cycled_list(); bc_it.forward()) {
219 if (better_choice->
unichar_id() == other_case)
221 else if (better_choice == choice)
254 bool just_classified,
255 int curr_col,
int curr_row,
256 BLOB_CHOICE_LIST *curr_list,
263 tprintf(
"\nUpdateState: col=%d row=%d %s",
264 curr_col, curr_row, just_classified ?
"just_classified" :
"");
266 tprintf(
"(parent=%p)\n", parent_node);
272 bool new_changed =
false;
278 bool has_alnum_mix =
false;
279 if (parent_node !=
nullptr) {
283 tprintf(
"No parents found to process\n");
287 has_alnum_mix =
true;
291 has_alnum_mix =
false;;
292 ScanParentsForCaseMix(unicharset, parent_node);
294 parent_node->
Print(
"Parent viterbi list");
299 ViterbiStateEntry_IT vit;
300 BLOB_CHOICE_IT c_it(curr_list);
301 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
313 if (c_it.at_first() || !new_changed)
317 if (first_digit == choice) blob_choice_flags |=
kDigitFlag;
319 if (parent_node ==
nullptr) {
331 if (HasBetterCaseVariant(unicharset, choice, curr_list))
337 blob_choice_flags, denom, word_end, curr_col, curr_row,
338 choice, curr_state,
nullptr, pain_points,
339 word_res, best_choice_bundle, blamer_bundle);
348 c_it.data(), blob_choice_flags,
349 unicharset, word_res, &vit,
350 &top_choice_flags)) !=
nullptr) {
363 HasBetterCaseVariant(unicharset, choice, curr_list))
368 top_choice_flags, denom, word_end, curr_col, curr_row,
369 c_it.data(), curr_state, parent_vse, pain_points,
370 word_res, best_choice_bundle, blamer_bundle);
387 BLOB_CHOICE_IT c_it(curr_list);
390 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
393 if (first_unichar ==
nullptr) first_unichar = c_it.data();
394 if (*first_lower ==
nullptr && unicharset.
get_islower(unichar_id)) {
395 *first_lower = c_it.data();
397 if (*first_upper ==
nullptr && unicharset.
get_isalpha(unichar_id) &&
399 *first_upper = c_it.data();
401 if (*first_digit ==
nullptr && unicharset.
get_isdigit(unichar_id)) {
402 *first_digit = c_it.data();
406 bool mixed = (*first_lower !=
nullptr || *first_upper !=
nullptr) &&
407 *first_digit !=
nullptr;
408 if (*first_lower ==
nullptr) *first_lower = first_unichar;
409 if (*first_upper ==
nullptr) *first_upper = first_unichar;
410 if (*first_digit ==
nullptr) *first_digit = first_unichar;
425 if (parent_node ==
nullptr)
return -1;
431 float lower_rating = 0.0f;
432 float upper_rating = 0.0f;
433 float digit_rating = 0.0f;
434 float top_rating = 0.0f;
437 for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
444 while (unichar_id == INVALID_UNICHAR_ID &&
450 if (unichar_id != INVALID_UNICHAR_ID) {
452 if (top_lower ==
nullptr || lower_rating > rating) {
454 lower_rating = rating;
457 if (top_upper ==
nullptr || upper_rating > rating) {
459 upper_rating = rating;
462 if (top_digit ==
nullptr || digit_rating > rating) {
464 digit_rating = rating;
468 if (top_choice ==
nullptr || top_rating > rating) {
474 if (top_choice ==
nullptr)
return -1;
475 bool mixed = (top_lower !=
nullptr || top_upper !=
nullptr) &&
476 top_digit !=
nullptr;
477 if (top_lower ==
nullptr) top_lower = top_choice;
479 if (top_upper ==
nullptr) top_upper = top_choice;
481 if (top_digit ==
nullptr) top_digit = top_choice;
492 return mixed ? 1 : 0;
501 bool just_classified,
bool mixed_alnum,
const BLOB_CHOICE* bc,
503 WERD_RES* word_res, ViterbiStateEntry_IT* vse_it,
505 for (; !vse_it->cycled_list(); vse_it->forward()) {
509 if (!just_classified && !parent_vse->
updated)
continue;
511 parent_vse->
Print(
"Considering");
513 *top_choice_flags = blob_choice_flags;
526 (mixed_alnum || *top_choice_flags == 0))
532 (mixed_alnum || *top_choice_flags == 0))
541 tprintf(
"Parent %s has competition %s\n",
565 int curr_col,
int curr_row,
573 ViterbiStateEntry_IT vit;
575 tprintf(
"AddViterbiStateEntry for unichar %s rating=%.4f" 576 " certainty=%.4f top_choice_flags=0x%x",
580 tprintf(
" parent_vse=%p\n", parent_vse);
589 tprintf(
"AddViterbiStateEntry: viterbi list is full!\n");
598 float outline_length =
605 denom, curr_col, curr_row, outline_length, parent_vse);
608 bool liked_by_language_model = dawg_info !=
nullptr ||
609 (ngram_info !=
nullptr && !ngram_info->
pruned);
612 if (!liked_by_language_model && top_choice_flags == 0) {
614 tprintf(
"Language model components very early pruned this entry\n");
635 if (!liked_by_language_model && top_choice_flags == 0) {
637 tprintf(
"Language model components early pruned this entry\n");
646 word_res, &consistency_info);
647 if (dawg_info !=
nullptr && consistency_info.
invalid_punc) {
654 parent_vse, word_res, &associate_stats);
655 if (parent_vse !=
nullptr) {
662 parent_vse, b, 0.0, outline_length,
663 consistency_info, associate_stats, top_choice_flags, dawg_info,
668 tprintf(
"Adjusted cost = %g\n", new_vse->cost);
677 bool keep = new_vse->top_choice_flags || liked_by_language_model;
684 tprintf(
"Language model components did not like this entry\n");
698 tprintf(
"Discarded ViterbiEntry with high cost %g max cost %g\n",
709 best_choice_bundle, blamer_bundle);
712 new_vse != best_choice_bundle->
best_vse) {
714 tprintf(
"Discarded ViterbiEntry with high cost %g\n", new_vse->cost);
733 new_vse->top_choice_flags) {
737 for (vit.mark_cycle_pt(); !vit.cycled_list(); vit.forward()) {
743 curr_vse->
cost > new_vse->cost) {
746 if (prunable_counter > 0 &&
PrunablePath(*curr_vse)) --prunable_counter;
748 if (prunable_counter == 0) {
751 tprintf(
"Set viterbi_state_entries_prunable_max_cost to %g\n",
754 prunable_counter = -1;
761 new_vse->Print(
"New");
763 curr_state->
Print(
"Updated viterbi list");
773 for (vit.mark_cycle_pt(); !vit.cycled_list() && new_vse->
top_choice_flags &&
774 new_vse->
cost >= vit.data()->cost; vit.forward()) {
780 tprintf(
"GenerateTopChoiceInfo: top_choice_flags=0x%x\n",
787 int curr_col,
int curr_row,
792 if (parent_vse ==
nullptr) {
796 if (parent_vse->
dawg_info ==
nullptr)
return nullptr;
816 if (parent_vse ==
nullptr || word_end ||
823 bool has_word_ending =
false;
831 has_word_ending =
true;
835 if (!has_word_ending)
return nullptr;
849 for (
int i = 0; i < normed_ids.
size(); ++i) {
851 tprintf(
"Test Letter OK for unichar %d, normed %d\n",
854 word_end && i == normed_ids.
size() - 1);
857 }
else if (i < normed_ids.
size() - 1) {
862 tprintf(
"Letter was OK for unichar %d, normed %d\n",
878 const char *unichar,
float certainty,
float denom,
879 int curr_col,
int curr_row,
float outline_length,
882 const char *pcontext_ptr =
"";
883 int pcontext_unichar_step_len = 0;
884 if (parent_vse ==
nullptr) {
889 pcontext_unichar_step_len =
893 int unichar_step_len = 0;
896 float ngram_and_classifier_cost =
898 pcontext_ptr, &unichar_step_len,
899 &pruned, &ngram_cost);
903 ngram_and_classifier_cost *=
906 if (parent_vse !=
nullptr) {
907 ngram_and_classifier_cost +=
913 int num_remove = (unichar_step_len + pcontext_unichar_step_len -
915 if (num_remove > 0) pcontext_unichar_step_len -= num_remove;
916 while (num_remove > 0 && *pcontext_ptr !=
'\0') {
926 pcontext_ptr, pcontext_unichar_step_len, pruned, ngram_cost,
927 ngram_and_classifier_cost);
928 ngram_info->context += unichar;
929 ngram_info->context_unichar_step_len += unichar_step_len;
938 int *unichar_step_len,
939 bool *found_small_prob,
941 const char *context_ptr = context;
942 char *modified_context =
nullptr;
943 char *modified_context_end =
nullptr;
944 const char *unichar_ptr = unichar;
945 const char *unichar_end = unichar_ptr + strlen(unichar_ptr);
948 while (unichar_ptr < unichar_end &&
951 tprintf(
"prob(%s | %s)=%g\n", unichar_ptr, context_ptr,
955 ++(*unichar_step_len);
961 if (unichar_ptr < unichar_end) {
962 if (modified_context ==
nullptr) {
963 size_t context_len = strlen(context);
965 new char[context_len + strlen(unichar_ptr) + step + 1];
966 memcpy(modified_context, context, context_len);
967 modified_context_end = modified_context + context_len;
968 context_ptr = modified_context;
970 strncpy(modified_context_end, unichar_ptr - step, step);
971 modified_context_end += step;
972 *modified_context_end =
'\0';
975 prob /=
static_cast<float>(*unichar_step_len);
978 *found_small_prob =
true;
981 *ngram_cost = -1.0*log2(prob);
982 float ngram_and_classifier_cost =
986 tprintf(
"-log [ p(%s) * p(%s | %s) ] = -log2(%g*%g) = %g\n", unichar,
988 ngram_and_classifier_cost);
990 delete[] modified_context;
991 return ngram_and_classifier_cost;
995 if (curr_list->empty())
return 1.0f;
998 BLOB_CHOICE_IT c_it(curr_list);
999 for (c_it.mark_cycle_pt(); !c_it.cycled_list(); c_it.forward()) {
1033 consistency_info->
punc_ref = NO_EDGE;
1036 bool prev_is_numalpha = (parent_b !=
nullptr &&
1042 (is_apos && prev_is_numalpha)) ?
1044 if (consistency_info->
punc_ref == NO_EDGE ||
1052 node, pattern_unichar_id, word_end) : NO_EDGE;
1053 if (consistency_info->
punc_ref == NO_EDGE) {
1068 }
else if ((parent_b !=
nullptr) && unicharset.
get_isupper(unichar_id)) {
1091 if (parent_vse !=
nullptr &&
1097 consistency_info->
script_id = parent_script_id;
1099 if (consistency_info->
script_id != parent_script_id) {
1115 int fontinfo_id = -1;
1124 tprintf(
"pfont %s pfont %s font %s font2 %s common %s(%d)\n",
1136 bool expected_gap_found =
false;
1137 float expected_gap = 0.0f;
1139 if (fontinfo_id >= 0) {
1140 ASSERT_HOST(fontinfo_id < fontinfo_table_->size());
1142 parent_b->
unichar_id(), unichar_id, &temp_gap)) {
1143 expected_gap = temp_gap;
1144 expected_gap_found =
true;
1149 int num_addends = 0;
1151 for (
int i = 0; i < 4; ++i) {
1154 }
else if (i == 1) {
1156 }
else if (i == 2) {
1161 ASSERT_HOST(temp_fid < 0 || fontinfo_table_->size());
1163 parent_b->
unichar_id(), unichar_id, &temp_gap)) {
1164 expected_gap += temp_gap;
1168 if (num_addends > 0) {
1169 expected_gap /=
static_cast<float>(num_addends);
1170 expected_gap_found =
true;
1173 if (expected_gap_found) {
1174 int actual_gap = word_res->
GetBlobsGap(curr_col-1);
1175 if (actual_gap == 0) {
1178 float gap_ratio = expected_gap / actual_gap;
1184 if (gap_ratio < 0.0f || gap_ratio > 2.0f) {
1189 tprintf(
"spacing for %s(%d) %s(%d) col %d: expected %g actual %d\n",
1192 unichar_id, curr_col, expected_gap, actual_gap);
1206 tprintf(
"ComputeAdjustedPathCost %g ParamsModel features:\n", cost);
1209 tprintf(
"%s=%g\n", kParamsTrainingFeatureTypeName[f], features[f]);
1215 float adjustment = 1.0f;
1228 static_cast<float>(vse->
length);
1249 blamer_bundle, &truth_path);
1257 word->
print(
"UpdateBestChoice() constructed word");
1261 if (blamer_bundle !=
nullptr) {
1268 tprintf(
"Raw features extracted from %s (cost=%g) [ ",
1270 for (
float feature : curr_hyp.
features) {
1292 tprintf(
"Updated raw choice\n");
1316 best_choice_bundle->
updated =
true;
1317 best_choice_bundle->
best_vse = vse;
1319 tprintf(
"Updated best choice\n");
1331 if (blamer_bundle !=
nullptr) {
1345 int len = vse.
length <= kMaxSmallWordUnichars ? 0 :
1346 vse.
length <= kMaxMediumWordUnichars ? 1 : 2;
1396 if (truth_path !=
nullptr) {
1398 (blamer_bundle !=
nullptr &&
1409 float full_wh_ratio_mean = 0.0f;
1413 static_cast<float>(vse->
length));
1419 word->set_length(vse->
length);
1420 int total_blobs = 0;
1421 for (i = (vse->
length-1); i >= 0; --i) {
1422 if (blamer_bundle !=
nullptr && truth_path !=
nullptr && *truth_path &&
1424 *truth_path =
false;
1428 total_blobs += num_blobs;
1429 word->set_blob_choice(i, num_blobs, curr_b);
1433 if ((full_wh_ratio_mean != 0.0f &&
1434 ((curr_vse != vse && curr_vse->
parent_vse !=
nullptr) ||
1439 tprintf(
"full_wh_ratio_var += (%g-%g)^2\n",
1452 if (curr_vse ==
nullptr)
break;
1453 curr_b = curr_vse->
curr_b;
1458 if (full_wh_ratio_mean != 0.0f) {
double language_model_ngram_nonmatch_score
static int utf8_step(const char *utf8_str)
void DisplaySegmentation(TWERD *word)
static void ExtractFeaturesFromPath(const ViterbiStateEntry &vse, float features[])
DLLSYM void tprintf(const char *format,...)
void init_active_dawgs(DawgPositionVector *active_dawgs, bool ambigs_mode) const
#define BOOL_MEMBER(name, val, comment, vec)
virtual UNICHAR_ID edge_letter(EDGE_REF edge_ref) const =0
Returns UNICHAR_ID stored in the edge indicated by the given EDGE_REF.
int context_unichar_step_len
PointerVector< LanguageModelState > beam
int language_model_viterbi_list_max_size
static const LanguageModelFlagsType kSmallestRatingFlag
const UnicityTable< FontInfo > * fontinfo_table_
DawgPositionVector active_dawgs
void Print(const char *msg) const
float ComputeDenom(BLOB_CHOICE_LIST *curr_list)
int viterbi_state_entries_prunable_length
Number and max cost of prunable paths in viterbi_state_entries.
bool PrunablePath(const ViterbiStateEntry &vse)
bool get_ispunctuation(UNICHAR_ID unichar_id) const
GenericVector< TBLOB * > blobs
DawgPositionVector * updated_dawgs
void SetScriptPositions(bool small_caps, TWERD *word, int debug=0)
ParamsModel params_model_
bool get_isdigit(UNICHAR_ID unichar_id) const
int get_script(UNICHAR_ID unichar_id) const
bool get_islower(UNICHAR_ID unichar_id) const
bool correct_segmentation_explored_
DawgPositionVector very_beginning_active_dawgs_
float outline_length
length of the outline so far
#define INT_MEMBER(name, val, comment, vec)
int InconsistentXHeight() const
float features[PTRAIN_NUM_FEATURE_TYPES]
bool GuidedSegsearchStillGoing() const
static NODE_REF GetStartingNode(const Dawg *dawg, EDGE_REF edge_ref)
Returns the appropriate next node given the EDGE_REF.
float ComputeAdjustedPathCost(ViterbiStateEntry *vse)
bool language_model_ngram_space_delimited_language
int16_t fontinfo_id2() const
const MATRIX_COORD & matrix_cell()
bool get_isalpha(UNICHAR_ID unichar_id) const
LMConsistencyInfo consistency_info
path consistency info
int language_model_ngram_order
virtual bool end_of_word(EDGE_REF edge_ref) const =0
bool language_model_ngram_on
#define BOOL_INIT_MEMBER(name, val, comment, vec)
static const float kMaxAvgNgramCost
LanguageModelNgramInfo * ngram_info
void print_state(const char *msg) const
const char * string() const
double language_model_penalty_increment
void FillConsistencyInfo(int curr_col, bool word_end, BLOB_CHOICE *b, ViterbiStateEntry *parent_vse, WERD_RES *word_res, LMConsistencyInfo *consistency_info)
virtual EDGE_REF edge_char_of(NODE_REF node, UNICHAR_ID unichar_id, bool word_end) const =0
Returns the edge that corresponds to the letter out of this node.
AssociateStats associate_stats
character widths/gaps/seams
float CertaintyScore(float cert)
double language_model_penalty_non_freq_dict_word
void Print(const char *msg)
bool AcceptableChoice(const WERD_CHOICE &best_choice, XHeightConsistencyEnum xheight_consistency)
Returns true if the given best_choice is good enough to stop.
float viterbi_state_entries_prunable_max_cost
void adjust_word(WERD_CHOICE *word, bool nonword, XHeightConsistencyEnum xheight_consistency, float additional_adjust, bool modify_rating, bool debug)
Adjusts the rating of the given word.
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
int SetTopParentLowerUpperDigit(LanguageModelState *parent_node) const
UNICHAR_ID unichar_id() const
float ComputeConsistencyAdjustment(const LanguageModelDawgInfo *dawg_info, const LMConsistencyInfo &consistency_info)
double ProbabilityInContext(const char *context, int context_bytes, const char *character, int character_bytes)
Calls probability_in_context_ member function.
bool AcceptablePath(const ViterbiStateEntry &vse)
float ngram_and_classifier_cost
-[ ln(P_classifier(path)) + scale_factor * ln(P_ngram_model(path)) ]
void UpdateBestRating(float rating)
ViterbiStateEntry * competing_vse
void reset_hyphen_vars(bool last_word_on_line)
int language_model_debug_level
float BodyMinXHeight() const
float BodyMaxXHeight() const
DANGERR fixpt
Places to try to fix the word suggested by ambiguity checking.
static const LanguageModelFlagsType kLowerCaseFlag
bool updated
set to true if the entry has just been created/updated
bool get_isupper(UNICHAR_ID unichar_id) const
int LetterIsOkay(void *void_dawg_args, const UNICHARSET &unicharset, UNICHAR_ID unichar_id, bool word_end) const
Calls letter_is_okay_ member function.
double language_model_ngram_small_prob
bool NoDangerousAmbig(WERD_CHOICE *BestChoice, DANGERR *fixpt, bool fix_replaceable, MATRIX *ratings)
void AddHypothesis(const tesseract::ParamsTrainingHypothesis &hypo)
static const UNICHAR_ID kPatternUnicharID
static int Compare(const void *e1, const void *e2)
int language_model_min_compound_length
void set_best_choice_is_dict_and_top_choice(bool value)
int GetBlobsGap(int blob_index)
LanguageModel(const UnicityTable< FontInfo > *fontinfo_table, Dict *dict)
void default_dawgs(DawgPositionVector *anylength_dawgs, bool suppress_patterns) const
int NumInconsistentChartype() const
float ComputeNgramCost(const char *unichar, float certainty, float denom, const char *context, int *unichar_step_len, bool *found_small_prob, float *ngram_prob)
void string_and_lengths(STRING *word_str, STRING *word_lengths_str) const
ViterbiStateEntry_LIST viterbi_state_entries
Storage for the Viterbi state.
int length
number of characters on the path
GenericVector< int > blob_widths
LanguageModelDawgInfo * dawg_info
ViterbiStateEntry * parent_vse
int viterbi_state_entries_length
Total number of entries in viterbi_state_entries.
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
int NumInconsistentSpaces() const
bool UpdateState(bool just_classified, int curr_col, int curr_row, BLOB_CHOICE_LIST *curr_list, LanguageModelState *parent_node, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
static const LanguageModelFlagsType kDigitFlag
void GenerateTopChoiceInfo(ViterbiStateEntry *new_vse, const ViterbiStateEntry *parent_vse, LanguageModelState *lms)
void UpdateBestChoice(ViterbiStateEntry *vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
int wordrec_display_segmentations
ViterbiStateEntry * best_vse
Best ViterbiStateEntry and BLOB_CHOICE.
void InitForWord(const WERD_CHOICE *prev_word, bool fixed_pitch, float max_char_wh_ratio, float rating_cert_scale)
void ComputeAssociateStats(int col, int row, float max_char_wh_ratio, ViterbiStateEntry *parent_vse, WERD_RES *word_res, AssociateStats *associate_stats)
Struct to store information maintained by various language model components.
bool compound_marker(UNICHAR_ID unichar_id)
WERD_CHOICE * ConstructWord(ViterbiStateEntry *vse, WERD_RES *word_res, DANGERR *fixpt, BlamerBundle *blamer_bundle, bool *truth_path)
bool GetTopLowerUpperDigit(BLOB_CHOICE_LIST *curr_list, BLOB_CHOICE **first_lower, BLOB_CHOICE **first_upper, BLOB_CHOICE **first_digit) const
const STRING & unichar_string() const
DawgPositionVector beginning_active_dawgs_
const UNICHARSET & getUnicharset() const
const Dawg * GetDawg(int index) const
Return i-th dawg pointer recorded in the dawgs_ vector.
LanguageModelDawgInfo * GenerateDawgInfo(bool word_end, int curr_col, int curr_row, const BLOB_CHOICE &b, const ViterbiStateEntry *parent_vse)
void set_rating(float new_val)
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
static const LanguageModelFlagsType kUpperCaseFlag
double language_model_ngram_rating_factor
float full_wh_ratio_total
double language_model_penalty_non_dict_word
unsigned char LanguageModelFlagsType
Used for expressing various language model flags.
const char * id_to_unichar(UNICHAR_ID id) const
DawgPositionVector * active_dawgs
bool LogNewRawChoice(WERD_CHOICE *word_choice)
#define double_MEMBER(name, val, comment, vec)
int correct_segmentation_length() const
bool updated
Flag to indicate whether anything was changed.
int language_model_viterbi_list_max_num_prunable
BLOB_CHOICE * curr_b
Pointers to BLOB_CHOICE and parent ViterbiStateEntry (not owned by this).
static const float kBadRating
ViterbiStateEntry * GetNextParentVSE(bool just_classified, bool mixed_alnum, const BLOB_CHOICE *bc, LanguageModelFlagsType blob_choice_flags, const UNICHARSET &unicharset, WERD_RES *word_res, ViterbiStateEntry_IT *vse_it, LanguageModelFlagsType *top_choice_flags) const
bool AddViterbiStateEntry(LanguageModelFlagsType top_choice_flags, float denom, bool word_end, int curr_col, int curr_row, BLOB_CHOICE *b, LanguageModelState *curr_state, ViterbiStateEntry *parent_vse, LMPainPoints *pain_points, WERD_RES *word_res, BestChoiceBundle *best_choice_bundle, BlamerBundle *blamer_bundle)
static float ComputeOutlineLength(float rating_cert_scale, const BLOB_CHOICE &b)
int prev_word_unichar_step_len_
int num_inconsistent_spaces
bool PosAndSizeAgree(const BLOB_CHOICE &other, float x_height, bool debug) const
bool acceptable_choice_found_
const Dawg * GetPuncDawg() const
Return the points to the punctuation dawg.
bool hyphenated() const
Returns true if we've recorded the beginning of a hyphenated word.
int tessedit_truncate_wordchoice_log
float ComputeCost(const float features[]) const
int16_t fontinfo_id() const
void set_hyphen_word(const WERD_CHOICE &word, const DawgPositionVector &active_dawgs)
void ComputeXheightConsistency(const BLOB_CHOICE *b, bool is_punc)
bool language_model_ngram_use_only_first_uft8_step
static const LanguageModelFlagsType kXhtConsistentFlag
float ngram_cost
-ln(P_ngram_model(path))
bool LogNewCookedChoice(int max_num_choices, bool debug, WERD_CHOICE *word_choice)
const UNICHARSET * uch_set
bool has_hyphen_end(const UNICHARSET *unicharset, UNICHAR_ID unichar_id, bool first_pos) const
Check whether the word has a hyphen at the end.
bool MatrixPositionCorrect(int index, const MATRIX_COORD &coord)
int NumInconsistentCase() const
Bundle together all the things pertaining to the best choice/state.
WERD_CHOICE * best_choice
bool HasAlnumChoice(const UNICHARSET &unicharset)
LanguageModelFlagsType top_choice_flags
LanguageModelNgramInfo * GenerateNgramInfo(const char *unichar, float certainty, float denom, int curr_col, int curr_row, float outline_length, const ViterbiStateEntry *parent_vse)
XHeightConsistencyEnum xht_decision
double language_model_ngram_scale_factor
float ratings_sum
sum of ratings of character on the path
bool is_apostrophe(UNICHAR_ID unichar_id)
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const
float min_certainty
minimum certainty on the path