19 #ifndef TESSERACT_CCUTIL_UNICHARSET_H_ 20 #define TESSERACT_CCUTIL_UNICHARSET_H_ 58 inline void set_all(
const char *unichar,
int pos,
int total,
bool natural) {
65 strncpy(this->unichar, uch,
sizeof(this->unichar));
68 inline void set_pos(
int p) { this->pos = p; }
70 inline const char*
get_unichar()
const {
return this->unichar; }
71 inline int get_pos()
const {
return this->pos; }
72 inline int get_total()
const {
return this->total; }
80 return to_string(unichar, pos, total, natural);
// Field-wise comparison against explicitly supplied values.
// Returns true only when all three hold:
//   - strcmp() reports this->unichar and other_unichar as identical strings,
//   - this->pos == other_pos,
//   - this->total == other_total.
// other_unichar is passed straight to strcmp(), so it must be a valid
// NUL-terminated string (no null check is performed here).
85 inline bool equals(
const char *other_unichar,
86 int other_pos,
int other_total)
const {
87 return (strcmp(this->unichar, other_unichar) == 0 &&
88 this->pos == other_pos && this->total == other_total);
99 return (strcmp(this->unichar, fragment->
get_unichar()) == 0 &&
101 this->pos == fragment->
get_pos() + 1);
108 inline bool is_ending()
const {
return this->pos == this->total-1; }
180 #ifndef U_HIDE_DEPRECATED_API 182 #endif // U_HIDE_DEPRECATED_API 203 int step(
const char* str)
const;
228 int* encoded_length)
const;
249 static std::string
CleanupString(
const char* utf8_str,
size_t length);
271 if (cleaned != unichar_repr) {
274 int old_size =
size();
276 if (
size() == old_size) {
285 return unichar_id != INVALID_UNICHAR_ID && unichar_id < size_used &&
295 bool eq(
UNICHAR_ID unichar_id,
const char*
const unichar_repr)
const;
299 for (
int i = 0; i < size_used; ++i) {
300 delete unichars[i].properties.fragment;
301 unichars[i].properties.fragment =
nullptr;
307 if (script_table !=
nullptr) {
308 for (
int i = 0; i < script_table_size_used; ++i)
309 delete[] script_table[i];
310 delete[] script_table;
311 script_table =
nullptr;
312 script_table_size_used = 0;
314 if (unichars !=
nullptr) {
319 script_table_size_reserved = 0;
323 top_bottom_set_ =
false;
324 script_has_upper_lower_ =
false;
325 script_has_xheight_ =
false;
326 old_style_included_ =
false;
346 void reserve(
int unichars_number);
351 FILE* file = fopen(filename,
"w+b");
352 if (file ==
nullptr)
return false;
379 bool skip_fragments);
389 FILE* file = fopen(filename,
"rb");
390 if (file ==
nullptr)
return false;
428 const char* unblacklist);
432 unichars[unichar_id].properties.isalpha = value;
437 unichars[unichar_id].properties.islower = value;
442 unichars[unichar_id].properties.isupper = value;
447 unichars[unichar_id].properties.isdigit = value;
452 unichars[unichar_id].properties.ispunctuation = value;
457 unichars[unichar_id].properties.isngram = value;
463 unichars[unichar_id].properties.script_id =
add_script(value);
468 unichars[unichar_id].properties.other_case = other_case;
473 unichars[unichar_id].properties.direction = value;
478 unichars[unichar_id].properties.mirror = mirror;
483 unichars[unichar_id].properties.normed = normed;
484 unichars[unichar_id].properties.normed_ids.truncate(0);
492 if (INVALID_UNICHAR_ID == unichar_id)
return false;
494 return unichars[unichar_id].properties.isalpha;
499 if (INVALID_UNICHAR_ID == unichar_id)
return false;
501 return unichars[unichar_id].properties.islower;
506 if (INVALID_UNICHAR_ID == unichar_id)
return false;
508 return unichars[unichar_id].properties.isupper;
513 if (INVALID_UNICHAR_ID == unichar_id)
return false;
515 return unichars[unichar_id].properties.isdigit;
520 if (INVALID_UNICHAR_ID == unichar_id)
return false;
522 return unichars[unichar_id].properties.ispunctuation;
527 if (INVALID_UNICHAR_ID == unichar_id)
return false;
529 return unichars[unichar_id].properties.isngram;
538 return top_bottom_set_;
569 int* min_bottom,
int* max_bottom,
570 int* min_top,
int* max_top)
const {
571 if (INVALID_UNICHAR_ID == unichar_id) {
572 *min_bottom = *min_top = 0;
573 *max_bottom = *max_top = 256;
577 *min_bottom = unichars[unichar_id].properties.min_bottom;
578 *max_bottom = unichars[unichar_id].properties.max_bottom;
579 *min_top = unichars[unichar_id].properties.min_top;
580 *max_top = unichars[unichar_id].properties.max_top;
583 int min_bottom,
int max_bottom,
584 int min_top,
int max_top) {
585 unichars[unichar_id].properties.min_bottom =
586 ClipToRange<int>(min_bottom, 0, UINT8_MAX);
587 unichars[unichar_id].properties.max_bottom =
588 ClipToRange<int>(max_bottom, 0, UINT8_MAX);
589 unichars[unichar_id].properties.min_top =
590 ClipToRange<int>(min_top, 0, UINT8_MAX);
591 unichars[unichar_id].properties.max_top =
592 ClipToRange<int>(max_top, 0, UINT8_MAX);
597 float* width,
float* width_sd)
const {
598 if (INVALID_UNICHAR_ID == unichar_id) {
604 *width = unichars[unichar_id].properties.width;
605 *width_sd = unichars[unichar_id].properties.width_sd;
608 unichars[unichar_id].properties.width = width;
609 unichars[unichar_id].properties.width_sd = width_sd;
614 float* bearing,
float* bearing_sd)
const {
615 if (INVALID_UNICHAR_ID == unichar_id) {
616 *bearing = *bearing_sd = 0.0f;
620 *bearing = unichars[unichar_id].properties.bearing;
621 *bearing_sd = unichars[unichar_id].properties.bearing_sd;
624 float bearing,
float bearing_sd) {
625 unichars[unichar_id].properties.bearing = bearing;
626 unichars[unichar_id].properties.bearing_sd = bearing_sd;
631 float* advance,
float* advance_sd)
const {
632 if (INVALID_UNICHAR_ID == unichar_id) {
633 *advance = *advance_sd = 0;
637 *advance = unichars[unichar_id].properties.advance;
638 *advance_sd = unichars[unichar_id].properties.advance_sd;
641 float advance,
float advance_sd) {
642 unichars[unichar_id].properties.advance = advance;
643 unichars[unichar_id].properties.advance_sd = advance_sd;
647 return unichars[unichar_id].properties.AnyRangeEmpty();
653 if (INVALID_UNICHAR_ID == unichar_id)
return true;
655 return script_id != han_sid_ && script_id != thai_sid_ &&
656 script_id != hangul_sid_ && script_id != hiragana_sid_ &&
657 script_id != katakana_sid_;
664 if (INVALID_UNICHAR_ID == unichar_id)
return null_sid_;
666 return unichars[unichar_id].properties.script_id;
684 if (INVALID_UNICHAR_ID == unichar_id)
return INVALID_UNICHAR_ID;
686 return unichars[unichar_id].properties.other_case;
693 return unichars[unichar_id].properties.direction;
698 if (INVALID_UNICHAR_ID == unichar_id)
return INVALID_UNICHAR_ID;
700 return unichars[unichar_id].properties.mirror;
705 if (INVALID_UNICHAR_ID == unichar_id)
return INVALID_UNICHAR_ID;
707 if (unichars[unichar_id].properties.islower)
return unichar_id;
708 return unichars[unichar_id].properties.other_case;
713 if (INVALID_UNICHAR_ID == unichar_id)
return INVALID_UNICHAR_ID;
715 if (unichars[unichar_id].properties.isupper)
return unichar_id;
716 return unichars[unichar_id].properties.other_case;
735 if (INVALID_UNICHAR_ID == unichar_id)
return nullptr;
737 return unichars[unichar_id].properties.fragment;
785 if (unichar_repr ==
nullptr || unichar_repr[0] ==
'\0' ||
786 !ids.
contains(unichar_repr,
false)) {
830 return unichars[unichar_id].properties.normed.string();
836 return unichars[unichar_id].properties.normed_ids;
850 return script_table_size_used;
855 if (
id >= script_table_size_used ||
id < 0)
857 return script_table[id];
869 return script == null_script;
880 return unichars[unichar_id].properties.enabled;
898 return script_has_upper_lower_;
905 return script_has_xheight_;
910 struct UNICHAR_PROPERTIES {
911 UNICHAR_PROPERTIES();
916 void SetRangesOpen();
918 void SetRangesEmpty();
921 bool AnyRangeEmpty()
const;
923 void ExpandRangesFrom(
const UNICHAR_PROPERTIES& src);
925 void CopyFrom(
const UNICHAR_PROPERTIES& src);
972 struct UNICHAR_SLOT {
974 UNICHAR_PROPERTIES properties;
988 void encode_string(
const char* str,
int str_index,
int str_length,
991 int* best_total_length,
1000 bool GetStrProperties(
const char* utf8_str,
1001 UNICHAR_PROPERTIES* props)
const;
1007 bool skip_fragments);
1012 static const char* kCleanupMaps[][2];
1013 static TESS_API const char* null_script;
1015 UNICHAR_SLOT* unichars;
1019 char** script_table;
1020 int script_table_size_used;
1021 int script_table_size_reserved;
1023 bool top_bottom_set_;
1025 bool script_has_upper_lower_;
1028 bool script_has_xheight_;
1030 bool old_style_included_;
1049 #endif // TESSERACT_CCUTIL_UNICHARSET_H_ const char * get_script_from_script_id(int id) const
unsigned int get_properties(UNICHAR_ID unichar_id) const
void set_top_bottom(UNICHAR_ID unichar_id, int min_bottom, int max_bottom, int min_top, int max_top)
void AppendOtherUnicharset(const UNICHARSET &src)
void set_black_and_whitelist(const char *blacklist, const char *whitelist, const char *unblacklist)
bool save_to_string(STRING *str) const
bool get_isupper(const char *const unichar_repr, int length) const
int get_script_id_from_name(const char *script_name) const
void unichar_insert_backwards_compatible(const char *const unichar_repr)
void set_unichar(const char *uch)
void CopyFrom(const UNICHARSET &src)
bool is_continuation_of(const CHAR_FRAGMENT *fragment) const
bool get_ispunctuation(const char *const unichar_repr, int length) const
const char * get_unichar() const
bool get_ispunctuation(UNICHAR_ID unichar_id) const
bool get_isalpha(const char *const unichar_repr, int length) const
UNICHAR_ID to_upper(UNICHAR_ID unichar_id) const
bool save_to_file(tesseract::TFile *file) const
void set_isupper(UNICHAR_ID unichar_id, bool value)
void unichar_insert(const char *const unichar_repr, OldUncleanUnichars old_style)
void ExpandRangesFromOther(const UNICHARSET &src)
bool get_isdigit(UNICHAR_ID unichar_id) const
int get_script(UNICHAR_ID unichar_id) const
bool get_islower(UNICHAR_ID unichar_id) const
bool top_bottom_useful() const
bool get_isngram(UNICHAR_ID unichar_id) const
char get_chartype(const char *const unichar_repr) const
void set_bearing_stats(UNICHAR_ID unichar_id, float bearing, float bearing_sd)
bool Serialize(FILE *fp, const char *data, size_t n)
void set_isngram(UNICHAR_ID unichar_id, bool value)
bool get_isalpha(UNICHAR_ID unichar_id) const
static CHAR_FRAGMENT * parse_from_string(const char *str)
void set_other_case(UNICHAR_ID unichar_id, UNICHAR_ID other_case)
int get_script(const char *const unichar_repr, int length) const
void set_natural(bool value)
bool get_isdigit(const char *const unichar_repr, int length) const
void SetPropertiesFromOther(const UNICHARSET &src)
char get_chartype(UNICHAR_ID unichar_id) const
bool get_islower(const char *const unichar_repr) const
bool get_islower(const char *const unichar_repr, int length) const
int get_script(const char *const unichar_repr) const
UNICHAR_ID get_other_case(UNICHAR_ID unichar_id) const
void set_all(const char *unichar, int pos, int total, bool natural)
bool eq(UNICHAR_ID unichar_id, const char *const unichar_repr) const
void get_bearing_stats(UNICHAR_ID unichar_id, float *bearing, float *bearing_sd) const
void set_width_stats(UNICHAR_ID unichar_id, float width, float width_sd)
void PartialSetPropertiesFromOther(int start_index, const UNICHARSET &src)
bool get_isalpha(const char *const unichar_repr) const
bool get_isprivate(UNICHAR_ID unichar_id) const
bool get_isupper(UNICHAR_ID unichar_id) const
static TESS_API const char * kSpecialUnicharCodes[SPECIAL_UNICHAR_CODES_COUNT]
bool contains_unichar_id(UNICHAR_ID unichar_id) const
void delete_pointers_in_unichars()
UNICHAR_ID to_lower(UNICHAR_ID unichar_id) const
void set_isalpha(UNICHAR_ID unichar_id, bool value)
bool load_from_file(const char *const filename)
bool get_isupper(const char *const unichar_repr) const
void set_advance_stats(UNICHAR_ID unichar_id, float advance, float advance_sd)
void set_ispunctuation(UNICHAR_ID unichar_id, bool value)
bool script_has_xheight() const
bool equals(const char *other_unichar, int other_pos, int other_total) const
static STRING debug_utf8_str(const char *str)
void unichar_insert(const char *const unichar_repr)
UNICHAR_ID unichar_to_id(const char *const unichar_repr) const
const CHAR_FRAGMENT * get_fragment(UNICHAR_ID unichar_id) const
STRING debug_str(UNICHAR_ID id) const
const char * id_to_unichar_ext(UNICHAR_ID id) const
bool has_special_codes() const
bool contains_unichar(const char *const unichar_repr) const
static const int kMaxChunks
void reserve(int unichars_number)
bool script_has_upper_lower() const
bool Serialize(const char *data, size_t count=1)
const GenericVector< UNICHAR_ID > & normed_ids(UNICHAR_ID unichar_id) const
void set_normed(UNICHAR_ID unichar_id, const char *normed)
bool AnyRepeatedUnicodes() const
bool is_null_script(const char *script) const
Direction get_direction(UNICHAR_ID unichar_id) const
const char * id_to_unichar(UNICHAR_ID id) const
bool is_beginning() const
bool contains(const char *const unichar_repr, int length) const
void get_top_bottom(UNICHAR_ID unichar_id, int *min_bottom, int *max_bottom, int *min_top, int *max_top) const
bool save_to_file(FILE *file) const
bool encodable_string(const char *str, int *first_bad_position) const
int add_script(const char *script)
void set_normed_ids(UNICHAR_ID unichar_id)
bool IsSpaceDelimited(UNICHAR_ID unichar_id) const
static TESS_API const char * kCustomLigatures[][2]
void set_script(UNICHAR_ID unichar_id, const char *value)
int get_script_table_size() const
void get_advance_stats(UNICHAR_ID unichar_id, float *advance, float *advance_sd) const
void set_islower(UNICHAR_ID unichar_id, bool value)
bool PropertiesIncomplete(UNICHAR_ID unichar_id) const
void get_width_stats(UNICHAR_ID unichar_id, float *width, float *width_sd) const
bool get_enabled(UNICHAR_ID unichar_id) const
bool equals(const CHAR_FRAGMENT *other) const
static std::string CleanupString(const char *utf8_str)
bool save_to_file(const char *const filename) const
bool get_isdigit(const char *const unichar_repr) const
bool load_from_inmemory_file(const char *const memory, int mem_size)
unsigned int get_properties(const char *const unichar_repr) const
bool load_from_file(FILE *file)
bool encode_string(const char *str, bool give_up_on_failure, GenericVector< UNICHAR_ID > *encoding, GenericVector< char > *lengths, int *encoded_length) const
const CHAR_FRAGMENT * get_fragment(const char *const unichar_repr) const
STRING debug_str(const char *unichar_repr) const
bool get_ispunctuation(const char *const unichar_repr) const
void set_mirror(UNICHAR_ID unichar_id, UNICHAR_ID mirror)
bool load_from_inmemory_file(const char *const memory, int mem_size, bool skip_fragments)
void set_direction(UNICHAR_ID unichar_id, UNICHARSET::Direction value)
int step(const char *str) const
UNICHAR_ID get_mirror(UNICHAR_ID unichar_id) const
const char * get_normed_unichar(UNICHAR_ID unichar_id) const
void set_isdigit(UNICHAR_ID unichar_id, bool value)
bool load_from_file(const char *const filename, bool skip_fragments)
bool major_right_to_left() const
bool SizesDistinct(UNICHAR_ID id1, UNICHAR_ID id2) const