/* Copyright (C) 2014-2016 Quinten Lansu Permission is hereby granted, free of charge, to any person obtaining a copy of this software and associated documentation files (the "Software"), to deal in the Software without restriction, including without limitation the rights to use, copy, modify, merge, publish, distribute, sublicense, and/or sell copies of the Software, and to permit persons to whom the Software is furnished to do so, subject to the following conditions: The above copyright notice and this permission notice shall be included in all copies or substantial portions of the Software. THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE SOFTWARE. */ #ifndef _UTF8REWIND_INTERNAL_CODEPOINT_H_ #define _UTF8REWIND_INTERNAL_CODEPOINT_H_ /*! \file \brief Codepoint interface. \cond INTERNAL */ #include "utf8rewind.h" /*! \addtogroup internal Internal functions and definitions \{ */ /*! \def MAX_BASIC_LATIN \brief The last codepoint part of Basic Latin (U+0000 - U+007F). */ #define MAX_BASIC_LATIN 0x007F /*! \def MAX_LATIN_1 \brief The last codepoint part of Latin-1 Supplement (U+0080 - U+00FF). */ #define MAX_LATIN_1 0x00FF /*! \def MAX_BASIC_MULTILINGUAL_PLANE \brief The last legal codepoint in the Basic Multilingual Plane (BMP). */ #define MAX_BASIC_MULTILINGUAL_PLANE 0xFFFF /*! \def MAX_LEGAL_UNICODE \brief The last legal codepoint in Unicode. */ #define MAX_LEGAL_UNICODE 0x10FFFF /*! \def REPLACEMENT_CHARACTER \brief The codepoint used to replace illegal codepoints. */ #define REPLACEMENT_CHARACTER 0xFFFD /*! \def REPLACEMENT_CHARACTER_STRING \brief The replacement character as a UTF-8 encoded string. */ #define REPLACEMENT_CHARACTER_STRING "\xEF\xBF\xBD" /*! \def REPLACEMENT_CHARACTER_STRING_LENGTH \brief Length of the UTF-8 encoded string of the replacment character. */ #define REPLACEMENT_CHARACTER_STRING_LENGTH 3 /*! \def SURROGATE_HIGH_START \brief The minimum codepoint for the high member of a surrogate pair. */ #define SURROGATE_HIGH_START 0xD800 /*! \def SURROGATE_HIGH_END \brief The maximum codepoint for the high member of a surrogate pair. */ #define SURROGATE_HIGH_END 0xDBFF /*! \def SURROGATE_LOW_START \brief The minimum codepoint for the low member of a surrogate pair. */ #define SURROGATE_LOW_START 0xDC00 /*! \def SURROGATE_LOW_END \brief The maximum codepoint for the low member of a surrogate pair. */ #define SURROGATE_LOW_END 0xDFFF /*! \def HANGUL_JAMO_FIRST \brief The first codepoint part of the Hangul Jamo block. */ #define HANGUL_JAMO_FIRST 0x1100 /*! \def HANGUL_JAMO_LAST \brief The last codepoint part of the Hangul Jamo block. */ #define HANGUL_JAMO_LAST 0x11FF /*! \def HANGUL_L_FIRST \brief The first codepoint part of the Hangul Jamo L section used for normalization. */ #define HANGUL_L_FIRST 0x1100 /*! \def HANGUL_L_LAST \brief The last codepoint part of the Hangul Jamo L section used for normalization. */ #define HANGUL_L_LAST 0x1112 /*! \def HANGUL_L_COUNT \brief The number of codepoints in the Hangul Jamo L section. */ #define HANGUL_L_COUNT 19 /*! \def HANGUL_V_FIRST \brief The first codepoint part of the Hangul Jamo V section used for normalization. */ #define HANGUL_V_FIRST 0x1161 /*! \def HANGUL_V_LAST \brief The last codepoint part of the Hangul Jamo V section used for normalization. */ #define HANGUL_V_LAST 0x1175 /*! \def HANGUL_V_COUNT \brief The number of codepoints in the Hangul Jamo V section. */ #define HANGUL_V_COUNT 21 /*! \def HANGUL_T_FIRST \brief The first codepoint part of the Hangul Jamo T section used for normalization. */ #define HANGUL_T_FIRST 0x11A7 /*! \def HANGUL_T_LAST \brief The last codepoint part of the Hangul Jamo V section used for normalization. */ #define HANGUL_T_LAST 0x11C2 /*! \def HANGUL_T_COUNT \brief The number of codepoints in the Hangul Jamo T section. */ #define HANGUL_T_COUNT 28 /*! \def HANGUL_N_COUNT \brief Number of codepoints part of the Hangul Jamo V and T sections. */ #define HANGUL_N_COUNT 588 /* VCount * TCount */ /*! \def HANGUL_S_FIRST \brief The first codepoint in the Hangul Syllables block. */ #define HANGUL_S_FIRST 0xAC00 /*! \def HANGUL_S_LAST \brief The last codepoint in the Hangul Syllables block. */ #define HANGUL_S_LAST 0xD7A3 /*! \def HANGUL_S_COUNT \brief The number of codepoints in the Hangul Syllables block. */ #define HANGUL_S_COUNT 11172 /* LCount * NCount */ #define CP_LATIN_CAPITAL_LETTER_I 0x0049 #define CP_LATIN_CAPITAL_LETTER_J 0x004A #define CP_LATIN_SMALL_LETTER_I 0x0069 #define CP_LATIN_SMALL_LETTER_J 0x006A #define CP_LATIN_CAPITAL_LETTER_I_WITH_GRAVE 0x00CC #define CP_LATIN_CAPITAL_LETTER_I_WITH_ACUTE 0x00CD #define CP_LATIN_CAPITAL_LETTER_I_WITH_TILDE 0x0128 #define CP_LATIN_CAPITAL_LETTER_I_WITH_OGONEK 0x012E #define CP_LATIN_SMALL_LETTER_I_WITH_OGONEK 0x012F #define CP_LATIN_CAPITAL_LETTER_I_WITH_DOT_ABOVE 0x0130 #define CP_LATIN_SMALL_LETTER_DOTLESS_I 0x0131 #define CP_COMBINING_GRAVE_ACCENT 0x0300 #define CP_COMBINING_ACUTE_ACCENT 0x0301 #define CP_COMBINING_TILDE_ACCENT 0x0303 #define CP_COMBINING_DOT_ABOVE 0x0307 #define CP_COMBINING_GREEK_YPOGEGRAMMENI 0x0345 #define CP_COMBINING_GRAPHEME_JOINER 0x034F #define CP_GREEK_CAPITAL_LETTER_SIGMA 0x03A3 #define CCC_NOT_REORDERED 0 #define CCC_OVERLAY 1 #define CCC_NUKTA 7 #define CCC_KANA_VOICING 8 #define CCC_VIRAMA 9 #define CCC_FIXED_POSITION_START 10 #define CCC_FIXED_POSITION_END 199 #define CCC_ATTACHED_BELOW_LEFT 200 #define CCC_ATTACHED_BELOW 202 #define CCC_ATTACHED_BOTTOM_RIGHT 204 #define CCC_ATTACHED_LEFT 208 #define CCC_ATTACHED_RIGHT 210 #define CCC_ATTACHED_TOP_LEFT 212 #define CCC_ATTACHED_ABOVE 214 #define CCC_ATTACHED_ABOVE_RIGHT 216 #define CCC_BELOW_LEFT 218 #define CCC_BELOW 220 #define CCC_BELOW_RIGHT 222 #define CCC_LEFT 224 #define CCC_RIGHT 226 #define CCC_ABOVE_LEFT 228 #define CCC_ABOVE 230 #define CCC_ABOVE_RIGHT 232 #define CCC_DOUBLE_BELOW 233 #define CCC_DOUBLE_ABOVE 234 #define CCC_IOTA_SUBSCRIPT 240 #define CCC_INVALID 255 /*! \brief Get the number of bytes used for encoding a code point. \param[in] byte Encoded byte \return Number of bytes needed for decoding or 0 if input is illegal. */ extern const uint8_t codepoint_decoded_length[256]; /*! \brief Write Unicode code point to UTF-8 encoded string. Target buffer and size is modified by encoded size. \param[in] encoded Unicode code point \param[in,out] target Target buffer \param[in,out] targetSize Size of output buffer in bytes \return Bytes needed for encoding or 0 on error. */ uint8_t codepoint_write(unicode_t encoded, char** target, size_t* targetSize); /*! \brief Read Unicode code point from UTF-8 encoded string. \param[in] input Input buffer \param[in] inputSize Size of input buffer in bytes \param[out] decoded Unicode codepoint \return Bytes read from string or 0 on error. */ uint8_t codepoint_read(const char* input, size_t inputSize, unicode_t* decoded); /*! \} */ /*! \endcond */ #endif /* _UTF8REWIND_INTERNAL_CODEPOINT_H_ */