ArkScript
A small, fast, functional and scripting language for video games
utf8.hpp
Go to the documentation of this file.
1#ifndef UTF8_DECODER_H
2#define UTF8_DECODER_H
3
4#include <array>
5#include <cstdint>
6
7namespace utf8
8{
// Classification of a code point by numeric range, as returned by utf8type().
// NOTE(review): this listing jumps from source line 12 to 15, so the
// enumerators that presumably carry the values 2 and 3 are not visible here.
9 enum class Utf8Type
10 {
11 Ascii = 0, // utf8type() returns this for code points in [0x0000, 0x007f]
12 LatinExtra = 1,
15 OutRange = 4 // utf8type() falls back to this when no range check returned
16 };
17
// Lookup data used by the hexadecimal parsing helpers of this header.
18 namespace details
19 {
20 // clang-format off
// Sentinel stored in the table for every character that is not a hex digit.
21 constexpr char no = static_cast<char>(-1);
// Maps a 7-bit ASCII code (used as the index) to the numeric value of the
// hexadecimal digit it represents. Each row below covers 16 character codes.
22 constexpr std::array<char, 128> ASCIIHexToInt =
23 {
24 no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, // 0x00-0x0f
25 no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, // 0x10-0x1f
26 no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, // 0x20-0x2f
27 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, no, no, no, no, no, no, // 0x30-0x3f: '0'..'9'
28 no, 10, 11, 12, 13, 14, 15, no, no, no, no, no, no, no, no, no, // 0x40-0x4f: 'A'..'F'
29 no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, // 0x50-0x5f
30 no, 10, 11, 12, 13, 14, 15, no, no, no, no, no, no, no, no, no, // 0x60-0x6f: 'a'..'f'
31 no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, no, // 0x70-0x7f
32 };
33 // clang-format on
34 }
35
/**
 * @brief Parse a string of hexadecimal digits into a code point and classify it by range
 * @details Each character contributes four bits, looked up in details::ASCIIHexToInt.
 *          NOTE(review): this listing skips source lines 53, 55 and 57, which
 *          presumably hold the return statements of the three middle range
 *          checks; do not rewrite this function from the listing alone.
 * @param input NUL-terminated string, read character by character until the terminator
 * @param out If not nullptr, receives the accumulated code point
 * @return Utf8Type::Ascii for [0x0000, 0x007f]; Utf8Type::OutRange when no range check returned
 */
36 inline Utf8Type utf8type(const char* input, int32_t* out = nullptr)
37 {
38 int32_t codepoint_ = 0;
// No shift is applied for the first digit; every later digit shifts by 4 bits.
39 int shift = 0;
40
41 for (const char* s = input; *s != 0; ++s)
42 {
// NOTE(review): the table has 128 entries and the index is the raw char
// value, so a character outside 0..127 indexes out of range; confirm that
// callers only pass ASCII. A non-hex ASCII character yields details::no.
43 codepoint_ = (codepoint_ << shift) | details::ASCIIHexToInt[static_cast<std::size_t>(*s)];
44 shift = 4;
45 }
46
// The code point is reported before classification, whatever range it is in.
47 if (out != nullptr)
48 *out = codepoint_;
49
50 if (codepoint_ >= 0x0000 && codepoint_ <= 0x007f)
51 return Utf8Type::Ascii;
52 if (codepoint_ > 0x007f && codepoint_ <= 0x07ff)
54 if (codepoint_ > 0x07ff && codepoint_ <= 0xffff)
56 if (codepoint_ > 0xffff && codepoint_ <= 0x10ffff)
58
59 return Utf8Type::OutRange;
60 }
61
62 /**
63 * @brief Convert hex string to utf8 string
 * @details The code point and its Utf8Type come from utf8type(). Outside the
 *          Ascii case, the output bytes are assembled from the values of the
 *          individual hex digits input[0..5], so the position of each digit
 *          matters as well as the resulting value.
 *          NOTE(review): input[0] to input[3] are read before the type is
 *          known, so the input is presumably expected to hold at least four
 *          characters; confirm against callers.
 *          NOTE(review): this listing skips source lines 85, 93, 102 and 127,
 *          which presumably hold the remaining case/default labels of the
 *          switch; do not rewrite this function from the listing alone.
64 * @param input NUL-terminated string of hexadecimal digits
65 * @param dest Output utf8 string (size [2,5]). Empty (\0) if input is invalid or out of range
66 */
67 inline void decode(const char* input, char* dest)
68 {
69 int32_t cdp = 0;
70 const Utf8Type type = utf8type(input, &cdp);
// Numeric values of the first four hex digits, used by the multi-byte cases.
71 const char c0 = details::ASCIIHexToInt[static_cast<std::size_t>(input[0])];
72 const char c1 = details::ASCIIHexToInt[static_cast<std::size_t>(input[1])];
73 const char c2 = details::ASCIIHexToInt[static_cast<std::size_t>(input[2])];
74 const char c3 = details::ASCIIHexToInt[static_cast<std::size_t>(input[3])];
75
76 switch (type)
77 {
78 case Utf8Type::Ascii:
79 {
// Single byte: the code point is the byte itself.
80 dest[0] = static_cast<char>(cdp);
81 dest[1] = 0;
82 break;
83 }
84
86 {
// Two output bytes (0xc0 lead, 0x80 continuation) built from digits c1..c3.
87 dest[0] = static_cast<char>((0xc0 | ((c1 & 0x7) << 2)) | ((c2 & 0xc) >> 2));
88 dest[1] = static_cast<char>((0x80 | ((c2 & 0x3) << 4)) | c3);
89 dest[2] = 0;
90 break;
91 }
92
94 {
// Three output bytes (0xe0 lead, two 0x80 continuations) built from c0..c3.
95 dest[0] = static_cast<char>(0xe0 | c0);
96 dest[1] = static_cast<char>((0x80 | (c1 << 2)) | ((c2 & 0xc) >> 2));
97 dest[2] = static_cast<char>((0x80 | ((c2 & 0x3) << 4)) | c3);
98 dest[3] = 0;
99 break;
100 }
101
103 {
// Four output bytes (0xf0 lead); a fifth hex digit is always needed here.
104 const char c4 = details::ASCIIHexToInt[static_cast<std::size_t>(input[4])];
105
106 if (cdp <= 0xfffff)
107 {
// Value fits in five hex digits: c0..c4.
108 dest[0] = static_cast<char>(0xf0 | ((c0 & 0xc) >> 2));
109 dest[1] = static_cast<char>((0x80 | ((c0 & 0x3) << 4)) | c1);
110 dest[2] = static_cast<char>((0x80 | (c2 << 2)) | ((c3 & 0xc) >> 2));
111 dest[3] = static_cast<char>((0x80 | ((c3 & 0x3) << 4)) | c4);
112 dest[4] = 0;
113 }
114 else
115 {
// Value needs six hex digits: c0..c5.
116 const char c5 = details::ASCIIHexToInt[static_cast<std::size_t>(input[5])];
117
118 dest[0] = static_cast<char>((0xf0 | ((c0 & 0x1) << 2)) | ((c1 & 0xc) >> 2));
// NOTE(review): the `(c1 & 0xc) >> 2` term below repeats bits already
// placed in dest[0]; it is zero for a six-digit input whose value is
// <= 0x10ffff (c1 is 0 then). Confirm whether it is intentional.
119 dest[1] = static_cast<char>(((0x80 | ((c1 & 0x3) << 4)) | ((c1 & 0xc) >> 2)) | c2);
120 dest[2] = static_cast<char>((0x80 | (c3 << 2)) | ((c4 & 0xc) >> 2));
121 dest[3] = static_cast<char>((0x80 | ((c4 & 0x3) << 4)) | c5);
122 dest[4] = 0;
123 }
124 break;
125 }
126
// Remaining path: produce an empty string.
128 *dest = 0;
129 break;
130 }
131 }
132
/**
 * @brief Check the validity of a given string in UTF8
 * @details Walks the string one encoded character at a time and rejects
 *          missing or stray continuation bytes, lead bytes outside the 1 to 4
 *          byte forms, overlong encodings, UTF-16 surrogates
 *          (U+D800..U+DFFF) and values above U+10FFFF.
 * @param str NUL-terminated byte string to check, may be nullptr
 * @return true if the given string is a valid UTF8 string, false otherwise
 *         (nullptr is reported as invalid, an empty string as valid)
 */
inline bool isValid(const char* str)
{
    if (str == nullptr)
        return false;

    // Inspect the bytes as unsigned values so that masks and comparisons do
    // not depend on whether plain char is signed on the target platform.
    const unsigned char* s = reinterpret_cast<const unsigned char*>(str);

    const auto isContinuation = [](const unsigned char byte) {
        return (byte & 0xc0) == 0x80;
    };

    while (*s != 0)
    {
        if ((*s & 0x80) == 0x00)
        {
            // 0xxxxxxx: plain ASCII
            s += 1;
        }
        else if ((*s & 0xe0) == 0xc0)
        {
            // 110xxxxx 10xxxxxx
            if (!isContinuation(s[1]))
                return false;

            const int32_t value = ((s[0] & 0x1f) << 6) | (s[1] & 0x3f);
            // overlong: the value would have fit in a single byte
            if (value < 0x80)
                return false;
            s += 2;
        }
        else if ((*s & 0xf0) == 0xe0)
        {
            // 1110xxxx 10xxxxxx 10xxxxxx
            // The checks short-circuit, so a NUL terminator stops the reads
            // before they can run past the end of the string.
            if (!isContinuation(s[1]) || !isContinuation(s[2]))
                return false;

            const int32_t value = ((s[0] & 0x0f) << 12) | ((s[1] & 0x3f) << 6) | (s[2] & 0x3f);
            // overlong: the value would have fit in two bytes
            if (value < 0x800)
                return false;
            // UTF-16 surrogates are not valid scalar values in UTF8
            if (value >= 0xd800 && value <= 0xdfff)
                return false;
            s += 3;
        }
        else if ((*s & 0xf8) == 0xf0)
        {
            // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
            if (!isContinuation(s[1]) || !isContinuation(s[2]) || !isContinuation(s[3]))
                return false;

            const int32_t value = ((s[0] & 0x07) << 18) | ((s[1] & 0x3f) << 12) | ((s[2] & 0x3f) << 6) | (s[3] & 0x3f);
            // overlong (fits in three bytes) or beyond the last code point
            if (value < 0x10000 || value > 0x10ffff)
                return false;
            s += 4;
        }
        else
        {
            // stray continuation byte, or a lead byte of an unsupported form
            return false;
        }
    }

    return true;
}
185
186 /**
187 * @brief Compute the UTF8 codepoint for a given UTF8 char
188 * @param str
189 * @return UTF8 codepoint if valid, -1 otherwise
190 */
191 inline int32_t codepoint(const char* str)
192 {
193 int32_t codepoint = 0;
194 const char* s = str;
195
196 if (isValid(str))
197 {
198 while (*s != 0)
199 {
200 if (0xf0 == (0xf8 & *s))
201 {
202 codepoint = ((0x07 & s[0]) << 18) | ((0x3f & s[1]) << 12) | ((0x3f & s[2]) << 6) | (0x3f & s[3]);
203 s += 4;
204 }
205 else if (0xe0 == (0xf0 & *s))
206 {
207 codepoint = ((0x0f & s[0]) << 12) | ((0x3f & s[1]) << 6) | (0x3f & s[2]);
208 s += 3;
209 }
210 else if (0xc0 == (0xe0 & *s))
211 {
212 codepoint = ((0x1f & s[0]) << 6) | (0x3f & s[1]);
213 s += 2;
214 }
215 else if (0x00 == (0x80 & *s))
216 {
217 codepoint = s[0];
218 ++s;
219 }
220 else
221 return -1;
222 }
223 }
224
225 return codepoint;
226 }
227
/**
 * @brief Generate an UTF8 character from a given codepoint
 * @param codepoint Code point to encode, expected in [0x0000, 0x10ffff]
 * @param dest Output utf8 string (size [2,5]). Empty (\0) if input is invalid or out of range
 */
inline void codepointToUtf8(const int32_t codepoint, char* dest)
{
    if (codepoint >= 0x0000 && codepoint <= 0x007f)
    {
        // 0xxxxxxx
        dest[0] = static_cast<char>(codepoint);
        dest[1] = 0;
    }
    else if (codepoint > 0x007f && codepoint <= 0x07ff)
    {
        // 110xxxxx 10xxxxxx
        // The lead byte of a two-byte sequence starts with 0xc0; a 0x80 prefix
        // would produce a continuation byte in lead position.
        dest[0] = static_cast<char>(0xc0 | (codepoint >> 6));
        dest[1] = static_cast<char>(0x80 | (codepoint & 0x3f));
        dest[2] = 0;
    }
    else if (codepoint > 0x07ff && codepoint <= 0xffff)
    {
        // 1110xxxx 10xxxxxx 10xxxxxx
        dest[0] = static_cast<char>(0xe0 | (codepoint >> 12));
        dest[1] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
        dest[2] = static_cast<char>(0x80 | (codepoint & 0x3f));
        dest[3] = 0;
    }
    else if (codepoint > 0xffff && codepoint <= 0x10ffff)
    {
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        dest[0] = static_cast<char>(0xf0 | (codepoint >> 18));
        dest[1] = static_cast<char>(0x80 | ((codepoint >> 12) & 0x3f));
        dest[2] = static_cast<char>(0x80 | ((codepoint >> 6) & 0x3f));
        dest[3] = static_cast<char>(0x80 | (codepoint & 0x3f));
        dest[4] = 0;
    }
    else
    {
        // negative or above the last code point: empty output
        *dest = 0;
    }
}
272}
273
274#endif
constexpr char no
Definition utf8.hpp:21
constexpr std::array< char, 128 > ASCIIHexToInt
Definition utf8.hpp:22
Definition utf8.hpp:8
Utf8Type
Definition utf8.hpp:10
Utf8Type utf8type(const char *input, int32_t *out=nullptr)
Definition utf8.hpp:36
void codepointToUtf8(const int32_t codepoint, char *dest)
Generate an UTF8 character from a given codepoint.
Definition utf8.hpp:233
int32_t codepoint(const char *str)
Compute the UTF8 codepoint for a given UTF8 char.
Definition utf8.hpp:191
bool isValid(const char *str)
Check the validity of a given string in UTF8.
Definition utf8.hpp:138
void decode(const char *input, char *dest)
Convert hex string to utf8 string.
Definition utf8.hpp:67