ArkScript
A small, fast, functional and scripting language for video games
utf8_char.hpp
Go to the documentation of this file.
1#ifndef SRC_UTF8_CHAR_HPP
2#define SRC_UTF8_CHAR_HPP
3
4#include <array>
5#include <string>
6#include <limits>
7
8#undef max
9
10namespace Ark::internal
11{
13 {
14 public:
15 using codepoint_t = int;
16 using length_t = unsigned char;
17 using repr_t = std::array<unsigned char, 5>;
18
20 m_codepoint(0), m_length(0), m_repr({ 0 }) {}
21
22 utf8_char_t(const codepoint_t cp, const length_t len, const repr_t repr) :
23 m_codepoint(cp), m_length(len), m_repr(repr) {}
24
25 /**
26 * @brief Parse a codepoint and compute its length and representation
27 * @details https://github.com/sheredom/utf8.h/blob/4e4d828174c35e4564c31a9e35580c299c69a063/utf8.h#L1178
28 * @param it iterator in a string
29 * @param end end iterator, used to avoid going out of bound
30 * @return std::pair<std::string::iterator, utf8_char_t> the iterator points to the beginning of the next codepoint, the utf8_char_t represents the parsed codepoint
31 */
32 static std::pair<std::string::iterator, utf8_char_t> at(const std::string::iterator it, const std::string::iterator end)
33 {
34 codepoint_t cp;
35 length_t length;
36 repr_t repr = {};
37
38 if (0xf0 == (0xf8 & *it)) // 4 byte utf8 codepoint
39 {
40 if (it + 3 == end || it + 2 == end || it + 1 == end)
41 return std::make_pair(end, utf8_char_t {});
42
43 cp = (static_cast<codepoint_t>(0x07 & *it) << 18) |
44 (static_cast<codepoint_t>(0x3f & *(it + 1)) << 12) |
45 (static_cast<codepoint_t>(0x3f & *(it + 2)) << 6) |
46 static_cast<codepoint_t>(0x3f & *(it + 3));
47 length = 4;
48 }
49 else if (0xe0 == (0xf0 & *it)) // 3 byte utf8 codepoint
50 {
51 if (it + 2 == end || it + 1 == end)
52 return std::make_pair(end, utf8_char_t {});
53
54 cp = (static_cast<codepoint_t>(0x0f & *it) << 12) |
55 (static_cast<codepoint_t>(0x3f & *(it + 1)) << 6) |
56 static_cast<codepoint_t>(0x3f & *(it + 2));
57 length = 3;
58 }
59 else if (0xc0 == (0xe0 & *it)) // 2 byte utf8 codepoint
60 {
61 if (it + 1 == end)
62 return std::make_pair(end, utf8_char_t {});
63
64 cp = (static_cast<codepoint_t>(0x1f & *it) << 6) |
65 static_cast<codepoint_t>(0x3f & *(it + 1));
66 length = 2;
67 }
68 else // 1 byte utf8 codepoint otherwise
69 {
70 cp = static_cast<unsigned char>(*it);
71 length = 1;
72 }
73
74 for (length_t i = 0; i < length; ++i)
75 repr[i] = static_cast<unsigned char>(*(it + static_cast<int>(i)));
76
77 return std::make_pair(it + static_cast<long>(length),
78 utf8_char_t(cp, length, repr));
79 }
80
81 /**
82 *
83 * @return true if the given codepoint is printable according to std::isprint
84 */
85 [[nodiscard]] bool isPrintable() const
86 {
87 if (m_codepoint < std::numeric_limits<char>::max())
88 return std::isprint(m_codepoint);
89 return true;
90 }
91
92 [[nodiscard]] const char* c_str() const { return reinterpret_cast<const char*>(m_repr.data()); }
93 [[nodiscard]] std::size_t size() const { return m_length; }
94 [[nodiscard]] codepoint_t codepoint() const { return m_codepoint; }
95
96 private:
100 };
101}
102
103#endif
utf8_char_t(const codepoint_t cp, const length_t len, const repr_t repr)
Definition utf8_char.hpp:22
codepoint_t codepoint() const
Definition utf8_char.hpp:94
std::array< unsigned char, 5 > repr_t
Definition utf8_char.hpp:17
const char * c_str() const
Definition utf8_char.hpp:92
static std::pair< std::string::iterator, utf8_char_t > at(const std::string::iterator it, const std::string::iterator end)
Parse a codepoint and compute its length and representation.
Definition utf8_char.hpp:32
std::size_t size() const
Definition utf8_char.hpp:93