ArkScript
A small, lisp-inspired, functional scripting language
utf8_char.hpp
Go to the documentation of this file.
1#ifndef SRC_UTF8_CHAR_HPP
2#define SRC_UTF8_CHAR_HPP
3
4#include <array>
5#include <string>
6#include <limits>
7
8namespace Ark::internal
9{
11 {
12 public:
13 using codepoint_t = int;
14 using length_t = unsigned char;
15 using repr_t = std::array<unsigned char, 5>;
16
18 m_codepoint(0), m_length(0), m_repr({ 0 }) {}
19
20 utf8_char_t(const codepoint_t cp, const length_t len, const repr_t repr) :
21 m_codepoint(cp), m_length(len), m_repr(repr) {}
22
23 /**
24 * @brief Parse a codepoint and compute its length and representation
25 * @details https://github.com/sheredom/utf8.h/blob/4e4d828174c35e4564c31a9e35580c299c69a063/utf8.h#L1178
26 * @param it iterator in a string
27 * @param end end iterator, used to avoid going out of bound
28 * @return std::pair<std::string::iterator, utf8_char_t> the iterator points to the beginning of the next codepoint, the utf8_char_t represents the parsed codepoint
29 */
30 static std::pair<std::string::iterator, utf8_char_t> at(const std::string::iterator it, const std::string::iterator end)
31 {
32 codepoint_t cp;
33 length_t length;
34 repr_t repr = {};
35
36 if (0xf0 == (0xf8 & *it)) // 4 byte utf8 codepoint
37 {
38 if (it + 3 == end || it + 2 == end || it + 1 == end)
39 return std::make_pair(end, utf8_char_t {});
40
41 cp = (static_cast<codepoint_t>(0x07 & *it) << 18) |
42 (static_cast<codepoint_t>(0x3f & *(it + 1)) << 12) |
43 (static_cast<codepoint_t>(0x3f & *(it + 2)) << 6) |
44 static_cast<codepoint_t>(0x3f & *(it + 3));
45 length = 4;
46 }
47 else if (0xe0 == (0xf0 & *it)) // 3 byte utf8 codepoint
48 {
49 if (it + 2 == end || it + 1 == end)
50 return std::make_pair(end, utf8_char_t {});
51
52 cp = (static_cast<codepoint_t>(0x0f & *it) << 12) |
53 (static_cast<codepoint_t>(0x3f & *(it + 1)) << 6) |
54 static_cast<codepoint_t>(0x3f & *(it + 2));
55 length = 3;
56 }
57 else if (0xc0 == (0xe0 & *it)) // 2 byte utf8 codepoint
58 {
59 if (it + 1 == end)
60 return std::make_pair(end, utf8_char_t {});
61
62 cp = (static_cast<codepoint_t>(0x1f & *it) << 6) |
63 static_cast<codepoint_t>(0x3f & *(it + 1));
64 length = 2;
65 }
66 else // 1 byte utf8 codepoint otherwise
67 {
68 cp = static_cast<unsigned char>(*it);
69 length = 1;
70 }
71
72 for (length_t i = 0; i < length; ++i)
73 repr[i] = static_cast<unsigned char>(*(it + static_cast<int>(i)));
74
75 return std::make_pair(it + static_cast<long>(length),
76 utf8_char_t(cp, length, repr));
77 }
78
79 /**
80 *
81 * @return true if the given codepoint is printable according to std::isprint
82 */
83 [[nodiscard]] bool isPrintable() const
84 {
85 if (m_codepoint < std::numeric_limits<char>::max())
86 return std::isprint(m_codepoint);
87 return true;
88 }
89
90 [[nodiscard]] const char* c_str() const { return reinterpret_cast<const char*>(m_repr.data()); }
91 [[nodiscard]] std::size_t size() const { return m_length; }
92 [[nodiscard]] codepoint_t codepoint() const { return m_codepoint; }
93
94 private:
98 };
99}
100
101#endif
utf8_char_t(const codepoint_t cp, const length_t len, const repr_t repr)
Definition utf8_char.hpp:20
codepoint_t codepoint() const
Definition utf8_char.hpp:92
std::array< unsigned char, 5 > repr_t
Definition utf8_char.hpp:15
const char * c_str() const
Definition utf8_char.hpp:90
static std::pair< std::string::iterator, utf8_char_t > at(const std::string::iterator it, const std::string::iterator end)
Parse a codepoint and compute its length and representation.
Definition utf8_char.hpp:30
std::size_t size() const
Definition utf8_char.hpp:91