ArkScript
A small, lisp-inspired, functional scripting language
utf8_char.hpp
Go to the documentation of this file.
1
#ifndef SRC_UTF8_CHAR_HPP
2
#define SRC_UTF8_CHAR_HPP
3
4
#include <array>
5
#include <string>
6
#include <limits>
7
8
namespace
Ark::internal
9
{
10
/**
 * @brief One UTF-8 encoded character: its decoded codepoint, its length in bytes and its raw bytes
 */
class utf8_char_t
{
public:
    using codepoint_t = int;
    using length_t = unsigned char;
    /// Raw bytes of the character. at() writes at most 4 of the 5 slots and zero-fills the rest,
    /// which is what lets c_str() hand the buffer out as a NUL-terminated string.
    using repr_t = std::array<unsigned char, 5>;

    /**
     * @brief Construct an empty character (codepoint 0, length 0, all bytes zeroed)
     */
    utf8_char_t() :
        m_codepoint(0), m_length(0), m_repr({ 0 }) {}

    /**
     * @brief Construct a character from already decoded parts
     * @param cp decoded codepoint
     * @param len number of meaningful bytes in repr
     * @param repr raw bytes; stored as-is, so the caller is responsible for the zero terminator c_str() relies on
     */
    utf8_char_t(const codepoint_t cp, const length_t len, const repr_t repr) :
        m_codepoint(cp), m_length(len), m_repr(repr) {}

    /**
     * @brief Parse a codepoint and compute its length and representation
     * @details https://github.com/sheredom/utf8.h/blob/4e4d828174c35e4564c31a9e35580c299c69a063/utf8.h#L1178
     *          Only the lead byte decides the sequence length; continuation bytes are masked, not validated.
     * @param it iterator in a string
     * @param end end iterator, used to avoid going out of bound
     * @return std::pair<std::string::iterator, utf8_char_t> the iterator points to the beginning of the next codepoint, the utf8_char_t represents the parsed codepoint.
     *         If the range is empty or ends before the sequence is complete, the pair is (end, empty utf8_char_t)
     */
    static std::pair<std::string::iterator, utf8_char_t> at(const std::string::iterator it, const std::string::iterator end)
    {
        // Measure the remaining bytes once, instead of building it + 1 / it + 2 / it + 3:
        // forming an iterator beyond end is undefined and is rejected by checked iterators.
        const auto remaining = end - it;
        if (remaining <= 0)
            return std::make_pair(end, utf8_char_t {});

        const auto lead = static_cast<unsigned char>(*it);

        length_t length;
        if (0xf0 == (0xf8 & lead))  // 4 byte utf8 codepoint
            length = 4;
        else if (0xe0 == (0xf0 & lead))  // 3 byte utf8 codepoint
            length = 3;
        else if (0xc0 == (0xe0 & lead))  // 2 byte utf8 codepoint
            length = 2;
        else  // 1 byte utf8 codepoint otherwise
            length = 1;

        // truncated sequence: nothing usable is left in the range
        if (remaining < length)
            return std::make_pair(end, utf8_char_t {});

        // copy the raw bytes first, the codepoint is then decoded from that copy
        repr_t repr = {};
        for (length_t i = 0; i < length; ++i)
            repr[i] = static_cast<unsigned char>(*(it + static_cast<int>(i)));

        codepoint_t cp;
        switch (length)
        {
            case 4:
                cp = (static_cast<codepoint_t>(0x07 & repr[0]) << 18) |
                    (static_cast<codepoint_t>(0x3f & repr[1]) << 12) |
                    (static_cast<codepoint_t>(0x3f & repr[2]) << 6) |
                    static_cast<codepoint_t>(0x3f & repr[3]);
                break;

            case 3:
                cp = (static_cast<codepoint_t>(0x0f & repr[0]) << 12) |
                    (static_cast<codepoint_t>(0x3f & repr[1]) << 6) |
                    static_cast<codepoint_t>(0x3f & repr[2]);
                break;

            case 2:
                cp = (static_cast<codepoint_t>(0x1f & repr[0]) << 6) |
                    static_cast<codepoint_t>(0x3f & repr[1]);
                break;

            default:
                cp = static_cast<codepoint_t>(repr[0]);
                break;
        }

        return std::make_pair(
            it + static_cast<std::string::difference_type>(length),
            utf8_char_t(cp, length, repr));
    }

    /**
     * @brief Tell whether the codepoint can be displayed as-is
     * @details ASCII codepoints (0 to 0x7f, DEL included) are checked with std::isprint.
     *          The bound is the fixed ASCII limit rather than std::numeric_limits<char>::max(),
     *          so the answer does not depend on whether char is signed on the target.
     * @return false for a negative codepoint, the std::isprint verdict for an ASCII one, true for anything above ASCII
     */
    [[nodiscard]] bool isPrintable() const
    {
        // std::isprint is only defined for values representable as unsigned char (or EOF)
        if (m_codepoint < 0)
            return false;
        if (m_codepoint <= 0x7f)
            return std::isprint(m_codepoint) != 0;
        return true;
    }

    /// @return pointer to the raw bytes of the character, viewed as a C string
    [[nodiscard]] const char* c_str() const { return reinterpret_cast<const char*>(m_repr.data()); }
    /// @return length of the character, in bytes
    [[nodiscard]] std::size_t size() const { return m_length; }
    /// @return the decoded codepoint
    [[nodiscard]] codepoint_t codepoint() const { return m_codepoint; }

private:
    codepoint_t m_codepoint;
    length_t m_length;
    repr_t m_repr;
};
99
}
100
101
#endif
Ark::internal::utf8_char_t
Definition
utf8_char.hpp:11
Ark::internal::utf8_char_t::utf8_char_t
utf8_char_t()
Definition
utf8_char.hpp:17
Ark::internal::utf8_char_t::utf8_char_t
utf8_char_t(const codepoint_t cp, const length_t len, const repr_t repr)
Definition
utf8_char.hpp:20
Ark::internal::utf8_char_t::codepoint_t
int codepoint_t
Definition
utf8_char.hpp:13
Ark::internal::utf8_char_t::m_codepoint
codepoint_t m_codepoint
Definition
utf8_char.hpp:95
Ark::internal::utf8_char_t::m_length
length_t m_length
Definition
utf8_char.hpp:96
Ark::internal::utf8_char_t::codepoint
codepoint_t codepoint() const
Definition
utf8_char.hpp:92
Ark::internal::utf8_char_t::repr_t
std::array< unsigned char, 5 > repr_t
Definition
utf8_char.hpp:15
Ark::internal::utf8_char_t::c_str
const char * c_str() const
Definition
utf8_char.hpp:90
Ark::internal::utf8_char_t::m_repr
repr_t m_repr
Definition
utf8_char.hpp:97
Ark::internal::utf8_char_t::isPrintable
bool isPrintable() const
Definition
utf8_char.hpp:83
Ark::internal::utf8_char_t::at
static std::pair< std::string::iterator, utf8_char_t > at(const std::string::iterator it, const std::string::iterator end)
Parse a codepoint and compute its length and representation.
Definition
utf8_char.hpp:30
Ark::internal::utf8_char_t::length_t
unsigned char length_t
Definition
utf8_char.hpp:14
Ark::internal::utf8_char_t::size
std::size_t size() const
Definition
utf8_char.hpp:91
Ark::internal
Definition
Builtins.hpp:26
include
Ark
Compiler
AST
utf8_char.hpp
Generated on Sun Sep 28 2025 15:55:36 for ArkScript by
1.12.0