7#include <utf8_decoder.h>
21 std::size_t line = 0, character = 0;
22 std::size_t saved_line = 0, saved_char = 0;
24 bool in_string =
false, in_ctrl_char =
false, in_comment =
false;
26 std::string buffer, ctrl_char;
28 auto append_token_from_buffer = [&]() {
34 buffer = buffer.substr(1);
35 m_tokens.emplace_back(type, buffer, saved_line, saved_char);
39 for (std::size_t pos = 0, end = code.size(); pos < end; ++pos)
41 char current = code[pos];
45 "buffer: %s - ctrl_char: %s - current: '%c' - line: %zu, char: %zu\n",
46 buffer.c_str(), ctrl_char.c_str(), current, line, character);
54 else if (current ==
'(' || current ==
')' || current ==
'[' || current ==
']' || current ==
'{' || current ==
'}')
57 append_token_from_buffer();
61 else if (current ==
'"')
64 append_token_from_buffer();
68 saved_char = character;
71 else if (current ==
'\'' || (current ==
'!' && pos + 1 < code.size() && code[pos + 1] !=
'=' && buffer.empty()))
76 else if (current ==
'#')
79 append_token_from_buffer();
84 else if ((current ==
' ' || current ==
'\t' || current ==
'\v' || current ==
'\n'))
87 append_token_from_buffer();
90 else if (current ==
'&')
93 append_token_from_buffer();
98 else if (current ==
'.')
101 if (!buffer.empty() && !(
'0' <= buffer[0] && buffer[0] <=
'9') && buffer[0] !=
'+' && buffer[0] !=
'-' && buffer[0] !=
'.')
103 append_token_from_buffer();
113 saved_char = character;
126 else if (current ==
'"')
142 if (ctrl_char.empty())
144 else if (ctrl_char.size() == 1)
146 switch (ctrl_char[0])
148 case '"': buffer +=
'"';
break;
149 case 'n': buffer +=
'\n';
break;
150 case 'a': buffer +=
'\a';
break;
151 case 'b': buffer +=
'\b';
break;
152 case 't': buffer +=
'\t';
break;
153 case 'r': buffer +=
'\r';
break;
154 case 'f': buffer +=
'\f';
break;
155 case '\\': buffer +=
'\\';
break;
156 case '0': buffer +=
'\0';
break;
159 throwTokenizingError(
"unknown control character '\\" + ctrl_char +
"' in string", buffer, line, character, code);
165 switch (ctrl_char[0])
172 utf8decode(ctrl_char.c_str() + 1, utf8_str);
173 if (*utf8_str ==
'\0')
174 throwTokenizingError(
"invalid escape sequence \\" + ctrl_char +
" in string, expected hexadecimal number that in utf8 range, got a \"" + ctrl_char +
"\"", buffer, line, character + 1, code);
182 for (; ctrl_char[begin] ==
'0'; ++begin)
185 utf8decode(ctrl_char.c_str() + begin, utf8_str);
186 if (*utf8_str ==
'\0')
187 throwTokenizingError(
"invalid escape sequence \\" + ctrl_char +
" in string, expected hexadecimal number that in utf8 range, got a \"" + ctrl_char +
"\"", buffer, line, character + 1, code);
193 throwTokenizingError(
"unknown control character '\\" + ctrl_char +
"' in string", buffer, line, character, code);
199 in_ctrl_char =
false;
208 else if (current ==
'\\')
214 ctrl_char += current;
239 if (!buffer.empty() && buffer[0] !=
'#')
240 append_token_from_buffer();
248 "TokenType: %s\tLine: %zu\n[%zu\t]\tToken: %s\n",
252 last_token.token.c_str());
270 else if (isOperator(value))
272 else if (isKeyword(value))
274 else if (value[0] ==
'&' && value.size() > 1 && isIdentifier(value))
276 else if (value.size() > 3 && value[0] == value[1] && value[1] == value[2] && value[2] ==
'.')
278 else if (value[0] ==
'.' && value.size() > 1 && isIdentifier(value))
281 else if (isIdentifier(value))
293 return utf8valid(value.c_str());
308 return !isHexChar(next);
311 return sequence.size() == 5;
314 return sequence.size() == 9;
330 void Lexer::throwTokenizingError(
const std::string& message,
const std::string& match, std::size_t line, std::size_t col,
const std::string& context)
334 std::stringstream ss;
335 ss << message <<
"\n";
ArkScript homemade exceptions.
Assorted utilities for strings, the filesystem, and more.
SyntaxError thrown by the lexer.
Lexer(unsigned debug) noexcept
Construct a new Lexer object.
std::vector< Token > & tokens() noexcept
Return the list of tokens.
bool isIdentifier(const std::string &value) noexcept
Check if the value can be an identifier in ArkScript.
bool isKeyword(const std::string &value) noexcept
Check if the value is a keyword in ArkScript.
TokenType guessType(const std::string &value) noexcept
Helper function to determine the type of a token.
void feed(const std::string &code)
Give code to tokenize and create the list of tokens.
bool isOperator(const std::string &value) noexcept
Check if the value is an operator in ArkScript.
std::vector< Token > m_tokens
bool endOfControlChar(const std::string &sequence, char next) noexcept
Check if a control character / sequence is complete or not.
void throwTokenizingError(const std::string &message, const std::string &match, std::size_t line, std::size_t col, const std::string &context)
Throw a nicely formatted lexer error.
Create string error context for AST errors.
std::vector< std::string > splitString(const std::string &source, char sep)
Cut a string into pieces, given a character separator.
bool isDouble(const std::string &s, double *output=nullptr)
Checks if a string is a valid double.
constexpr std::array< std::string_view, 13 > tokentype_string
std::string makeTokenBasedErrorCtx(const std::string &match, std::size_t line, std::size_t col, const std::string &code)
Construct an error message based on a given match in the code.
constexpr std::array< std::string_view, 25 > operators
constexpr std::array< std::string_view, 10 > keywords
List of available keywords in ArkScript.