ArkScript
A small, fast, functional and scripting language for video games
Lexer.cpp
Go to the documentation of this file.
2
3#include <cstdio> // TODO remove
4#include <algorithm>
5#include <utility>
6#include <sstream>
7#include <utf8_decoder.h>
8
9#include <Ark/Exceptions.hpp>
10#include <Ark/Utils.hpp>
12
13namespace Ark::internal
14{
15 Lexer::Lexer(unsigned debug) noexcept :
16 m_debug(debug)
17 {}
18
19 void Lexer::feed(const std::string& code)
20 {
21 std::size_t line = 0, character = 0;
22 std::size_t saved_line = 0, saved_char = 0;
23 // flags
24 bool in_string = false, in_ctrl_char = false, in_comment = false;
25 // buffers
26 std::string buffer, ctrl_char;
27
28 auto append_token_from_buffer = [&]() {
29 TokenType type = guessType(buffer);
30 // tokenizing error management
31 if (type == TokenType::Mismatch)
32 throwTokenizingError("invalid token '" + buffer + "'", buffer, line, character, code);
33 else if (type == TokenType::Capture || type == TokenType::GetField)
34 buffer = buffer.substr(1); // remove the & or the .
35 m_tokens.emplace_back(type, buffer, saved_line, saved_char);
36 buffer.clear();
37 };
38
39 for (std::size_t pos = 0, end = code.size(); pos < end; ++pos)
40 {
41 char current = code[pos];
42
43 if (m_debug >= 5)
44 std::printf(
45 "buffer: %s - ctrl_char: %s - current: '%c' - line: %zu, char: %zu\n",
46 buffer.c_str(), ctrl_char.c_str(), current, line, character);
47
48 if (!in_string)
49 {
50 // handle comments first
51 if (in_comment) // append every character to the buffer if we're in a comment, even spaces
52 buffer += current;
53 // handle ()[]{} then
54 else if (current == '(' || current == ')' || current == '[' || current == ']' || current == '{' || current == '}')
55 {
56 if (!buffer.empty())
57 append_token_from_buffer();
58 m_tokens.emplace_back(TokenType::Grouping, std::string(1, current), line, character);
59 }
60 // handle strings next
61 else if (current == '"')
62 {
63 if (!buffer.empty())
64 append_token_from_buffer();
65 in_string = true;
66 buffer = "\"";
67 saved_line = line;
68 saved_char = character;
69 }
70 // handle shorthands, be careful with ! and !=
71 else if (current == '\'' || (current == '!' && pos + 1 < code.size() && code[pos + 1] != '=' && buffer.empty()))
72 {
73 m_tokens.emplace_back(TokenType::Shorthand, std::string(1, current), line, character);
74 }
75 // handle comments
76 else if (current == '#')
77 {
78 if (!buffer.empty())
79 append_token_from_buffer();
80 in_comment = true;
81 buffer = "#";
82 }
83 // separation
84 else if ((current == ' ' || current == '\t' || current == '\v' || current == '\n'))
85 {
86 if (!buffer.empty())
87 append_token_from_buffer();
88 }
89 // capture
90 else if (current == '&')
91 {
92 if (!buffer.empty())
93 append_token_from_buffer();
94 buffer.clear();
95 buffer += current;
96 }
97 // getfield or spread
98 else if (current == '.')
99 {
100 // check numbers, we don't want to split 3.0 into 3 and .0
101 if (!buffer.empty() && !('0' <= buffer[0] && buffer[0] <= '9') && buffer[0] != '+' && buffer[0] != '-' && buffer[0] != '.')
102 {
103 append_token_from_buffer();
104 buffer.clear();
105 }
106 buffer += current;
107 }
108 // identifier, number, operator
109 else
110 {
111 if (buffer.empty())
112 {
113 saved_char = character;
114 saved_line = line;
115 }
116 buffer += current;
117 }
118 }
119 else // we are in a string here
120 {
121 // check for control character
122 if (!in_ctrl_char)
123 {
124 if (current == '\\')
125 in_ctrl_char = true;
126 else if (current == '"') // end of string
127 {
128 buffer += current;
129 in_string = false;
130 m_tokens.emplace_back(TokenType::String, buffer, saved_line, saved_char);
131 buffer.clear();
132 }
133 else
134 buffer += current;
135 }
136 else
137 {
138 // end of escape code
139 if (current == ' ' || endOfControlChar(ctrl_char, current))
140 {
141 // process escape code
142 if (ctrl_char.empty())
143 throwTokenizingError("empty control character '\\' in string", buffer, line, character, code);
144 else if (ctrl_char.size() == 1)
145 {
146 switch (ctrl_char[0])
147 {
148 case '"': buffer += '"'; break;
149 case 'n': buffer += '\n'; break;
150 case 'a': buffer += '\a'; break;
151 case 'b': buffer += '\b'; break;
152 case 't': buffer += '\t'; break;
153 case 'r': buffer += '\r'; break;
154 case 'f': buffer += '\f'; break;
155 case '\\': buffer += '\\'; break;
156 case '0': buffer += '\0'; break;
157
158 default:
159 throwTokenizingError("unknown control character '\\" + ctrl_char + "' in string", buffer, line, character, code);
160 break;
161 }
162 }
163 else
164 {
165 switch (ctrl_char[0])
166 {
167 case 'x': break; /// @todo
168
169 case 'u':
170 {
171 char utf8_str[5];
172 utf8decode(ctrl_char.c_str() + 1, utf8_str);
173 if (*utf8_str == '\0')
174 throwTokenizingError("invalid escape sequence \\" + ctrl_char + " in string, expected hexadecimal number that in utf8 range, got a \"" + ctrl_char + "\"", buffer, line, character + 1, code);
175 buffer += utf8_str;
176 break;
177 }
178
179 case 'U':
180 {
181 short begin = 1;
182 for (; ctrl_char[begin] == '0'; ++begin)
183 ;
184 char utf8_str[5];
185 utf8decode(ctrl_char.c_str() + begin, utf8_str);
186 if (*utf8_str == '\0')
187 throwTokenizingError("invalid escape sequence \\" + ctrl_char + " in string, expected hexadecimal number that in utf8 range, got a \"" + ctrl_char + "\"", buffer, line, character + 1, code);
188 buffer += utf8_str;
189 break;
190 }
191
192 default:
193 throwTokenizingError("unknown control character '\\" + ctrl_char + "' in string", buffer, line, character, code);
194 break;
195 }
196 }
197
198 ctrl_char.clear();
199 in_ctrl_char = false;
200
201 if (current == '"') // end of string
202 {
203 buffer += current;
204 in_string = false;
205 m_tokens.emplace_back(TokenType::String, buffer, saved_line, saved_char);
206 buffer.clear();
207 }
208 else if (current == '\\') // new escape code
209 in_ctrl_char = true;
210 else
211 buffer += current;
212 }
213 else // the escape code continues
214 ctrl_char += current;
215 }
216 }
217
218 // position counter
219 if (current == '\n')
220 {
221 line++;
222 character = 0; // before first character
223
224 // close comments, don't append them
225 if (in_comment)
226 {
227 in_comment = false;
228 buffer.clear();
229 continue;
230 }
231 }
232 else
233 {
234 // update position
235 character++;
236 }
237 }
238
239 if (!buffer.empty() && buffer[0] != '#')
240 append_token_from_buffer();
241
242 // debugging information
243 if (m_debug > 3)
244 {
245 for (auto& last_token : m_tokens)
246 {
247 std::printf(
248 "TokenType: %s\tLine: %zu\n[%zu\t]\tToken: %s\n",
249 tokentype_string[static_cast<std::size_t>(last_token.type)].data(),
250 last_token.line,
251 last_token.col,
252 last_token.token.c_str());
253 }
254 }
255 }
256
257 std::vector<Token>& Lexer::tokens() noexcept
258 {
259 return m_tokens;
260 }
261
262 TokenType Lexer::guessType(const std::string& value) noexcept
263 {
264 if (value.empty())
265 return TokenType::Mismatch;
266
267 // assuming we already detected ()[]{}, strings, shorthands and comments
268 if (Utils::isDouble(value)) // works on (\+|-)?[[:digit:]]+(\.[[:digit:]]+)?([e|E](\+|-)?[[:digit]]+)?
269 return TokenType::Number;
270 else if (isOperator(value))
271 return TokenType::Operator;
272 else if (isKeyword(value))
273 return TokenType::Keyword;
274 else if (value[0] == '&' && value.size() > 1 && isIdentifier(value))
275 return TokenType::Capture;
276 else if (value.size() > 3 && value[0] == value[1] && value[1] == value[2] && value[2] == '.')
277 return TokenType::Spread;
278 else if (value[0] == '.' && value.size() > 1 && isIdentifier(value))
279 return TokenType::GetField;
280 // otherwise, identifier if it starts with [a-zA-Z_]
281 else if (isIdentifier(value))
283 return TokenType::Mismatch;
284 }
285
286 bool Lexer::isKeyword(const std::string& value) noexcept
287 {
288 return std::find(keywords.begin(), keywords.end(), value) != keywords.end();
289 }
290
291 bool Lexer::isIdentifier(const std::string& value) noexcept
292 {
293 return utf8valid(value.c_str());
294 }
295
296 bool Lexer::isOperator(const std::string& value) noexcept
297 {
298 return std::find(operators.begin(), operators.end(), value) != operators.end();
299 }
300
301 bool Lexer::endOfControlChar(const std::string& sequence, char next) noexcept
302 {
303 switch (sequence[0])
304 {
305 case 'x':
306 // \x[any number of hex digits]
307 // if it's not a hex digit then it's most likely the end for us
308 return !isHexChar(next);
309
310 case 'u':
311 return sequence.size() == 5;
312
313 case 'U':
314 return sequence.size() == 9;
315
316 case '"':
317 case 'n':
318 case 'a':
319 case 'b':
320 case 't':
321 case 'r':
322 case 'f':
323 case '\\':
324 case '0':
325 return true;
326 }
327 return false;
328 }
329
330 void Lexer::throwTokenizingError(const std::string& message, const std::string& match, std::size_t line, std::size_t col, const std::string& context)
331 {
332 std::vector<std::string> ctx = Utils::splitString(context, '\n');
333
334 std::stringstream ss;
335 ss << message << "\n";
336 ss << makeTokenBasedErrorCtx(match, line, col, context);
337
338 throw SyntaxError(ss.str());
339 }
340}
ArkScript homemade exceptions.
Tokenize ArkScript code.
Lots of utilities about string, filesystem and more.
SyntaxError thrown by the lexer.
Definition: Exceptions.hpp:83
Lexer(unsigned debug) noexcept
Construct a new Lexer object.
Definition: Lexer.cpp:15
std::vector< Token > & tokens() noexcept
Return the list of tokens.
Definition: Lexer.cpp:257
bool isIdentifier(const std::string &value) noexcept
Check if the value can be an identifier in ArkScript.
Definition: Lexer.cpp:291
bool isKeyword(const std::string &value) noexcept
Check if the value is a keyword in ArkScript.
Definition: Lexer.cpp:286
TokenType guessType(const std::string &value) noexcept
Helper function to determine the type of a token.
Definition: Lexer.cpp:262
void feed(const std::string &code)
Give code to tokenize and create the list of tokens.
Definition: Lexer.cpp:19
bool isOperator(const std::string &value) noexcept
Check if the value is an operator in ArkScript.
Definition: Lexer.cpp:296
std::vector< Token > m_tokens
Definition: Lexer.hpp:52
unsigned m_debug
Definition: Lexer.hpp:51
bool endOfControlChar(const std::string &sequence, char next) noexcept
Check if a control character / sequence is complete or not.
Definition: Lexer.cpp:301
void throwTokenizingError(const std::string &message, const std::string &match, std::size_t line, std::size_t col, const std::string &context)
To throw nice lexer errors.
Definition: Lexer.cpp:330
Create string error context for AST errors.
std::vector< std::string > splitString(const std::string &source, char sep)
Cut a string into pieces, given a character separator.
Definition: Utils.hpp:34
bool isDouble(const std::string &s, double *output=nullptr)
Checks if a string is a valid double.
Definition: Utils.hpp:58
constexpr std::array< std::string_view, 13 > tokentype_string
Definition: Token.hpp:39
std::string makeTokenBasedErrorCtx(const std::string &match, std::size_t line, std::size_t col, const std::string &code)
Construct an error message based on a given match in the code.
constexpr std::array< std::string_view, 25 > operators
Definition: Common.hpp:89
constexpr std::array< std::string_view, 10 > keywords
List of available keywords in ArkScript.
Definition: Common.hpp:73