ArkScript
A small, fast, functional scripting language for video games
Lexer.cpp
Go to the documentation of this file.
2 
3 #include <cstdio> // TODO remove
4 #include <algorithm>
5 #include <utility>
6 #include <sstream>
7 #include <utf8_decoder.h>
8 
9 #include <Ark/Exceptions.hpp>
10 #include <Ark/Utils.hpp>
12 
13 namespace Ark::internal
14 {
    /**
     * @brief Construct a new Lexer.
     * @param debug verbosity level: feed() dumps every produced token when
     *        the level is > 3 and traces each scanned character when >= 5
     */
    Lexer::Lexer(unsigned debug) noexcept :
        m_debug(debug)
    {}
18 
    /**
     * @brief Tokenize ArkScript source code, appending the tokens to m_tokens.
     *
     * Single pass over `code`, one character at a time. `buffer` accumulates
     * the token currently being built, `ctrl_char` accumulates the current
     * escape sequence while inside a string literal. `line`/`character` track
     * the read position, while `saved_line`/`saved_char` remember where the
     * pending token started so the emitted token points at its first character.
     *
     * @param code whole source to tokenize
     * @throws SyntaxError (via throwTokenizingError) on unrecognizable tokens
     *         or malformed escape sequences inside strings
     */
    void Lexer::feed(const std::string& code)
    {
        std::size_t line = 0, character = 0;
        std::size_t saved_line = 0, saved_char = 0;
        // flags
        bool in_string = false, in_ctrl_char = false, in_comment = false;
        // buffers
        std::string buffer, ctrl_char;

        // flush `buffer` as a single token; guessType() decides its type, and
        // an unrecognizable buffer aborts tokenization with a SyntaxError
        auto append_token_from_buffer = [&]() {
            TokenType type = guessType(buffer);
            // tokenizing error management
            if (type == TokenType::Mismatch)
                throwTokenizingError("invalid token '" + buffer + "'", buffer, line, character, code);
            else if (type == TokenType::Capture || type == TokenType::GetField)
                buffer = buffer.substr(1); // remove the & or the .
            m_tokens.emplace_back(type, buffer, saved_line, saved_char);
            buffer.clear();
        };

        for (std::size_t pos = 0, end = code.size(); pos < end; ++pos)
        {
            char current = code[pos];

            // very verbose character-level trace, only at debug level 5+
            if (m_debug >= 5)
                std::printf(
                    "buffer: %s - ctrl_char: %s - current: '%c' - line: %zu, char: %zu\n",
                    buffer.c_str(), ctrl_char.c_str(), current, line, character);

            if (!in_string)
            {
                // handle comments first
                if (in_comment) // append every character to the buffer if we're in a comment, even spaces
                    buffer += current;
                // handle ()[]{} then
                else if (current == '(' || current == ')' || current == '[' || current == ']' || current == '{' || current == '}')
                {
                    // a grouping character both terminates the pending token
                    // and is emitted as its own one-character token
                    if (!buffer.empty())
                        append_token_from_buffer();
                    m_tokens.emplace_back(TokenType::Grouping, std::string(1, current), line, character);
                }
                // handle strings next
                else if (current == '"')
                {
                    if (!buffer.empty())
                        append_token_from_buffer();
                    in_string = true;
                    buffer = "\""; // the opening quote is kept in the token text
                    saved_line = line;
                    saved_char = character;
                }
                // handle shorthands, be careful with ! and !=
                // (a bare ! is a shorthand only when it can't start a != operator)
                else if (current == '\'' || (current == '!' && pos + 1 < code.size() && code[pos + 1] != '=' && buffer.empty()))
                {
                    m_tokens.emplace_back(TokenType::Shorthand, std::string(1, current), line, character);
                }
                // handle comments
                else if (current == '#')
                {
                    if (!buffer.empty())
                        append_token_from_buffer();
                    in_comment = true;
                    buffer = "#";
                }
                // separation
                else if ((current == ' ' || current == '\t' || current == '\v' || current == '\n'))
                {
                    // whitespace only terminates the pending token, it is never emitted
                    if (!buffer.empty())
                        append_token_from_buffer();
                }
                // capture
                else if (current == '&')
                {
                    // '&' starts a new capture token (&identifier), closing any pending one
                    if (!buffer.empty())
                        append_token_from_buffer();
                    buffer.clear();
                    buffer += current;
                }
                // getfield or spread
                else if (current == '.')
                {
                    // check numbers, we don't want to split 3.0 into 3 and .0
                    // (also keeps +/-/. prefixes and running spreads intact)
                    if (!buffer.empty() && !('0' <= buffer[0] && buffer[0] <= '9') && buffer[0] != '+' && buffer[0] != '-' && buffer[0] != '.')
                    {
                        append_token_from_buffer();
                        buffer.clear();
                    }
                    buffer += current;
                }
                // identifier, number, operator
                else
                {
                    // first character of a new token: remember where it starts
                    if (buffer.empty())
                    {
                        saved_char = character;
                        saved_line = line;
                    }
                    buffer += current;
                }
            }
            else // we are in a string here
            {
                // check for control character
                if (!in_ctrl_char)
                {
                    if (current == '\\')
                        in_ctrl_char = true;
                    else if (current == '"') // end of string
                    {
                        buffer += current; // keep the closing quote in the token text
                        in_string = false;
                        m_tokens.emplace_back(TokenType::String, buffer, saved_line, saved_char);
                        buffer.clear();
                    }
                    else
                        buffer += current;
                }
                else
                {
                    // end of escape code (a space always terminates it)
                    if (current == ' ' || endOfControlChar(ctrl_char, current))
                    {
                        // process escape code
                        if (ctrl_char.empty())
                            throwTokenizingError("empty control character '\\' in string", buffer, line, character, code);
                        else if (ctrl_char.size() == 1)
                        {
                            // single-character escapes: translate to the real character
                            switch (ctrl_char[0])
                            {
                                case '"': buffer += '"'; break;
                                case 'n': buffer += '\n'; break;
                                case 'a': buffer += '\a'; break;
                                case 'b': buffer += '\b'; break;
                                case 't': buffer += '\t'; break;
                                case 'r': buffer += '\r'; break;
                                case 'f': buffer += '\f'; break;
                                case '\\': buffer += '\\'; break;
                                case '0': buffer += '\0'; break;

                                default:
                                    throwTokenizingError("unknown control character '\\" + ctrl_char + "' in string", buffer, line, character, code);
                                    break;
                            }
                        }
                        else
                        {
                            // multi-character escapes: \x…, \uXXXX, \UXXXXXXXX
                            switch (ctrl_char[0])
                            {
                                case 'x': break; /// @todo \x escapes are recognized but not yet decoded

                                case 'u':
                                {
                                    // decode 4 hex digits into a UTF-8 byte sequence
                                    char utf8_str[5];
                                    utf8decode(ctrl_char.c_str() + 1, utf8_str);
                                    if (*utf8_str == '\0')
                                        throwTokenizingError("invalid escape sequence \\" + ctrl_char + " in string, expected hexadecimal number that in utf8 range, got a \"" + ctrl_char + "\"", buffer, line, character + 1, code);
                                    buffer += utf8_str;
                                    break;
                                }

                                case 'U':
                                {
                                    // skip the leading zeros so utf8decode only
                                    // sees the significant hex digits
                                    short begin = 1;
                                    for (; ctrl_char[begin] == '0'; ++begin)
                                        ;
                                    char utf8_str[5];
                                    utf8decode(ctrl_char.c_str() + begin, utf8_str);
                                    if (*utf8_str == '\0')
                                        throwTokenizingError("invalid escape sequence \\" + ctrl_char + " in string, expected hexadecimal number that in utf8 range, got a \"" + ctrl_char + "\"", buffer, line, character + 1, code);
                                    buffer += utf8_str;
                                    break;
                                }

                                default:
                                    throwTokenizingError("unknown control character '\\" + ctrl_char + "' in string", buffer, line, character, code);
                                    break;
                            }
                        }

                        ctrl_char.clear();
                        in_ctrl_char = false;

                        // the terminating character still needs handling itself
                        if (current == '"') // end of string
                        {
                            buffer += current;
                            in_string = false;
                            m_tokens.emplace_back(TokenType::String, buffer, saved_line, saved_char);
                            buffer.clear();
                        }
                        else if (current == '\\') // new escape code
                            in_ctrl_char = true;
                        else
                            buffer += current;
                    }
                    else // the escape code continues
                        ctrl_char += current;
                }
            }

            // position counter
            if (current == '\n')
            {
                line++;
                character = 0; // before first character

                // close comments, don't append them
                if (in_comment)
                {
                    in_comment = false;
                    buffer.clear();
                    continue;
                }
            }
            else
            {
                // update position
                character++;
            }
        }

        // flush the last pending token, unless it is a trailing comment
        if (!buffer.empty() && buffer[0] != '#')
            append_token_from_buffer();

        // debugging information
        if (m_debug > 3)
        {
            for (auto& last_token : m_tokens)
            {
                std::printf(
                    "TokenType: %s\tLine: %zu\n[%zu\t]\tToken: %s\n",
                    tokentype_string[static_cast<std::size_t>(last_token.type)].data(),
                    last_token.line,
                    last_token.col,
                    last_token.token.c_str());
            }
        }
    }
256 
    /**
     * @brief Access the token list built by feed().
     * @return reference to the internal token vector, valid as long as the
     *         Lexer instance lives
     */
    std::vector<Token>& Lexer::tokens() noexcept
    {
        return m_tokens;
    }
261 
262  TokenType Lexer::guessType(const std::string& value) noexcept
263  {
264  if (value.empty())
265  return TokenType::Mismatch;
266 
267  // assuming we already detected ()[]{}, strings, shorthands and comments
268  if (Utils::isDouble(value)) // works on (\+|-)?[[:digit:]]+(\.[[:digit:]]+)?([e|E](\+|-)?[[:digit]]+)?
269  return TokenType::Number;
270  else if (isOperator(value))
271  return TokenType::Operator;
272  else if (isKeyword(value))
273  return TokenType::Keyword;
274  else if (value[0] == '&' && value.size() > 1 && isIdentifier(value))
275  return TokenType::Capture;
276  else if (value.size() > 3 && value[0] == value[1] && value[1] == value[2] && value[2] == '.')
277  return TokenType::Spread;
278  else if (value[0] == '.' && value.size() > 1 && isIdentifier(value))
279  return TokenType::GetField;
280  // otherwise, identifier if it starts with [a-zA-Z_]
281  else if (isIdentifier(value))
282  return TokenType::Identifier;
283  return TokenType::Mismatch;
284  }
285 
286  bool Lexer::isKeyword(const std::string& value) noexcept
287  {
288  return std::find(keywords.begin(), keywords.end(), value) != keywords.end();
289  }
290 
291  bool Lexer::isIdentifier(const std::string& value) noexcept
292  {
293  return utf8valid(value.c_str());
294  }
295 
296  bool Lexer::isOperator(const std::string& value) noexcept
297  {
298  return std::find(operators.begin(), operators.end(), value) != operators.end();
299  }
300 
301  bool Lexer::endOfControlChar(const std::string& sequence, char next) noexcept
302  {
303  switch (sequence[0])
304  {
305  case 'x':
306  // \x[any number of hex digits]
307  // if it's not a hex digit then it's most likely the end for us
308  return !isHexChar(next);
309 
310  case 'u':
311  return sequence.size() == 5;
312 
313  case 'U':
314  return sequence.size() == 9;
315 
316  case '"':
317  case 'n':
318  case 'a':
319  case 'b':
320  case 't':
321  case 'r':
322  case 'f':
323  case '\\':
324  case '0':
325  return true;
326  }
327  return false;
328  }
329 
330  void Lexer::throwTokenizingError(const std::string& message, const std::string& match, std::size_t line, std::size_t col, const std::string& context)
331  {
332  std::vector<std::string> ctx = Utils::splitString(context, '\n');
333 
334  std::stringstream ss;
335  ss << message << "\n";
336  ss << makeTokenBasedErrorCtx(match, line, col, context);
337 
338  throw SyntaxError(ss.str());
339  }
340 }
ArkScript homemade exceptions.
Tokenize ArkScript code.
Lots of utilities about string, filesystem and more.
SyntaxError thrown by the lexer.
Definition: Exceptions.hpp:83
Lexer(unsigned debug) noexcept
Construct a new Lexer object.
Definition: Lexer.cpp:15
std::vector< Token > & tokens() noexcept
Return the list of tokens.
Definition: Lexer.cpp:257
bool isIdentifier(const std::string &value) noexcept
Check if the value can be an identifier in ArkScript.
Definition: Lexer.cpp:291
bool isKeyword(const std::string &value) noexcept
Check if the value is a keyword in ArkScript.
Definition: Lexer.cpp:286
TokenType guessType(const std::string &value) noexcept
Helper function to determine the type of a token.
Definition: Lexer.cpp:262
void feed(const std::string &code)
Give code to tokenize and create the list of tokens.
Definition: Lexer.cpp:19
bool isOperator(const std::string &value) noexcept
Check if the value is an operator in ArkScript.
Definition: Lexer.cpp:296
std::vector< Token > m_tokens
Definition: Lexer.hpp:52
unsigned m_debug
Definition: Lexer.hpp:51
bool endOfControlChar(const std::string &sequence, char next) noexcept
Check if a control character / sequence is complete or not.
Definition: Lexer.cpp:301
void throwTokenizingError(const std::string &message, const std::string &match, std::size_t line, std::size_t col, const std::string &context)
To throw nice lexer errors.
Definition: Lexer.cpp:330
Create string error context for AST errors.
std::vector< std::string > splitString(const std::string &source, char sep)
Cut a string into pieces, given a character separator.
Definition: Utils.hpp:34
bool isDouble(const std::string &s, double *output=nullptr)
Checks if a string is a valid double.
Definition: Utils.hpp:58
constexpr std::array< std::string_view, 13 > tokentype_string
Definition: Token.hpp:39
std::string makeTokenBasedErrorCtx(const std::string &match, std::size_t line, std::size_t col, const std::string &code)
Construct an error message based on a given match in the code.
constexpr std::array< std::string_view, 25 > operators
Definition: Common.hpp:89
constexpr std::array< std::string_view, 10 > keywords
List of available keywords in ArkScript.
Definition: Common.hpp:73