ArkScript
A small, lisp-inspired, functional scripting language
IRCompiler.cpp
Go to the documentation of this file.
2
3#include <chrono>
4#include <utility>
5#include <optional>
6#include <unordered_map>
7#include <Proxy/Picosha2.hpp>
8#include <fmt/ostream.h>
9
10#include <Ark/Constants.hpp>
11#include <Ark/Literals.hpp>
15
16namespace Ark::internal
17{
18 using namespace literals;
19
20 IRCompiler::IRCompiler(const unsigned debug) :
21 m_logger("IRCompiler", debug)
22 {}
23
 // Turn a given IR into bytecode, appended to m_bytecode.
 //
 // pages:   IR blocks; copied into m_ir before being compiled
 // symbols: forwarded to pushSymbolTable
 // values:  forwarded to pushValueTable
 //
 // NOTE(review): the embedded listing numbers skip 27, 41, 50 and 65, so statements appear to be
 // missing from this extract -- confirm against the original file before changing anything here.
24 void IRCompiler::process(const std::vector<IR::Block>& pages, const std::vector<std::string>& symbols, const std::vector<ValTableElem>& values)
25 {
26 m_logger.traceStart("process");
28 pushSymbolTable(symbols);
29 pushValueTable(values);
30
31 // compute a list of unique filenames
32 for (const auto& page : pages)
33 {
34 for (const auto& inst : page)
35 {
 // only instructions with a valid source location contribute a filename, and each filename is stored once
36 if (std::ranges::find(m_filenames, inst.filename()) == m_filenames.end() && inst.hasValidSourceLocation())
37 m_filenames.push_back(inst.filename());
38 }
39 }
40
 // must run after the loop above: pushInstLocTable looks filenames up in m_filenames
42 pushInstLocTable(pages);
43
 // compile() works on (and modifies) the m_ir copy, not on the caller's pages
44 m_ir = pages;
45 compile();
46
47 if (m_ir.empty())
48 {
49 // code segment with a single instruction
 // NOTE(review): the four bytes below are 0, HALT, 0, 0 whereas pushWord() writes the opcode
 // first -- confirm this ordering is intended.
51 m_bytecode.push_back(0_u8);
52 m_bytecode.push_back(1_u8);
53
54 m_bytecode.push_back(0_u8);
55 m_bytecode.push_back(HALT);
56 m_bytecode.push_back(0_u8);
57 m_bytecode.push_back(0_u8);
58 }
59
60 // generate a hash of the tables + bytecode
 // the digest covers everything located after the first bytecode::HeaderSize bytes...
61 std::vector<unsigned char> hash_out(picosha2::k_digest_size);
62 picosha2::hash256(m_bytecode.begin() + bytecode::HeaderSize, m_bytecode.end(), hash_out);
 // ...and is then inserted at that same offset, i.e. right in front of the data it was computed from
63 m_bytecode.insert(m_bytecode.begin() + bytecode::HeaderSize, hash_out.begin(), hash_out.end());
64
66 }
67
 // Dump the IR given to process (m_ir) to an output stream, in a textual form.
 //
 // For every block: a "page_<index>" line, then one line per entity, then an empty line.
 //
 // NOTE(review): the embedded listing numbers skip 86, 90 and 94; the three print statements that
 // follow the Goto case have no visible `case` label in this extract -- confirm against the original file.
68 void IRCompiler::dumpToStream(std::ostream& stream) const
69 {
 // running page number, only used for the "page_<index>" header
70 std::size_t index = 0;
71 for (const auto& block : m_ir)
72 {
73 fmt::println(stream, "page_{}", index);
74 for (const auto& entity : block)
75 {
76 switch (entity.kind())
77 {
78 case IR::Kind::Label:
 // labels are printed unindented, as ".L<label>:"
79 fmt::println(stream, ".L{}:", entity.label());
80 break;
81
82 case IR::Kind::Goto:
 // instruction name followed by the label it targets
83 fmt::println(stream, "\t{} L{}", InstructionNames[entity.inst()], entity.label());
84 break;
85
 // instruction name, target label, then the primary argument
87 fmt::println(stream, "\t{} L{}, {}", InstructionNames[entity.inst()], entity.label(), entity.primaryArg());
88 break;
89
 // instruction name and its primary argument
91 fmt::println(stream, "\t{} {}", InstructionNames[entity.inst()], entity.primaryArg());
92 break;
93
 // instruction name, primary and secondary arguments
95 fmt::println(stream, "\t{} {}, {}", InstructionNames[entity.inst()], entity.primaryArg(), entity.secondaryArg());
96 break;
97 }
98 }
99
 // blank line between two pages
100 fmt::println(stream, "");
101 ++index;
102 }
103 }
104
105 const bytecode_t& IRCompiler::bytecode() const noexcept
106 {
107 return m_bytecode;
108 }
109
 // Body that turns every page of m_ir into words appended to m_bytecode.
 //
 // NOTE(review): the signature line (listing line 110) is not visible in this extract; this is
 // presumably IRCompiler::compile(), which process() calls right after filling m_ir -- confirm.
 // Listing lines 127-128, 154 and 160 are missing as well.
111 {
112 // push the different code segments
113 for (std::size_t i = 0, end = m_ir.size(); i < end; ++i)
114 {
115 IR::Block& page = m_ir[i];
116 // just in case we got too far, always add a HALT to be sure the
117 // VM won't do anything crazy
118 page.emplace_back(HALT);
119
120 // push number of elements
 // labels are excluded from the count (the appended HALT is included)
121 const auto page_size = std::ranges::count_if(page, [](const auto& a) {
122 return a.kind() != IR::Kind::Label;
123 });
 // std::cmp_greater compares the signed count with the unsigned limit without conversion surprises
124 if (std::cmp_greater(page_size, std::numeric_limits<uint16_t>::max()))
125 throw std::overflow_error(fmt::format("Size of page {} exceeds the maximum size of 2^16 - 1", i));
126
129
130 // register labels position
 // pos only advances on non-label entities, so a label maps to the index of the entity that follows it
131 uint16_t pos = 0;
132 std::unordered_map<IR::label_t, uint16_t> label_to_position;
133 for (auto& inst : page)
134 {
135 switch (inst.kind())
136 {
137 case IR::Kind::Label:
138 label_to_position[inst.label()] = pos;
139 break;
140
141 default:
142 ++pos;
143 }
144 }
145
 // second pass: emit one word per entity, with labels replaced by the positions registered above
 // NOTE(review): operator[] value-initialises a missing key, so a label that was never registered
 // in this page would resolve to position 0 instead of raising an error -- confirm this cannot happen.
146 for (auto& inst : page)
147 {
148 switch (inst.kind())
149 {
150 case IR::Kind::Goto:
151 pushWord(Word(inst.inst(), label_to_position[inst.label()]));
152 break;
153
 // (case label not visible in this extract) instruction carrying both a primary argument and a label
155 pushWord(Word(inst.inst(), inst.primaryArg(), label_to_position[inst.label()]));
156 break;
157
158 case IR::Kind::Opcode:
159 [[fallthrough]];
 // (the case label the fallthrough leads to is not visible in this extract)
161 pushWord(inst.bytecode());
162 break;
163
164 default:
 // any other kind produces no word
165 break;
166 }
167 }
168 }
169 }
170
171 void IRCompiler::pushWord(const Word& word)
172 {
173 m_bytecode.push_back(word.opcode);
174 m_bytecode.push_back(word.byte_1);
175 m_bytecode.push_back(word.byte_2);
176 m_bytecode.push_back(word.byte_3);
177 }
178
 // Body that appends the file header (magic, version, timestamp) to m_bytecode.
 //
 // NOTE(review): the signature line (listing line 179) is not visible in this extract; the content
 // matches the "push the file headers (magic, version used, timestamp)" description of
 // pushFileHeader() -- confirm. Listing line 196 (the body of the version loop) is missing too.
180 {
181 /*
182 Generating headers:
183 - lang name (to be sure we are executing an ArkScript file)
184 on 4 bytes (ark + padding)
185 - version (major: 2 bytes, minor: 2 bytes, patch: 2 bytes)
186 - timestamp (8 bytes, unix format)
187 */
188
 // magic: the three characters 'a', 'r', 'k' followed by a zero byte
189 m_bytecode.push_back('a');
190 m_bytecode.push_back('r');
191 m_bytecode.push_back('k');
192 m_bytecode.push_back(0_u8);
193
194 // push version
 // NOTE(review): the statement executed for each version component is not visible in this extract
195 for (const int n : std::array { ARK_VERSION_MAJOR, ARK_VERSION_MINOR, ARK_VERSION_PATCH })
197
198 // push timestamp
 // number of seconds elapsed since the system clock's epoch
199 const long long timestamp = std::chrono::duration_cast<std::chrono::seconds>(
200 std::chrono::system_clock::now().time_since_epoch())
201 .count();
 // written one byte at a time, most significant byte first (the shift goes from 56 down to 0)
202 for (long i = 0; i < 8; ++i)
203 {
204 const long shift = 8 * (7 - i);
205 const auto ts_byte = static_cast<uint8_t>((timestamp & (0xffLL << shift)) >> shift);
206 m_bytecode.push_back(ts_byte);
207 }
208 }
209
 // Append the symbol table to m_bytecode.
 //
 // symbols: names to write, each one as its raw characters followed by a zero byte
 // throws std::overflow_error when there are more symbols than a uint16_t can count
 //
 // NOTE(review): listing line 217 (right after the table start marker) is missing from this extract;
 // presumably it writes the number of symbols -- confirm against the original file.
210 void IRCompiler::pushSymbolTable(const std::vector<std::string>& symbols)
211 {
212 const std::size_t symbol_size = symbols.size();
213 if (symbol_size > std::numeric_limits<uint16_t>::max())
214 throw std::overflow_error(fmt::format("Too many symbols: {}, exceeds the maximum size of 2^16 - 1", symbol_size));
215
216 m_bytecode.push_back(SYM_TABLE_START);
218
219 for (const auto& sym : symbols)
220 {
221 // push the string, null terminated
222 std::ranges::transform(sym, std::back_inserter(m_bytecode), [](const char i) {
223 return static_cast<uint8_t>(i);
224 });
225 m_bytecode.push_back(0_u8);
226 }
227 }
228
 // Append the value table to m_bytecode.
 //
 // values: constants to write; each entry is a type tag, a payload depending on val.type, then a zero byte
 // throws std::overflow_error when there are more values than a uint16_t can count
 //
 // NOTE(review): listing lines 236, 242, 252, 262 and 266 are missing from this extract: the three
 // `case` labels of the switch are not visible, nor is the statement that uses `addr` -- confirm
 // against the original file.
229 void IRCompiler::pushValueTable(const std::vector<ValTableElem>& values)
230 {
231 const std::size_t value_size = values.size();
232 if (value_size > std::numeric_limits<uint16_t>::max())
233 throw std::overflow_error(fmt::format("Too many values: {}, exceeds the maximum size of 2^16 - 1", value_size));
234
235 m_bytecode.push_back(VAL_TABLE_START);
237
238 for (const ValTableElem& val : values)
239 {
240 switch (val.type)
241 {
 // number: tag, then the exponent and the mantissa returned by ieee754::serialize, both via serializeToVecLE
243 {
244 m_bytecode.push_back(NUMBER_TYPE);
245 const auto n = std::get<double>(val.value);
246 const auto [exponent, mantissa] = ieee754::serialize(n);
247 serializeToVecLE(exponent, m_bytecode);
248 serializeToVecLE(mantissa, m_bytecode);
249 break;
250 }
251
 // string: tag, then the raw characters (t is a copy of the stored string)
253 {
254 m_bytecode.push_back(STRING_TYPE);
255 auto t = std::get<std::string>(val.value);
256 std::ranges::transform(t, std::back_inserter(m_bytecode), [](const char i) {
257 return static_cast<uint8_t>(i);
258 });
259 break;
260 }
261
 // function: tag, then presumably the address read below (the line writing it is not visible)
263 {
264 m_bytecode.push_back(FUNC_TYPE);
265 const std::size_t addr = std::get<std::size_t>(val.value);
267 break;
268 }
269 }
270
 // every entry, whatever its type, ends with a zero byte
271 m_bytecode.push_back(0_u8);
272 }
273 }
274
 // Body that appends the names stored in m_filenames to m_bytecode, each one as its raw
 // characters followed by a zero byte.
 // Throws std::overflow_error when there are more filenames than a uint16_t can count.
 //
 // NOTE(review): the signature line (listing line 275) and listing lines 280 and 282 are missing
 // from this extract, so the function name and the statements writing the table marker / element
 // count are not visible -- confirm against the original file.
276 {
277 if (m_filenames.size() > std::numeric_limits<uint16_t>::max())
278 throw std::overflow_error(fmt::format("Too many filenames: {}, exceeds the maximum size of 2^16 - 1", m_filenames.size()));
279
281 // push number of elements
283
284 for (const auto& name : m_filenames)
285 {
286 std::ranges::transform(name, std::back_inserter(m_bytecode), [](const char i) {
287 return static_cast<uint8_t>(i);
288 });
289 m_bytecode.push_back(0_u8);
290 }
291 }
292
 // Build the instruction location table from the IR and append it to m_bytecode.
 //
 // pages: IR blocks to scan; m_filenames must already contain every filename they reference
 //
 // NOTE(review): listing line 327 (just before the entry count is written) is missing from this
 // extract -- confirm against the original file.
293 void IRCompiler::pushInstLocTable(const std::vector<IR::Block>& pages)
294 {
295 std::vector<internal::InstLoc> locations;
296 for (std::size_t i = 0, end = pages.size(); i < end; ++i)
297 {
298 const auto& page = pages[i];
 // index of the current entity within the page, labels not counted
299 uint16_t ip = 0;
300
301 for (const auto& inst : page)
302 {
303 if (inst.hasValidSourceLocation())
304 {
305 // we are guaranteed to have a value since we listed all existing filenames in IRCompiler::process before,
306 // thus we do not have to check if std::ranges::find returned a valid iterator.
307 auto file_id = static_cast<uint16_t>(std::distance(m_filenames.begin(), std::ranges::find(m_filenames, inst.filename())));
308
309 std::optional<internal::InstLoc> prev = std::nullopt;
310 if (!locations.empty())
311 prev = locations.back();
312
313 // skip redundant instruction location
 // i.e. record a new entry unless the last recorded one has the same file, line and page
314 if (!(prev.has_value() && prev->filename_id == file_id && prev->line == inst.sourceLine() && prev->page_pointer == i))
315 locations.push_back(
316 { .page_pointer = static_cast<uint16_t>(i),
317 .inst_pointer = ip,
318 .filename_id = file_id,
319 .line = static_cast<uint32_t>(inst.sourceLine()) });
320 }
321
322 if (inst.kind() != IR::Kind::Label)
323 ++ip;
324 }
325 }
326
 // number of entries, on 2 bytes
 // NOTE(review): no range check on locations.size() is visible before this call, unlike the
 // symbol / value / filename tables -- confirm more than 2^16 - 1 entries cannot occur.
328 serializeOn2BytesToVecBE(locations.size(), m_bytecode);
329
 // NOTE(review): this `prev` is assigned in the loop below but never read
330 std::optional<internal::InstLoc> prev = std::nullopt;
331
332 for (const auto& loc : locations)
333 {
 // entry layout: page pointer, instruction pointer and filename id (2 bytes each), then the line number
334 serializeOn2BytesToVecBE(loc.page_pointer, m_bytecode);
335 serializeOn2BytesToVecBE(loc.inst_pointer, m_bytecode);
336 serializeOn2BytesToVecBE(loc.filename_id, m_bytecode);
337 serializeToVecBE(loc.line, m_bytecode);
338
339 prev = loc;
340 }
341 }
342}
Constants used by ArkScript.
constexpr int ARK_VERSION_MAJOR
Definition Constants.hpp:17
constexpr int ARK_VERSION_PATCH
Definition Constants.hpp:19
constexpr int ARK_VERSION_MINOR
Definition Constants.hpp:18
Compile the intermediate representation to bytecode.
User defined literals for Ark internals.
void pushInstLocTable(const std::vector< IR::Block > &pages)
IRCompiler(unsigned debug)
Create a new IRCompiler.
void dumpToStream(std::ostream &stream) const
Dump the IR given to process to an output stream.
const bytecode_t & bytecode() const noexcept
Return the constructed bytecode object.
std::vector< std::string > m_filenames
void pushWord(const Word &word)
Push a word (4 bytes) to the m_bytecode.
std::vector< IR::Block > m_ir
void pushFileHeader() noexcept
Push the file headers (magic, version used, timestamp)
void process(const std::vector< IR::Block > &pages, const std::vector< std::string > &symbols, const std::vector< ValTableElem > &values)
Turn a given IR into bytecode.
void pushValueTable(const std::vector< ValTableElem > &values)
void pushSymbolTable(const std::vector< std::string > &symbols)
void traceStart(std::string &&trace_name)
Definition Logger.hpp:90
std::vector< Entity > Block
Definition Entity.hpp:84
constexpr std::size_t HeaderSize
Definition Common.hpp:39
DecomposedDouble serialize(const double n)
void serializeToVecBE(std::integral auto number, std::vector< uint8_t > &out)
void serializeToVecLE(std::integral auto number, std::vector< uint8_t > &out)
void serializeOn2BytesToVecBE(std::integral auto number, std::vector< uint8_t > &out)
constexpr std::array InstructionNames
std::vector< uint8_t > bytecode_t
Definition Common.hpp:22
A Compiler Value class helper to handle multiple types.
uint8_t opcode
Instruction opcode.
Definition Word.hpp:18