From 106abebc276f4ad119e6fd4af79d51229a929419 Mon Sep 17 00:00:00 2001 From: Ludovic 'Archivist' Lagouardette Date: Wed, 12 Jul 2023 18:20:25 +0200 Subject: [PATCH] Clean up the repo and split large multipurpose files into more single purpose files --- CMakeLists.txt | 4 +- priv_include/UserScript/interpreter.h | 155 +++++++ {include => priv_include}/UserScript/parser.h | 0 script_exe/main.cpp | 17 +- src/generator.cpp | 219 ++++++++++ src/interpreter.cpp | 314 +-------------- src/lex.cpp | 381 ++++++++++++++++++ src/{lex_parse.cpp => parse.cpp} | 378 +---------------- 8 files changed, 773 insertions(+), 695 deletions(-) create mode 100644 priv_include/UserScript/interpreter.h rename {include => priv_include}/UserScript/parser.h (100%) create mode 100644 src/generator.cpp create mode 100644 src/lex.cpp rename src/{lex_parse.cpp => parse.cpp} (66%) diff --git a/CMakeLists.txt b/CMakeLists.txt index d07dadd..19c3faa 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -21,12 +21,12 @@ include(Catch) add_library(UserScript STATIC src/interpreter.cpp - src/lex_parse.cpp) + src/lex.cpp src/parse.cpp priv_include/UserScript/interpreter.h src/generator.cpp) target_include_directories(UserScript PUBLIC include) +include_directories(priv_include) add_executable(ushell script_exe/main.cpp) target_link_libraries(ushell PUBLIC UserScript) -include_directories(include) add_executable(userscript_tests tests/lexer_test.cpp tests/parser_test.cpp) target_link_libraries(userscript_tests PUBLIC UserScript Catch2::Catch2WithMain) diff --git a/priv_include/UserScript/interpreter.h b/priv_include/UserScript/interpreter.h new file mode 100644 index 0000000..85b78e0 --- /dev/null +++ b/priv_include/UserScript/interpreter.h @@ -0,0 +1,155 @@ +#pragma once +#include +#include +#include "UserScript.h" +#include "UserScript/parser.h" + +namespace scripting { + class ByteCodeInterpreter final : public UserScript { + std::map variables; + std::map functions; + std::vector execution_stack; + + public: + struct function_tag { + std::string name; + size_t arity; + std::shared_ptr location; + }; + + struct variable_tag { + std::string name; + std::shared_ptr location; + }; + + enum class operator_t : uint8_t { + logical_not, + binary_not, + unary_plus, + unary_minus, + divide, + modulo, + multiply, + subtract, + add, + bitshift_left, + bitshift_right, + rotate_left, + rotate_right, + less_than, + greater_than, + less_or_equal_than, + greater_or_equal_than, + equals, + different, + binary_and, + binary_or, + binary_xor, + logical_and, + logical_or, + INTERNAL_jump, + INTERNAL_jump_if, + INTERNAL_stack_cls, + }; + + struct operand { + std::variant element; + std::shared_ptr location; + }; + + std::optional> getValue(const std::string &name) { + if (auto var = variables.find(name); var != variables.end()) { + return var->second; + } else { + return std::nullopt; + } + } + + bool setValue(const std::string &name, script_value value) { + if (auto var = variables.find(name); var != variables.end()) { + var->second = value; + return true; + } else { + variables.emplace(std::make_pair(name, value)); + return false; + } + } + + std::vector bytecode; + size_t instruction_ptr; + + script_value resolve(const std::string &name) final { + auto it = variables.find(name); + if (it == variables.end()) { + return script_value{}; + } + return (*it).second; + } + + script_value resolve_and_pop() { + if (execution_stack.empty()) return script_value{}; + auto value = std::move(execution_stack.back()); + auto resolved = std::visit([&](auto v) -> script_value { + if constexpr (std::is_same_v) { + auto it = variables.find(v.name); + if (it == variables.end()) { + return script_value{}; + } + return (*it).second; + } else { + return v; + } + }, value); + execution_stack.pop_back(); + return resolved; + } + + void big_f_ing_switch(operand &op, std::optional &error); + + std::vector + generate(std::vector &errors, ast::block &tree, bool loop = true); + + void registerFunction(std::string name, function fn) final { + functions.insert_or_assign(name, std::move(fn)); + } + + std::variant> executeAtOnce(std::string code) final { + std::vector errors; + auto lexed = ast::lex(code, errors); + auto parsed = ast::parse(lexed, errors); + if (not errors.empty()) return errors; + bytecode = generate(errors, parsed, false); + if (not errors.empty()) return errors; + std::optional maybe_error; + instruction_ptr = 0; + while (instruction_ptr < bytecode.size()) { + step(maybe_error); + if (maybe_error) return std::vector({maybe_error.value()}); + } + auto v = resolve_and_pop(); + execution_stack.clear(); + return v; + } + + std::vector prepare(std::string code) final { + std::vector errors; + auto lexed = ast::lex(code, errors); + auto parsed = ast::parse(lexed, errors); + if (errors.empty()) { + bytecode = generate(errors, parsed, true); + } + + return errors; + } + + std::optional stepOnce() final { + std::optional error; + while (not step(error)); + return error; + } + + bool step(std::optional &error); + + ~ByteCodeInterpreter() final {} + }; +} \ No newline at end of file diff --git a/include/UserScript/parser.h b/priv_include/UserScript/parser.h similarity index 100% rename from include/UserScript/parser.h rename to priv_include/UserScript/parser.h diff --git a/script_exe/main.cpp b/script_exe/main.cpp index 06d750d..e4bf977 100644 --- a/script_exe/main.cpp +++ b/script_exe/main.cpp @@ -127,7 +127,7 @@ void process_bench(std::string target = "./tests/scripts/testfile.test") { decltype(std::chrono::high_resolution_clock::now()-std::chrono::high_resolution_clock::now()) per_exec{}, per_step{}, per_op{}; - for(int runs = 0; runs < 20; runs++) { + for(int runs = 0; runs < 5000; runs++) { auto res = engine->prepare(code.str()); @@ -137,14 +137,13 @@ void process_bench(std::string target = "./tests/scripts/testfile.test") { steps++; } auto end = std::chrono::high_resolution_clock::now(); - per_exec += (end - begin) / 5000; - per_step += (end - begin) / steps; - per_op += (end - begin) / (5000 * 53); + per_exec += (end - begin); + per_step += (end - begin); + per_op += (end - begin); } - - per_exec /= 20; - per_step /= 20; - per_op /= 20; + per_exec /= 5000; + per_step /= steps; + per_op = per_op / 5000 / 53; std::cout << "time per exec = " << std::chrono::duration_cast(per_exec).count() << "ns\n"; std::cout << "time per step = " << std::chrono::duration_cast(per_step).count() << "ns\n"; @@ -325,7 +324,7 @@ int cpp_main(std::span args) { } else if(args.front() == "bench_compile") { args = args.subspan(1); if(args.size() > 1) { - std::cerr << "bench_exec expects 0 or 1 file as arguments" << std::endl; + std::cerr << "bench_compile expects 0 or 1 file as arguments" << std::endl; std::terminate(); } if(args.empty()) compile_bench(); diff --git a/src/generator.cpp b/src/generator.cpp new file mode 100644 index 0000000..4cf0574 --- /dev/null +++ b/src/generator.cpp @@ -0,0 +1,219 @@ +#include "UserScript/interpreter.h" + +namespace scripting { + +// Replace with constexpr vector & find ? + static const std::map mappings = { + {ast::operator_t::logical_not, ByteCodeInterpreter::operator_t::logical_not}, + {ast::operator_t::binary_not, ByteCodeInterpreter::operator_t::binary_not}, + {ast::operator_t::divide, ByteCodeInterpreter::operator_t::divide}, + {ast::operator_t::modulo, ByteCodeInterpreter::operator_t::modulo}, + {ast::operator_t::multiply, ByteCodeInterpreter::operator_t::multiply}, + {ast::operator_t::subtract, ByteCodeInterpreter::operator_t::subtract}, + {ast::operator_t::add, ByteCodeInterpreter::operator_t::add}, + {ast::operator_t::bitshift_left, ByteCodeInterpreter::operator_t::bitshift_left}, + {ast::operator_t::bitshift_right, ByteCodeInterpreter::operator_t::bitshift_right}, + {ast::operator_t::rotate_left, ByteCodeInterpreter::operator_t::rotate_left}, + {ast::operator_t::rotate_right, ByteCodeInterpreter::operator_t::rotate_right}, + {ast::operator_t::less_than, ByteCodeInterpreter::operator_t::less_than}, + {ast::operator_t::greater_than, ByteCodeInterpreter::operator_t::greater_than}, + {ast::operator_t::less_or_equal_than, ByteCodeInterpreter::operator_t::less_or_equal_than}, + {ast::operator_t::greater_or_equal_than, ByteCodeInterpreter::operator_t::greater_or_equal_than}, + {ast::operator_t::equals, ByteCodeInterpreter::operator_t::equals}, + {ast::operator_t::different, ByteCodeInterpreter::operator_t::different}, + {ast::operator_t::binary_and, ByteCodeInterpreter::operator_t::binary_and}, + {ast::operator_t::binary_or, ByteCodeInterpreter::operator_t::binary_or}, + {ast::operator_t::binary_xor, ByteCodeInterpreter::operator_t::binary_xor}, + {ast::operator_t::logical_and, ByteCodeInterpreter::operator_t::logical_and}, + {ast::operator_t::logical_or, ByteCodeInterpreter::operator_t::logical_or}, + }; + +/// GENERATION HANDLERS DECLARATIONS + + template + void handle(std::vector &, std::vector &, T &); + + template<> + void handle(std::vector &ctx, std::vector &errors, + ast::block &block); + + template<> + void + handle(std::vector &ctx, std::vector &errors, + ast::command_expression &cmd); + + template<> + void handle(std::vector &ctx, + std::vector &errors, + ast::binary_algebraic_expression &cmd); + + template<> + void handle(std::vector &ctx, + std::vector &errors, + ast::unary_algebraic_expression &cmd); + + template<> + void + handle(std::vector &ctx, std::vector &errors, + ast::paren_expression &cmd); + + template<> + void handle(std::vector &ctx, std::vector &errors, + ast::conditional &cmd); + + template<> + void handle(std::vector &ctx, std::vector &errors, + ast::while_loop &cmd); + + template<> + void handle(std::vector &ctx, std::vector &errors, + ast::expression &cmd); + + template<> + void + handle(std::vector &ctx, std::vector &errors, + ast::variable_expression &cmd); + + template<> + void handle(std::vector &ctx, + std::vector &errors, ast::literal_int_expression &cmd); + + template<> + void handle(std::vector &ctx, + std::vector &errors, ast::literal_string_expression &cmd); + +/// GENERATION HANDLERS DEFINITIONS + + template<> + void handle(std::vector &ctx, std::vector &errors, + ast::block &block) { + for (auto &elem: block.contents) { + std::visit([&](auto &v) { handle(ctx, errors, *v); }, elem.contents); + ctx.push_back(ByteCodeInterpreter::operand{.element = ByteCodeInterpreter::operator_t::INTERNAL_stack_cls}); + } + } + + template<> + void + handle(std::vector &ctx, std::vector &errors, + ast::command_expression &cmd) { + for (auto it = cmd.arguments.rbegin(); it != cmd.arguments.rend(); ++it) { + std::visit([&](auto &v) { handle(ctx, errors, *v); }, (*it)->contents); + } + ctx.push_back( + ByteCodeInterpreter::operand{.element = ByteCodeInterpreter::function_tag{.name = cmd.name.value, .arity = cmd.arguments.size()}, .location = cmd.location}); + } + + template<> + void + handle(std::vector &ctx, std::vector &errors, + ast::paren_expression &expr) { + std::visit([&](auto &v) { handle(ctx, errors, *v); }, expr.content); + } + + template<> + void handle(std::vector &ctx, std::vector &errors, + ast::expression &expr) { + std::visit([&](auto &v) { handle(ctx, errors, *v); }, expr.contents); + } + + template<> + void handle(std::vector &ctx, + std::vector &errors, + ast::binary_algebraic_expression &expr) { + handle(ctx, errors, *expr.lhs); + handle(ctx, errors, *expr.rhs); + ctx.push_back(ByteCodeInterpreter::operand{.element = mappings.at(expr.op), .location = expr.location}); + } + + template<> + void handle(std::vector &ctx, + std::vector &errors, + ast::unary_algebraic_expression &expr) { + handle(ctx, errors, *expr.content); + ctx.push_back(ByteCodeInterpreter::operand{.element = mappings.at(expr.op), .location = expr.location}); + } + + template<> + void + handle(std::vector &ctx, std::vector &errors, + ast::variable_expression &expr) { + ctx.push_back(ByteCodeInterpreter::operand{ + ByteCodeInterpreter::variable_tag{.name = expr.name.value, .location = expr.location}}); + } + + template<> + void handle(std::vector &ctx, + std::vector &errors, ast::literal_int_expression &expr) { + ctx.push_back(ByteCodeInterpreter::operand{script_value{expr.value}}); + } + + template<> + void handle(std::vector &ctx, + std::vector &errors, + ast::literal_string_expression &expr) { + ctx.push_back(ByteCodeInterpreter::operand{script_value{expr.value}}); + } + + template<> + void handle(std::vector &ctx, std::vector &errors, + ast::conditional &cond) { + /// some basic documentation (from before the reference stability bug but things are the same): + /// https://app.excalidraw.com/s/hxPegpAmTX/2c8KKzinqeg + std::visit([&](auto &v) { handle(ctx, errors, *v); }, cond.condition->contents); + ctx.push_back(ByteCodeInterpreter::operand{.element = script_value{}, .location = cond.location}); + /// As you can see, being smart is dumb, be a fucking monkey that comes from the 70s and use 70s technology:tm: to your advantage + /// More seriously, WTF (?) we do this because we used to have a bug with unreliable references to these locations, which makes sense since we + /// don't have reference stability + auto else_side_idx = ctx.size() - 1; + ctx.push_back( + ByteCodeInterpreter::operand{.element = ByteCodeInterpreter::operator_t::INTERNAL_jump_if, .location = cond.location}); + handle(ctx, errors, *cond.on_condition); + if (cond.otherwise) { + ctx.push_back(ByteCodeInterpreter::operand{.element = script_value{}, .location = cond.location}); + auto end_side_idx = ctx.size() - 1; + ctx.push_back( + ByteCodeInterpreter::operand{.element = ByteCodeInterpreter::operator_t::INTERNAL_jump, .location = cond.location}); + ctx[else_side_idx].element = static_cast(ctx.size()) - 1; + ctx[else_side_idx].location = cond.location; + handle(ctx, errors, *cond.otherwise); + ctx[end_side_idx].element = static_cast(ctx.size()) - 1; + ctx[end_side_idx].location = cond.location; + } else { + ctx[else_side_idx].element = static_cast(ctx.size()) - 1; + ctx[else_side_idx].location = cond.location; + } + } + + template<> + void handle(std::vector &ctx, std::vector &errors, + ast::while_loop &cond) { + auto beforewhile_side_idx = static_cast(ctx.size()) - 1; + std::visit([&](auto &v) { handle(ctx, errors, *v); }, cond.condition->contents); + ctx.push_back(ByteCodeInterpreter::operand{.element = script_value{}, .location = cond.location}); + auto endwhile_side_idx = ctx.size() - 1; + ctx.push_back( + ByteCodeInterpreter::operand{.element = ByteCodeInterpreter::operator_t::INTERNAL_jump_if, .location = cond.location}); + handle(ctx, errors, *cond.on_condition); + ctx.push_back( + ByteCodeInterpreter::operand{.element = script_value{beforewhile_side_idx}, .location = cond.location}); + ctx.push_back( + ByteCodeInterpreter::operand{.element = ByteCodeInterpreter::operator_t::INTERNAL_jump, .location = cond.location}); + ctx[endwhile_side_idx].element = static_cast(ctx.size()) - 1; + ctx[endwhile_side_idx].location = cond.location; + } + + std::vector + ByteCodeInterpreter::generate(std::vector &errors, ast::block &tree, bool loop) { + std::vector code; + + handle(code, errors, tree); + if (loop) { + // Here we have to deal with the quirks of jumping before the increments happens again + code.push_back(ByteCodeInterpreter::operand{.element = script_value{-1}, .location = tree.location}); + code.push_back( + ByteCodeInterpreter::operand{.element = ByteCodeInterpreter::operator_t::INTERNAL_jump, .location = tree.location}); + } + return code; + } +} \ No newline at end of file diff --git a/src/interpreter.cpp b/src/interpreter.cpp index 8e9834c..b31d26a 100644 --- a/src/interpreter.cpp +++ b/src/interpreter.cpp @@ -1,7 +1,4 @@ -#include -#include -#include "UserScript.h" -#include "UserScript/parser.h" +#include "UserScript/interpreter.h" namespace scripting { void to_null(script_value& value, auto on_value, auto on_error) { @@ -33,151 +30,6 @@ namespace scripting { } } - class ByteCodeInterpreter final : public UserScript { - std::map variables; - std::map functions; - std::vector execution_stack; - - public: - struct function_tag { - std::string name; - size_t arity; - std::shared_ptr location; - }; - - struct variable_tag { - std::string name; - std::shared_ptr location; - }; - - enum class operator_t : uint8_t { - logical_not, - binary_not, - unary_plus, - unary_minus, - divide, - modulo, - multiply, - subtract, - add, - bitshift_left, - bitshift_right, - rotate_left, - rotate_right, - less_than, - greater_than, - less_or_equal_than, - greater_or_equal_than, - equals, - different, - binary_and, - binary_or, - binary_xor, - logical_and, - logical_or, - INTERNAL_jump, - INTERNAL_jump_if, - INTERNAL_stack_cls, - }; - - struct operand { - std::variant element; - std::shared_ptr location; - }; - - std::optional> getValue(const std::string& name) { - if(auto var = variables.find(name); var != variables.end()) { - return var->second; - } else { - return std::nullopt; - } - } - bool setValue(const std::string& name, script_value value) { - if(auto var = variables.find(name); var != variables.end()) { - var->second = value; - return true; - } else { - variables.emplace(std::make_pair(name, value)); - return false; - } - } - - std::vector bytecode; - size_t instruction_ptr; - - script_value resolve(const std::string& name) final { - auto it = variables.find(name); - if(it == variables.end()) { - return script_value{}; - } - return (*it).second; - } - - script_value resolve_and_pop() { - if(execution_stack.empty()) return script_value{}; - auto value = std::move(execution_stack.back()); - auto resolved = std::visit([&](auto v) -> script_value { - if constexpr (std::is_same_v) { - auto it = variables.find(v.name); - if(it == variables.end()) { - return script_value{}; - } - return (*it).second; - } else { - return v; - } - }, value); - execution_stack.pop_back(); - return resolved; - } - - void big_f_ing_switch(operand& op, std::optional& error); - - std::vector generate(std::vector& errors, ast::block &tree, bool loop = true); - - void registerFunction(std::string name, function fn) final { - functions.insert_or_assign(name, std::move(fn)); - } - - std::variant> executeAtOnce(std::string code) final { - std::vector errors; - auto lexed = ast::lex(code, errors); - auto parsed = ast::parse(lexed, errors); - if(not errors.empty()) return errors; - bytecode = generate(errors, parsed, false); - if(not errors.empty()) return errors; - std::optional maybe_error; - instruction_ptr = 0; - while(instruction_ptr < bytecode.size()) { - step(maybe_error); - if(maybe_error) return std::vector({maybe_error.value()}); - } - auto v = resolve_and_pop(); - execution_stack.clear(); - return v; - } - - std::vector prepare(std::string code) final { - std::vector errors; - auto lexed = ast::lex(code, errors); - auto parsed = ast::parse(lexed, errors); - if(errors.empty()) { - bytecode = generate(errors, parsed, true); - } - - return errors; - } - - std::optional stepOnce() final { - std::optional error; - while(not step(error)); - return error; - } - - bool step(std::optional& error); - ~ByteCodeInterpreter() final {} - }; - namespace wizardry { // taken from cppreference: https://en.cppreference.com/w/cpp/utility/variant/visit template @@ -236,168 +88,6 @@ namespace scripting { return instruction_ptr >= bytecode.size() || error || ret; } - // Replace with constexpr vector & find ? - static const std::map mappings = { - {ast::operator_t::logical_not, ByteCodeInterpreter::operator_t::logical_not}, - {ast::operator_t::binary_not, ByteCodeInterpreter::operator_t::binary_not}, - {ast::operator_t::divide, ByteCodeInterpreter::operator_t::divide}, - {ast::operator_t::modulo, ByteCodeInterpreter::operator_t::modulo}, - {ast::operator_t::multiply, ByteCodeInterpreter::operator_t::multiply}, - {ast::operator_t::subtract, ByteCodeInterpreter::operator_t::subtract}, - {ast::operator_t::add, ByteCodeInterpreter::operator_t::add}, - {ast::operator_t::bitshift_left, ByteCodeInterpreter::operator_t::bitshift_left}, - {ast::operator_t::bitshift_right, ByteCodeInterpreter::operator_t::bitshift_right}, - {ast::operator_t::rotate_left, ByteCodeInterpreter::operator_t::rotate_left}, - {ast::operator_t::rotate_right, ByteCodeInterpreter::operator_t::rotate_right}, - {ast::operator_t::less_than, ByteCodeInterpreter::operator_t::less_than}, - {ast::operator_t::greater_than, ByteCodeInterpreter::operator_t::greater_than}, - {ast::operator_t::less_or_equal_than, ByteCodeInterpreter::operator_t::less_or_equal_than}, - {ast::operator_t::greater_or_equal_than, ByteCodeInterpreter::operator_t::greater_or_equal_than}, - {ast::operator_t::equals, ByteCodeInterpreter::operator_t::equals}, - {ast::operator_t::different, ByteCodeInterpreter::operator_t::different}, - {ast::operator_t::binary_and, ByteCodeInterpreter::operator_t::binary_and}, - {ast::operator_t::binary_or, ByteCodeInterpreter::operator_t::binary_or}, - {ast::operator_t::binary_xor, ByteCodeInterpreter::operator_t::binary_xor}, - {ast::operator_t::logical_and, ByteCodeInterpreter::operator_t::logical_and}, - {ast::operator_t::logical_or, ByteCodeInterpreter::operator_t::logical_or}, - }; - - /// GENERATION HANDLERS DECLARATIONS - - template - void handle(std::vector&, std::vector&, T&); - template<> - void handle(std::vector& ctx, std::vector& errors, ast::block& block); - template<> - void handle(std::vector& ctx, std::vector& errors, ast::command_expression& cmd); - template<> - void handle(std::vector& ctx, std::vector& errors, ast::binary_algebraic_expression& cmd); - template<> - void handle(std::vector& ctx, std::vector& errors, ast::unary_algebraic_expression& cmd); - template<> - void handle(std::vector& ctx, std::vector& errors, ast::paren_expression& cmd); - template<> - void handle(std::vector& ctx, std::vector& errors, ast::conditional& cmd); - template<> - void handle(std::vector& ctx, std::vector& errors, ast::while_loop& cmd); - template<> - void handle(std::vector& ctx, std::vector& errors, ast::expression& cmd); - template<> - void handle(std::vector& ctx, std::vector& errors, ast::variable_expression& cmd); - template<> - void handle(std::vector& ctx, std::vector& errors, ast::literal_int_expression& cmd); - template<> - void handle(std::vector& ctx, std::vector& errors, ast::literal_string_expression& cmd); - - /// GENERATION HANDLERS DEFINITIONS - - template<> - void handle(std::vector& ctx, std::vector& errors, ast::block& block) { - for(auto& elem : block.contents) { - std::visit([&](auto& v) {handle(ctx, errors, *v);}, elem.contents); - ctx.push_back(ByteCodeInterpreter::operand{.element = ByteCodeInterpreter::operator_t::INTERNAL_stack_cls}); - } - } - - template<> - void handle(std::vector& ctx, std::vector& errors, ast::command_expression& cmd) { - for(auto it = cmd.arguments.rbegin(); it != cmd.arguments.rend(); ++it) { - std::visit([&](auto& v) {handle(ctx, errors, *v);}, (*it)->contents); - } - ctx.push_back(ByteCodeInterpreter::operand{.element = ByteCodeInterpreter::function_tag{.name = cmd.name.value, .arity = cmd.arguments.size()}, .location = cmd.location}); - } - - template<> - void handle(std::vector& ctx, std::vector& errors, ast::paren_expression& expr) { - std::visit([&](auto& v) {handle(ctx, errors, *v);}, expr.content); - } - - template<> - void handle(std::vector& ctx, std::vector& errors, ast::expression& expr) { - std::visit([&](auto& v) {handle(ctx, errors, *v);}, expr.contents); - } - - template<> - void handle(std::vector& ctx, std::vector& errors, ast::binary_algebraic_expression& expr) { - handle(ctx, errors, *expr.lhs); - handle(ctx, errors, *expr.rhs); - ctx.push_back(ByteCodeInterpreter::operand{.element = mappings.at(expr.op), .location = expr.location}); - } - - template<> - void handle(std::vector& ctx, std::vector& errors, ast::unary_algebraic_expression& expr) { - handle(ctx, errors, *expr.content); - ctx.push_back(ByteCodeInterpreter::operand{.element = mappings.at(expr.op), .location = expr.location}); - } - - template<> - void handle(std::vector& ctx, std::vector& errors, ast::variable_expression& expr) { - ctx.push_back(ByteCodeInterpreter::operand{ByteCodeInterpreter::variable_tag{.name = expr.name.value, .location = expr.location}}); - } - - template<> - void handle(std::vector& ctx, std::vector& errors, ast::literal_int_expression& expr) { - ctx.push_back(ByteCodeInterpreter::operand{script_value{expr.value}}); - } - - template<> - void handle(std::vector& ctx, std::vector& errors, ast::literal_string_expression& expr) { - ctx.push_back(ByteCodeInterpreter::operand{script_value{expr.value}}); - } - - template<> - void handle(std::vector& ctx, std::vector& errors, ast::conditional& cond) { - /// some basic documentation (from before the reference stability bug but things are the same): - /// https://app.excalidraw.com/s/hxPegpAmTX/2c8KKzinqeg - std::visit([&](auto& v) {handle(ctx, errors, *v);}, cond.condition->contents); - ctx.push_back(ByteCodeInterpreter::operand{.element = script_value{}, .location = cond.location}); - /// As you can see, being smart is dumb, be a fucking monkey that comes from the 70s and use 70s technology:tm: to your advantage - /// More seriously, WTF (?) we do this because we used to have a bug with unreliable references to these locations, which makes sense since we - /// don't have reference stability - auto else_side_idx = ctx.size()-1; - ctx.push_back(ByteCodeInterpreter::operand{.element = ByteCodeInterpreter::operator_t::INTERNAL_jump_if, .location = cond.location}); - handle(ctx, errors, *cond.on_condition); - if(cond.otherwise) { - ctx.push_back(ByteCodeInterpreter::operand{.element = script_value{}, .location = cond.location}); - auto end_side_idx = ctx.size()-1; - ctx.push_back(ByteCodeInterpreter::operand{.element = ByteCodeInterpreter::operator_t::INTERNAL_jump, .location = cond.location}); - ctx[else_side_idx].element = static_cast(ctx.size())-1; - ctx[else_side_idx].location = cond.location; - handle(ctx, errors, *cond.otherwise); - ctx[end_side_idx].element = static_cast(ctx.size())-1; - ctx[end_side_idx].location = cond.location; - } else { - ctx[else_side_idx].element = static_cast(ctx.size())-1; - ctx[else_side_idx].location = cond.location; - } - } - - template<> - void handle(std::vector& ctx, std::vector& errors, ast::while_loop& cond) { - auto beforewhile_side_idx = static_cast(ctx.size())-1; - std::visit([&](auto& v) {handle(ctx, errors, *v);}, cond.condition->contents); - ctx.push_back(ByteCodeInterpreter::operand{.element = script_value{}, .location = cond.location}); - auto endwhile_side_idx = ctx.size()-1; - ctx.push_back(ByteCodeInterpreter::operand{.element = ByteCodeInterpreter::operator_t::INTERNAL_jump_if, .location = cond.location}); - handle(ctx, errors, *cond.on_condition); - ctx.push_back(ByteCodeInterpreter::operand{.element = script_value{beforewhile_side_idx}, .location = cond.location}); - ctx.push_back(ByteCodeInterpreter::operand{.element = ByteCodeInterpreter::operator_t::INTERNAL_jump, .location = cond.location}); - ctx[endwhile_side_idx].element = static_cast(ctx.size())-1; - ctx[endwhile_side_idx].location = cond.location; - } - - std::vector ByteCodeInterpreter::generate(std::vector& errors, ast::block &tree, bool loop) { - std::vector code; - - handle(code, errors, tree); - if(loop) { - // Here we have to deal with the quirks of jumping before the increments happens again - code.push_back(ByteCodeInterpreter::operand{.element = script_value{-1}, .location = tree.location}); - code.push_back(ByteCodeInterpreter::operand{.element = ByteCodeInterpreter::operator_t::INTERNAL_jump, .location = tree.location}); - } - return code; - } - std::unique_ptr prepare_interpreter(const std::string& code) { auto script = std::make_unique(); script->prepare(code); @@ -1162,7 +852,7 @@ namespace scripting { [&](auto &instruction_target) { error = script_error{ op.location, - "JumpIf to invalid location "// + std::to_string(holds_alternative(instruction_target)) + "JumpIf to invalid location " }; } ); diff --git a/src/lex.cpp b/src/lex.cpp new file mode 100644 index 0000000..b681cfc --- /dev/null +++ b/src/lex.cpp @@ -0,0 +1,381 @@ +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include "UserScript/parser.h" +#include "UserScript.h" + +///////////////// +/// CONSTANTS /// +///////////////// + +using symbol_t = scripting::ast::symbol_t; + +constexpr std::array, 25> operators { + std::pair{"(", symbol_t::l_paren}, + std::pair{")", symbol_t::r_paren}, + std::pair{"!=", symbol_t::different}, + std::pair{"!", symbol_t::logical_not}, + std::pair{"~", symbol_t::binary_not}, + std::pair{"/", symbol_t::divide}, + std::pair{"%", symbol_t::modulo}, + std::pair{"*", symbol_t::multiply}, + std::pair{"-", symbol_t::subtract}, + std::pair{"+", symbol_t::add}, + std::pair{"<<<", symbol_t::rotate_left}, + std::pair{">>>", symbol_t::rotate_right}, + std::pair{"<<", symbol_t::bitshift_left}, + std::pair{">>", symbol_t::bitshift_right}, + std::pair{"<=", symbol_t::less_or_equal_than}, + std::pair{">=", symbol_t::greater_or_equal_than}, + std::pair{"<", symbol_t::less_than}, + std::pair{">", symbol_t::greater_than}, + std::pair{"==", symbol_t::equals}, + std::pair{"&&", symbol_t::logical_and}, + std::pair{"&", symbol_t::binary_and}, + std::pair{"||", symbol_t::logical_or}, + std::pair{"|", symbol_t::binary_or}, + std::pair{"^", symbol_t::binary_xor}, + std::pair{"\n", symbol_t::new_line} +}; + +const std::vector reserved_character_sequences { + "(", + ")", + "!=", + "!", + "~", + "/", + "%", + "*", + "-", + "+", + "<<<", + ">>>", + "<<", + ">>", + "<=", + ">=", + "<", + ">", + "==", + "&&", + "&", + "||", + "|", + "^", + "=", + "\n" +}; + +using token = scripting::ast::token; + +///////////////////// +/// LEXER HELPERS /// +///////////////////// + +struct lex_token_result { + token tok; + std::string_view rest; +}; + +struct rune_ref { + std::string_view str; + explicit operator uint32_t() const { + if(str.empty()) return 0; + if(str.size() == 1) return str[0]; + auto bytes = 8 - (str.size() + 1); + uint32_t rune = static_cast(str[0]) & (1 << (bytes - 1)); + for(auto c : str.substr(1)) { + rune <<= 6; + rune ^= static_cast(c) & 0b00111111; + } + return rune; + } + + [[nodiscard]] bool is_space() const { + constexpr std::array spaces{ + 0x0020, 0x00A0, 0x1680, 0x180E, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, + 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202F, 0x2002, 0x205F, 0x3000 + }; + + return std::find(spaces.begin(), spaces.end(), static_cast(*this)) != spaces.end(); + } +}; + +struct try_rune_result { + rune_ref rune; + std::string_view rest; +}; + +std::shared_ptr get_loc(std::string_view original, std::string_view rest, std::shared_ptr last_line) { + // TODO: Check everything again for weird ass cases + if(original.empty()) { + return std::make_shared(scripting::code_location{ + .line_contents = std::make_shared(), + .line_number = (int32_t)std::clamp(1, 1, std::numeric_limits::max()), + .column_number = (int32_t)std::clamp(1 + 1, 1, std::numeric_limits::max()) + }); + } + const auto before = original.substr(0, original.size() - rest.size()); + const auto line_no = std::ranges::count(before, '\n') + 1; + const auto line_start = std::find(before.crbegin(), before.crend(), '\n'); + const auto column_no = line_start != before.crend() ? (line_start - before.crbegin()) : before.size(); + const auto back_tracked = before.size() - column_no; + const auto front_tracked = rest.empty() ? original.size() : before.size() + (std::ranges::find(rest, '\n') - rest.begin()); + const std::string_view current{original.begin() + back_tracked, original.begin() + front_tracked}; + + if(not last_line || *last_line != current) { + last_line = std::make_shared(current); + } + + return std::make_shared(scripting::code_location{ + .line_contents = last_line, + .line_number = (int32_t)std::clamp(line_no, 1, std::numeric_limits::max()), + .column_number = (int32_t)std::clamp(column_no + 1, 1, std::numeric_limits::max()) + }); +} + +//////////////////// +/// LEXER PROPER /// +//////////////////// + +auto try_rune(std::string_view text, std::shared_ptr& location, std::vector& errors) -> std::optional { + static_assert(CHAR_BIT == 8, "Get your weird ass cpu outta here"); + + if(text.empty()) return std::nullopt; + + if(0 == (*reinterpret_cast(&text.front()) & 0b10000000)) { + return try_rune_result{text.substr(0, 1), text.substr(1)}; + } + + switch(auto bytes = std::countl_one(*reinterpret_cast(&text.front())); bytes) { + case 0: // ASCII + { + return try_rune_result{text.substr(0, 1), text.substr(1)}; + } + case 1: // Middle of sequence + { + return std::nullopt; + } + case 7: [[fallthrough]]; + case 8: // Invalid sequence start + { + return std::nullopt; + } + default: // Maybe it is valid + { + if(text.size() < bytes) { // Nope, too short to get a full rune + errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 found when parsing identifier"}); + return std::nullopt; + } + auto rune = text.substr(0,bytes); + + // Check if the rest of the rune is valid + if(std::ranges::any_of(rune.substr(1), [](const char& byte){ return std::countl_one(*reinterpret_cast(&byte)) != 1;})) { + errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 found when parsing identifier"}); + return std::nullopt; + } + return try_rune_result{rune, text.substr(bytes)}; + } + } +} +constexpr auto try_string = [](std::string_view view, std::shared_ptr& location, std::vector& errors) -> std::optional { + constexpr std::array hexdigits = { + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + +0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, + -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 + }; + auto it = view.begin(); + while (it != view.end() and std::isspace(*it)) ++it; + if(it == view.end()) return std::nullopt; + std::stringstream generated; + if(*it != '"') return std::nullopt; + std::string str; + while(true) { + ++it; + if(it == view.end()) { + errors.push_back(scripting::script_error{.location = location, .message = "Unterminated string"}); + return std::nullopt; + } + switch(*it) { + case '\\': + ++it; + if(it == view.end()) { + errors.push_back(scripting::script_error{.location = location, .message = "Unterminated string"}); + } + switch(*it) { + case '\\': generated << '\\'; break; + case 'a': generated << '\a'; break; + case 'b': generated << '\b'; break; + case 'f': generated << '\f'; break; + case 'n': generated << '\n'; break; + case 'r': generated << '\r'; break; + case 't': generated << '\t'; break; + case 'v': generated << '\v'; break; + case '\'': generated << '\''; break; + case '"': generated << '"'; break; + case '0': [[fallthrough]]; + case '1': [[fallthrough]]; + case '2': [[fallthrough]]; + case '3': [[fallthrough]]; + case '4': [[fallthrough]]; + case '5': [[fallthrough]]; + case '6': [[fallthrough]]; + case '7': + { + char c = uint8_t(*it - '0') * 8 * 8; + if(uint8_t(*it - '0') > 8) { + errors.push_back(scripting::script_error{.location = location, .message = "Bad octal value in string"}); + } + ++it; + if(it == view.end()) return std::nullopt; + c += uint8_t(*it - '0') * 8; + if(uint8_t(*it - '0') > 8) { + errors.push_back(scripting::script_error{.location = location, .message = "Bad octal value in string"}); + } + ++it; + if(it == view.end()) return std::nullopt; + c += uint8_t(*it - '0'); + if(uint8_t(*it - '0') > 8) { + errors.push_back(scripting::script_error{.location = location, .message = "Bad octal value in string"}); + } + generated << c; + break; } + case 'x': + { + ++it; + if(it == view.end()) return std::nullopt; + if(hexdigits[*it] < 0) return std::nullopt; + char c = hexdigits[*it] << 4; + ++it; + if(it == view.end()) return std::nullopt; + if(hexdigits[*it] < 0) return std::nullopt; + c += hexdigits[*it]; + generated << c; + break; } + default: + generated << *it; + } + break; + case '"': + str = generated.str(); + return lex_token_result { + token{.location = location, .value = std::string(str)}, + std::string_view(++it, view.end()) + }; + default: + generated << *it; + break; + } + } +}; +constexpr auto try_int32 = [](std::string_view view, std::shared_ptr& location, std::vector& errors) -> std::optional { + int32_t i; + auto v = std::from_chars(view.begin(), view.end(), i); + if(v.ptr == view.begin()) return std::nullopt; + auto rest = std::string_view(v.ptr, view.end()); + return lex_token_result{ + token{.location = std::move(location), .value = i}, + rest + }; +}; +std::optional try_operator(std::string_view code, std::shared_ptr& location, std::vector& errors) { + for(auto& [representation, type] : operators) { + if(code.starts_with(representation)) { + return lex_token_result{ + token{.location = location, .value = type}, + code.substr(representation.size()) + }; + } + } + return std::nullopt; +} +auto try_identifier(std::string_view view, std::shared_ptr& location, std::vector& errors) -> std::optional { + constexpr auto starts_with_reserved = [](std::string_view v) -> bool { + return std::ranges::any_of(reserved_character_sequences, [&](auto seq){ + return v.starts_with(seq); + }); + }; + + std::stringstream identifier_value; + + if(view.empty()) return std::nullopt; + while(!view.empty() && !starts_with_reserved(view)) { + if(auto maybe_rune = try_rune(view, location, errors); maybe_rune) { + auto [rune, rest] = maybe_rune.value(); + if(rune.is_space()) { + view = rest; + break; + } + identifier_value << rune.str; + view = rest; + } else { + errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 found when parsing identifier"}); + return std::nullopt; + } + } + + scripting::ast::identifier result {.location = location, .value = identifier_value.str()}; + + if(result.value.empty()) return std::nullopt; + + return lex_token_result{.tok = token{.location = location, .value = result}, .rest = view}; +} + +std::vector scripting::ast::lex(const std::string& code, std::vector& errors) { + std::vector return_value; + std::string_view current = code; + std::shared_ptr last_line; + + while(not current.empty()) { + for(;;) { + if(current.empty()) break; + auto location = get_loc(code, current, last_line); + auto c = try_rune(current, location, errors); + if(not c.has_value()) { + errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 encoding detected while trimming space"}); + return return_value; + } else { + if(c.value().rune.is_space()) { + current = c.value().rest; + } else break; + } + } + + auto location = get_loc(code, current, last_line); + last_line = location->line_contents; + auto res = try_string(current, location, errors); + if (!res) res = try_operator(current, location, errors); + if (!res) res = try_int32(current, location, errors); + if (!res) res = try_identifier(current, location, errors); + if(res.has_value()) { + current = res.value().rest; + return_value.emplace_back(std::move(res.value().tok)); + } else { + errors.push_back(scripting::script_error{.location = location, .message = "Unknown token"}); + return return_value; + } + } + + return return_value; +} + diff --git a/src/lex_parse.cpp b/src/parse.cpp similarity index 66% rename from src/lex_parse.cpp rename to src/parse.cpp index efb6546..86c31c6 100644 --- a/src/lex_parse.cpp +++ b/src/parse.cpp @@ -10,379 +10,13 @@ #include "UserScript/parser.h" #include "UserScript.h" -///////////////// -/// CONSTANTS /// -///////////////// - -using symbol_t = scripting::ast::symbol_t; - -constexpr std::array, 25> operators { - std::pair{"(", symbol_t::l_paren}, - std::pair{")", symbol_t::r_paren}, - std::pair{"!=", symbol_t::different}, - std::pair{"!", symbol_t::logical_not}, - std::pair{"~", symbol_t::binary_not}, - std::pair{"/", symbol_t::divide}, - std::pair{"%", symbol_t::modulo}, - std::pair{"*", symbol_t::multiply}, - std::pair{"-", symbol_t::subtract}, - std::pair{"+", symbol_t::add}, - std::pair{"<<<", symbol_t::rotate_left}, - std::pair{">>>", symbol_t::rotate_right}, - std::pair{"<<", symbol_t::bitshift_left}, - std::pair{">>", symbol_t::bitshift_right}, - std::pair{"<=", symbol_t::less_or_equal_than}, - std::pair{">=", symbol_t::greater_or_equal_than}, - std::pair{"<", symbol_t::less_than}, - std::pair{">", symbol_t::greater_than}, - std::pair{"==", symbol_t::equals}, - std::pair{"&&", symbol_t::logical_and}, - std::pair{"&", symbol_t::binary_and}, - std::pair{"||", symbol_t::logical_or}, - std::pair{"|", symbol_t::binary_or}, - std::pair{"^", symbol_t::binary_xor}, - std::pair{"\n", symbol_t::new_line} -}; - -const std::vector reserved_character_sequences { - "(", - ")", - "!=", - "!", - "~", - "/", - "%", - "*", - "-", - "+", - "<<<", - ">>>", - "<<", - ">>", - "<=", - ">=", - "<", - ">", - "==", - "&&", - "&", - "||", - "|", - "^", - "=", - "\n" -}; - -///////////////////// -/// LEXER HELPERS /// -///////////////////// - -using token = scripting::ast::token; - -struct lex_token_result { - token tok; - std::string_view rest; -}; - -struct rune_ref { - std::string_view str; - explicit operator uint32_t() const { - if(str.empty()) return 0; - if(str.size() == 1) return str[0]; - auto bytes = 8 - (str.size() + 1); - uint32_t rune = static_cast(str[0]) & (1 << (bytes - 1)); - for(auto c : str.substr(1)) { - rune <<= 6; - rune ^= static_cast(c) & 0b00111111; - } - return rune; - } - - [[nodiscard]] bool is_space() const { - constexpr std::array spaces{ - 0x0020, 0x00A0, 0x1680, 0x180E, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, - 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202F, 0x2002, 0x205F, 0x3000 - }; - - return std::find(spaces.begin(), spaces.end(), static_cast(*this)) != spaces.end(); - } -}; - -struct try_rune_result { - rune_ref rune; - std::string_view rest; -}; - -std::shared_ptr get_loc(std::string_view original, std::string_view rest, std::shared_ptr last_line) { - // TODO: Check everything again for weird ass cases - if(original.empty()) { - return std::make_shared(scripting::code_location{ - .line_contents = std::make_shared(), - .line_number = (int32_t)std::clamp(1, 1, std::numeric_limits::max()), - .column_number = (int32_t)std::clamp(1 + 1, 1, std::numeric_limits::max()) - }); - } - const auto before = original.substr(0, original.size() - rest.size()); - const auto line_no = std::ranges::count(before, '\n') + 1; - const auto line_start = std::find(before.crbegin(), before.crend(), '\n'); - const auto column_no = line_start != before.crend() ? (line_start - before.crbegin()) : before.size(); - const auto back_tracked = before.size() - column_no; - const auto front_tracked = rest.empty() ? original.size() : before.size() + (std::ranges::find(rest, '\n') - rest.begin()); - const std::string_view current{original.begin() + back_tracked, original.begin() + front_tracked}; - - if(not last_line || *last_line != current) { - last_line = std::make_shared(current); - } - - return std::make_shared(scripting::code_location{ - .line_contents = last_line, - .line_number = (int32_t)std::clamp(line_no, 1, std::numeric_limits::max()), - .column_number = (int32_t)std::clamp(column_no + 1, 1, std::numeric_limits::max()) - }); -} - -//////////////////// -/// LEXER PROPER /// -//////////////////// - -auto try_rune(std::string_view text, std::shared_ptr& location, std::vector& errors) -> std::optional { - static_assert(CHAR_BIT == 8, "Get your weird ass cpu outta here"); - - if(text.empty()) return std::nullopt; - - if(0 == (*reinterpret_cast(&text.front()) & 0b10000000)) { - return try_rune_result{text.substr(0, 1), text.substr(1)}; - } - - switch(auto bytes = std::countl_one(*reinterpret_cast(&text.front())); bytes) { - case 0: // ASCII - { - return try_rune_result{text.substr(0, 1), text.substr(1)}; - } - case 1: // Middle of sequence - { - return std::nullopt; - } - case 7: [[fallthrough]]; - case 8: // Invalid sequence start - { - return std::nullopt; - } - default: // Maybe it is valid - { - if(text.size() < bytes) { // Nope, too short to get a full rune - errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 found when parsing identifier"}); - return std::nullopt; - } - auto rune = text.substr(0,bytes); - - // Check if the rest of the rune is valid - if(std::ranges::any_of(rune.substr(1), [](const char& byte){ return std::countl_one(*reinterpret_cast(&byte)) != 1;})) { - errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 found when parsing identifier"}); - return std::nullopt; - } - return try_rune_result{rune, text.substr(bytes)}; - } - } -} -constexpr auto try_string = [](std::string_view view, std::shared_ptr& location, std::vector& errors) -> std::optional { - constexpr std::array hexdigits = { - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - +0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, - -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, - -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 - }; - auto it = view.begin(); - while (it != view.end() and std::isspace(*it)) ++it; - if(it == view.end()) return std::nullopt; - std::stringstream generated; - if(*it != '"') return std::nullopt; - std::string str; - while(true) { - ++it; - if(it == view.end()) { - errors.push_back(scripting::script_error{.location = location, .message = "Unterminated string"}); - return std::nullopt; - } - switch(*it) { - case '\\': - ++it; - if(it == view.end()) { - errors.push_back(scripting::script_error{.location = location, .message = "Unterminated string"}); - } - switch(*it) { - case '\\': generated << '\\'; break; - case 'a': generated << '\a'; break; - case 'b': generated << '\b'; break; - case 'f': generated << '\f'; break; - case 'n': generated << '\n'; break; - case 'r': generated << '\r'; break; - case 't': generated << '\t'; break; - case 'v': generated << '\v'; break; - case '\'': generated << '\''; break; - case '"': generated << '"'; break; - case '0': [[fallthrough]]; - case '1': [[fallthrough]]; - case '2': [[fallthrough]]; - case '3': [[fallthrough]]; - case '4': [[fallthrough]]; - case '5': [[fallthrough]]; - case '6': [[fallthrough]]; - case '7': - { - char c = uint8_t(*it - '0') * 8 * 8; - if(uint8_t(*it - '0') > 8) { - errors.push_back(scripting::script_error{.location = location, .message = "Bad octal value in string"}); - } - ++it; - if(it == view.end()) return std::nullopt; - c += uint8_t(*it - '0') * 8; - if(uint8_t(*it - '0') > 8) { - errors.push_back(scripting::script_error{.location = location, .message = "Bad octal value in string"}); - } - ++it; - if(it == view.end()) return std::nullopt; - c += uint8_t(*it - '0'); - if(uint8_t(*it - '0') > 8) { - errors.push_back(scripting::script_error{.location = location, .message = "Bad octal value in string"}); - } - generated << c; - break; } - case 'x': - { - ++it; - if(it == view.end()) return std::nullopt; - if(hexdigits[*it] < 0) return std::nullopt; - char c = hexdigits[*it] << 4; - ++it; - if(it == view.end()) return std::nullopt; - if(hexdigits[*it] < 0) return std::nullopt; - c += hexdigits[*it]; - generated << c; - break; } - default: - generated << *it; - } - break; - case '"': - str = generated.str(); - return lex_token_result { - token{.location = location, .value = std::string(str)}, - std::string_view(++it, view.end()) - }; - default: - generated << *it; - break; - } - } -}; -constexpr auto try_int32 = [](std::string_view view, std::shared_ptr& location, std::vector& errors) -> std::optional { - int32_t i; - auto v = std::from_chars(view.begin(), view.end(), i); - if(v.ptr == view.begin()) return std::nullopt; - auto rest = std::string_view(v.ptr, view.end()); - return lex_token_result{ - token{.location = std::move(location), .value = i}, - rest - }; -}; -std::optional try_operator(std::string_view code, std::shared_ptr& location, std::vector& errors) { - for(auto& [representation, type] : operators) { - if(code.starts_with(representation)) { - return lex_token_result{ - token{.location = location, .value = type}, - code.substr(representation.size()) - }; - } - } - return std::nullopt; -} -auto try_identifier(std::string_view view, std::shared_ptr& location, std::vector& errors) -> std::optional { - constexpr auto starts_with_reserved = [](std::string_view v) -> bool { - return std::ranges::any_of(reserved_character_sequences, [&](auto seq){ - return v.starts_with(seq); - }); - }; - - std::stringstream identifier_value; - - if(view.empty()) return std::nullopt; - while(!view.empty() && !starts_with_reserved(view)) { - if(auto maybe_rune = try_rune(view, location, errors); maybe_rune) { - auto [rune, rest] = maybe_rune.value(); - if(rune.is_space()) { - view = rest; - break; - } - identifier_value << rune.str; - view = rest; - } else { - errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 found when parsing identifier"}); - return std::nullopt; - } - } - - scripting::ast::identifier result {.location = location, .value = identifier_value.str()}; - - if(result.value.empty()) return std::nullopt; - - return lex_token_result{.tok = token{.location = location, .value = result}, .rest = view}; -} - -std::vector scripting::ast::lex(const std::string& code, std::vector& errors) { - std::vector return_value; - std::string_view current = code; - std::shared_ptr last_line; - - while(not current.empty()) { - for(;;) { - if(current.empty()) break; - auto location = get_loc(code, current, last_line); - auto c = try_rune(current, location, errors); - if(not c.has_value()) { - errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 encoding detected while trimming space"}); - return return_value; - } else { - if(c.value().rune.is_space()) { - current = c.value().rest; - } else break; - } - } - - auto location = get_loc(code, current, last_line); - last_line = location->line_contents; - auto res = try_string(current, location, errors); - if (!res) res = try_operator(current, location, errors); - if (!res) res = try_int32(current, location, errors); - if (!res) res = try_identifier(current, location, errors); - if(res.has_value()) { - current = res.value().rest; - return_value.emplace_back(std::move(res.value().tok)); - } else { - errors.push_back(scripting::script_error{.location = location, .message = "Unknown token"}); - return return_value; - } - } - - return return_value; -} - ////////////////////// /// PARSER HELPERS /// ////////////////////// +using token = scripting::ast::token; +using symbol_t = scripting::ast::symbol_t; + template struct parse_result { std::optional result; @@ -454,7 +88,7 @@ parse_result try_command_expr(std::span(current.front().value) and get(current.front().value) == symbol_t::r_paren ) - ) { + ) { auto [expr, rest] = try_expression(current, errors); if(not expr) { @@ -473,7 +107,7 @@ parse_result try_command_expr(std::span try_expression(std::span code, std::vector& errors) { scripting::ast::expression node; auto current = code; - + #ifdef HANDLE_EXPRESSION static_assert(false, "Found a macro name HANDLE_EXPRESSION, halting"); #endif @@ -896,7 +530,7 @@ parse_result try_binary_algebraic_e scripting::ast::binary_algebraic_expression node; auto current = code; - + #ifdef HANDLE_EXPRESSION static_assert(false, "Found a macro name HANDLE_EXPRESSION, halting"); #endif