#include #include #include #include #include #include #include #include #include #include "UserScript/parser.h" #include "UserScript.h" ///////////////// /// CONSTANTS /// ///////////////// using symbol_t = scripting::ast::symbol_t; constexpr std::array, 25> operators { std::pair{"(", symbol_t::l_paren}, std::pair{")", symbol_t::r_paren}, std::pair{"!=", symbol_t::different}, std::pair{"!", symbol_t::logical_not}, std::pair{"~", symbol_t::binary_not}, std::pair{"/", symbol_t::divide}, std::pair{"%", symbol_t::modulo}, std::pair{"*", symbol_t::multiply}, std::pair{"-", symbol_t::subtract}, std::pair{"+", symbol_t::add}, std::pair{"<<<", symbol_t::rotate_left}, std::pair{">>>", symbol_t::rotate_right}, std::pair{"<<", symbol_t::bitshift_left}, std::pair{">>", symbol_t::bitshift_right}, std::pair{"<=", symbol_t::less_or_equal_than}, std::pair{">=", symbol_t::greater_or_equal_than}, std::pair{"<", symbol_t::less_than}, std::pair{">", symbol_t::greater_than}, std::pair{"==", symbol_t::equals}, std::pair{"&&", symbol_t::logical_and}, std::pair{"&", symbol_t::binary_and}, std::pair{"||", symbol_t::logical_or}, std::pair{"|", symbol_t::binary_or}, std::pair{"^", symbol_t::binary_xor}, std::pair{"\n", symbol_t::new_line} }; const std::vector reserved_character_sequences { "(", ")", "!=", "!", "~", "/", "%", "*", "-", "+", "<<<", ">>>", "<<", ">>", "<=", ">=", "<", ">", "==", "&&", "&", "||", "|", "^", "=", "\n" }; using token = scripting::ast::token; ///////////////////// /// LEXER HELPERS /// ///////////////////// struct lex_token_result { token tok; std::string_view rest; }; struct rune_ref { std::string_view str; explicit operator uint32_t() const { if(str.empty()) return 0; if(str.size() == 1) return str[0]; auto bytes = 8 - (str.size() + 1); uint32_t rune = static_cast(str[0]) & (1 << (bytes - 1)); for(auto c : str.substr(1)) { rune <<= 6; rune ^= static_cast(c) & 0b00111111; } return rune; } [[nodiscard]] bool is_space() const { constexpr std::array spaces{ 0x0020, 0x00A0, 0x1680, 0x180E, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004, 0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202F, 0x2002, 0x205F, 0x3000 }; return std::find(spaces.begin(), spaces.end(), static_cast(*this)) != spaces.end(); } }; struct try_rune_result { rune_ref rune; std::string_view rest; }; std::shared_ptr get_loc(std::string_view original, std::string_view rest, std::shared_ptr last_line) { // TODO: Check everything again for weird ass cases if(original.empty()) { return std::make_shared(scripting::code_location{ .line_contents = std::make_shared(), .line_number = (int32_t)std::clamp(1, 1, std::numeric_limits::max()), .column_number = (int32_t)std::clamp(1 + 1, 1, std::numeric_limits::max()) }); } const auto before = original.substr(0, original.size() - rest.size()); const auto line_no = std::ranges::count(before, '\n') + 1; const auto line_start = std::find(before.crbegin(), before.crend(), '\n'); const auto column_no = line_start != before.crend() ? (line_start - before.crbegin()) : before.size(); const auto back_tracked = before.size() - column_no; const auto front_tracked = rest.empty() ? original.size() : before.size() + (std::ranges::find(rest, '\n') - rest.begin()); const std::string_view current{original.begin() + back_tracked, original.begin() + front_tracked}; if(not last_line || *last_line != current) { last_line = std::make_shared(current); } return std::make_shared(scripting::code_location{ .line_contents = last_line, .line_number = (int32_t)std::clamp(line_no, 1, std::numeric_limits::max()), .column_number = (int32_t)std::clamp(column_no + 1, 1, std::numeric_limits::max()) }); } //////////////////// /// LEXER PROPER /// //////////////////// auto try_rune(std::string_view text, std::shared_ptr& location, std::vector& errors) -> std::optional { static_assert(CHAR_BIT == 8, "Get your weird ass cpu outta here"); if(text.empty()) return std::nullopt; if(0 == (*reinterpret_cast(&text.front()) & 0b10000000)) { return try_rune_result{text.substr(0, 1), text.substr(1)}; } switch(auto bytes = std::countl_one(*reinterpret_cast(&text.front())); bytes) { case 0: // ASCII { return try_rune_result{text.substr(0, 1), text.substr(1)}; } case 1: // Middle of sequence { return std::nullopt; } case 7: [[fallthrough]]; case 8: // Invalid sequence start { return std::nullopt; } default: // Maybe it is valid { if(text.size() < bytes) { // Nope, too short to get a full rune errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 found when parsing identifier"}); return std::nullopt; } auto rune = text.substr(0,bytes); // Check if the rest of the rune is valid if(std::ranges::any_of(rune.substr(1), [](const char& byte){ return std::countl_one(*reinterpret_cast(&byte)) != 1;})) { errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 found when parsing identifier"}); return std::nullopt; } return try_rune_result{rune, text.substr(bytes)}; } } } constexpr auto try_string = [](std::string_view view, std::shared_ptr& location, std::vector& errors) -> std::optional { constexpr std::array hexdigits = { -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, +0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }; auto it = view.begin(); while (it != view.end() and std::isspace(*it)) ++it; if(it == view.end()) return std::nullopt; std::stringstream generated; if(*it != '"') return std::nullopt; std::string str; while(true) { ++it; if(it == view.end()) { errors.push_back(scripting::script_error{.location = location, .message = "Unterminated string"}); return std::nullopt; } switch(*it) { case '\\': ++it; if(it == view.end()) { errors.push_back(scripting::script_error{.location = location, .message = "Unterminated string"}); } switch(*it) { case '\\': generated << '\\'; break; case 'a': generated << '\a'; break; case 'b': generated << '\b'; break; case 'f': generated << '\f'; break; case 'n': generated << '\n'; break; case 'r': generated << '\r'; break; case 't': generated << '\t'; break; case 'v': generated << '\v'; break; case '\'': generated << '\''; break; case '"': generated << '"'; break; case '0': [[fallthrough]]; case '1': [[fallthrough]]; case '2': [[fallthrough]]; case '3': [[fallthrough]]; case '4': [[fallthrough]]; case '5': [[fallthrough]]; case '6': [[fallthrough]]; case '7': { char c = uint8_t(*it - '0') * 8 * 8; if(uint8_t(*it - '0') > 8) { errors.push_back(scripting::script_error{.location = location, .message = "Bad octal value in string"}); } ++it; if(it == view.end()) return std::nullopt; c += uint8_t(*it - '0') * 8; if(uint8_t(*it - '0') > 8) { errors.push_back(scripting::script_error{.location = location, .message = "Bad octal value in string"}); } ++it; if(it == view.end()) return std::nullopt; c += uint8_t(*it - '0'); if(uint8_t(*it - '0') > 8) { errors.push_back(scripting::script_error{.location = location, .message = "Bad octal value in string"}); } generated << c; break; } case 'x': { ++it; if(it == view.end()) return std::nullopt; if(hexdigits[*it] < 0) return std::nullopt; char c = hexdigits[*it] << 4; ++it; if(it == view.end()) return std::nullopt; if(hexdigits[*it] < 0) return std::nullopt; c += hexdigits[*it]; generated << c; break; } default: generated << *it; } break; case '"': str = generated.str(); return lex_token_result { token{.location = location, .value = std::string(str)}, std::string_view(++it, view.end()) }; default: generated << *it; break; } } }; constexpr auto try_int32 = [](std::string_view view, std::shared_ptr& location, std::vector& errors) -> std::optional { int32_t i; auto v = std::from_chars(view.begin(), view.end(), i); if(v.ptr == view.begin()) return std::nullopt; auto rest = std::string_view(v.ptr, view.end()); return lex_token_result{ token{.location = std::move(location), .value = i}, rest }; }; std::optional try_operator(std::string_view code, std::shared_ptr& location, std::vector& errors) { for(auto& [representation, type] : operators) { if(code.starts_with(representation)) { return lex_token_result{ token{.location = location, .value = type}, code.substr(representation.size()) }; } } return std::nullopt; } auto try_identifier(std::string_view view, std::shared_ptr& location, std::vector& errors) -> std::optional { constexpr auto starts_with_reserved = [](std::string_view v) -> bool { return std::ranges::any_of(reserved_character_sequences, [&](auto seq){ return v.starts_with(seq); }); }; std::stringstream identifier_value; if(view.empty()) return std::nullopt; while(!view.empty() && !starts_with_reserved(view)) { if(auto maybe_rune = try_rune(view, location, errors); maybe_rune) { auto [rune, rest] = maybe_rune.value(); if(rune.is_space()) { view = rest; break; } identifier_value << rune.str; view = rest; } else { errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 found when parsing identifier"}); return std::nullopt; } } scripting::ast::identifier result {.location = location, .value = identifier_value.str()}; if(result.value.empty()) return std::nullopt; return lex_token_result{.tok = token{.location = location, .value = result}, .rest = view}; } std::vector scripting::ast::lex(const std::string& code, std::vector& errors) { std::vector return_value; std::string_view current = code; std::shared_ptr last_line; while(not current.empty()) { for(;;) { if(current.empty()) break; auto location = get_loc(code, current, last_line); auto c = try_rune(current, location, errors); if(not c.has_value()) { errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 encoding detected while trimming space"}); return return_value; } else { if(c.value().rune.is_space()) { current = c.value().rest; } else break; } } auto location = get_loc(code, current, last_line); last_line = location->line_contents; auto res = try_string(current, location, errors); if (!res) res = try_operator(current, location, errors); if (!res) res = try_int32(current, location, errors); if (!res) res = try_identifier(current, location, errors); if(res.has_value()) { current = res.value().rest; return_value.emplace_back(std::move(res.value().tok)); } else { errors.push_back(scripting::script_error{.location = location, .message = "Unknown token"}); return return_value; } } return return_value; }