|
|
#include <algorithm>
#include <array>
#include <bit>
#include <cctype>
#include <charconv>
#include <climits>
#include <cstdint>
#include <iostream>
#include <limits>
#include <map>
#include <memory>
#include <optional>
#include <sstream>
#include <string>
#include <string_view>
#include <system_error>
#include <utility>
#include <vector>
#include "UserScript/parser.h"
#include "UserScript.h"
-
- /////////////////
- /// CONSTANTS ///
- /////////////////
-
- using symbol_t = scripting::ast::symbol_t;
-
- constexpr std::array<std::pair<std::string_view, symbol_t>, 25> operators {
- std::pair<std::string_view, symbol_t>{"(", symbol_t::l_paren},
- std::pair<std::string_view, symbol_t>{")", symbol_t::r_paren},
- std::pair<std::string_view, symbol_t>{"!=", symbol_t::different},
- std::pair<std::string_view, symbol_t>{"!", symbol_t::logical_not},
- std::pair<std::string_view, symbol_t>{"~", symbol_t::binary_not},
- std::pair<std::string_view, symbol_t>{"/", symbol_t::divide},
- std::pair<std::string_view, symbol_t>{"%", symbol_t::modulo},
- std::pair<std::string_view, symbol_t>{"*", symbol_t::multiply},
- std::pair<std::string_view, symbol_t>{"-", symbol_t::subtract},
- std::pair<std::string_view, symbol_t>{"+", symbol_t::add},
- std::pair<std::string_view, symbol_t>{"<<<", symbol_t::rotate_left},
- std::pair<std::string_view, symbol_t>{">>>", symbol_t::rotate_right},
- std::pair<std::string_view, symbol_t>{"<<", symbol_t::bitshift_left},
- std::pair<std::string_view, symbol_t>{">>", symbol_t::bitshift_right},
- std::pair<std::string_view, symbol_t>{"<=", symbol_t::less_or_equal_than},
- std::pair<std::string_view, symbol_t>{">=", symbol_t::greater_or_equal_than},
- std::pair<std::string_view, symbol_t>{"<", symbol_t::less_than},
- std::pair<std::string_view, symbol_t>{">", symbol_t::greater_than},
- std::pair<std::string_view, symbol_t>{"==", symbol_t::equals},
- std::pair<std::string_view, symbol_t>{"&&", symbol_t::logical_and},
- std::pair<std::string_view, symbol_t>{"&", symbol_t::binary_and},
- std::pair<std::string_view, symbol_t>{"||", symbol_t::logical_or},
- std::pair<std::string_view, symbol_t>{"|", symbol_t::binary_or},
- std::pair<std::string_view, symbol_t>{"^", symbol_t::binary_xor},
- std::pair<std::string_view, symbol_t>{"\n", symbol_t::new_line}
- };
-
/// Character sequences that terminate an identifier.
///
/// try_identifier() stops collecting characters as soon as the remaining input
/// starts with one of these. The list holds every operator spelling plus "=".
///
/// Stored as a constexpr array so the table is a compile-time constant: no heap
/// allocation and no dynamic initialisation at program start-up.
constexpr std::array<std::string_view, 26> reserved_character_sequences {
    "(",
    ")",
    "!=",
    "!",
    "~",
    "/",
    "%",
    "*",
    "-",
    "+",
    "<<<",
    ">>>",
    "<<",
    ">>",
    "<=",
    ">=",
    "<",
    ">",
    "==",
    "&&",
    "&",
    "||",
    "|",
    "^",
    "=",
    "\n"
};
-
- using token = scripting::ast::token;
-
- /////////////////////
- /// LEXER HELPERS ///
- /////////////////////
-
- struct lex_token_result {
- token tok;
- std::string_view rest;
- };
-
/// Non-owning view of the bytes that encode one UTF-8 code point.
struct rune_ref {
    /// Encoded bytes, lead byte first. The sequence length is str.size().
    std::string_view str;

    /// Decodes the bytes into the code point value.
    ///
    /// The bytes are not validated here; the length of the sequence is taken
    /// from str.size(). Returns 0 for an empty view and for views longer than
    /// 6 bytes (no lead byte can announce such a sequence).
    explicit operator uint32_t() const {
        if(str.empty()) return 0;

        // Go through uint8_t so a byte with the high bit set is not sign-extended.
        const auto lead = static_cast<uint8_t>(str[0]);
        if(str.size() == 1) return lead;
        if(str.size() > 6) return 0;

        // The lead byte of an n-byte sequence is n one-bits, a zero bit and then
        // 8 - (n + 1) payload bits: keep ALL of those low bits, not just one.
        const auto payload_bits = 8 - (str.size() + 1);
        uint32_t rune = lead & ((1u << payload_bits) - 1u);

        // Every following byte contributes its low six bits.
        for(const char c : str.substr(1)) {
            rune <<= 6;
            rune |= static_cast<uint8_t>(c) & 0b00111111u;
        }
        return rune;
    }

    /// True when the decoded code point is one of the space characters listed
    /// below. Tab, carriage return and line feed are not in the list.
    [[nodiscard]] bool is_space() const {
        constexpr std::array<uint32_t, 18> spaces{
            0x0020, 0x00A0, 0x1680, 0x180E, 0x2000, 0x2001, 0x2002, 0x2003, 0x2004,
            0x2005, 0x2006, 0x2007, 0x2008, 0x2009, 0x200A, 0x202F, 0x205F, 0x3000
        };

        return std::find(spaces.begin(), spaces.end(), static_cast<uint32_t>(*this)) != spaces.end();
    }
};
-
- struct try_rune_result {
- rune_ref rune;
- std::string_view rest;
- };
-
- std::shared_ptr<scripting::code_location> get_loc(std::string_view original, std::string_view rest, std::shared_ptr<const std::string> last_line) {
- // TODO: Check everything again for weird ass cases
- if(original.empty()) {
- return std::make_shared<scripting::code_location>(scripting::code_location{
- .line_contents = std::make_shared<std::string>(),
- .line_number = (int32_t)std::clamp<size_t>(1, 1, std::numeric_limits<int32_t>::max()),
- .column_number = (int32_t)std::clamp<size_t>(1 + 1, 1, std::numeric_limits<int32_t>::max())
- });
- }
- const auto before = original.substr(0, original.size() - rest.size());
- const auto line_no = std::ranges::count(before, '\n') + 1;
- const auto line_start = std::find(before.crbegin(), before.crend(), '\n');
- const auto column_no = line_start != before.crend() ? (line_start - before.crbegin()) : before.size();
- const auto back_tracked = before.size() - column_no;
- const auto front_tracked = rest.empty() ? original.size() : before.size() + (std::ranges::find(rest, '\n') - rest.begin());
- const std::string_view current{original.begin() + back_tracked, original.begin() + front_tracked};
-
- if(not last_line || *last_line != current) {
- last_line = std::make_shared<std::string>(current);
- }
-
- return std::make_shared<scripting::code_location>(scripting::code_location{
- .line_contents = last_line,
- .line_number = (int32_t)std::clamp<size_t>(line_no, 1, std::numeric_limits<int32_t>::max()),
- .column_number = (int32_t)std::clamp<size_t>(column_no + 1, 1, std::numeric_limits<int32_t>::max())
- });
- }
-
- ////////////////////
- /// LEXER PROPER ///
- ////////////////////
-
- auto try_rune(std::string_view text, std::shared_ptr<scripting::code_location>& location, std::vector<scripting::script_error>& errors) -> std::optional<try_rune_result> {
- static_assert(CHAR_BIT == 8, "Get your weird ass cpu outta here");
-
- if(text.empty()) return std::nullopt;
-
- if(0 == (*reinterpret_cast<const uint8_t*>(&text.front()) & 0b10000000)) {
- return try_rune_result{text.substr(0, 1), text.substr(1)};
- }
-
- switch(auto bytes = std::countl_one(*reinterpret_cast<const uint8_t*>(&text.front())); bytes) {
- case 0: // ASCII
- {
- return try_rune_result{text.substr(0, 1), text.substr(1)};
- }
- case 1: // Middle of sequence
- {
- return std::nullopt;
- }
- case 7: [[fallthrough]];
- case 8: // Invalid sequence start
- {
- return std::nullopt;
- }
- default: // Maybe it is valid
- {
- if(text.size() < bytes) { // Nope, too short to get a full rune
- errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 found when parsing identifier"});
- return std::nullopt;
- }
- auto rune = text.substr(0,bytes);
-
- // Check if the rest of the rune is valid
- if(std::ranges::any_of(rune.substr(1), [](const char& byte){ return std::countl_one(*reinterpret_cast<const uint8_t*>(&byte)) != 1;})) {
- errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 found when parsing identifier"});
- return std::nullopt;
- }
- return try_rune_result{rune, text.substr(bytes)};
- }
- }
- }
- constexpr auto try_string = [](std::string_view view, std::shared_ptr<scripting::code_location>& location, std::vector<scripting::script_error>& errors) -> std::optional<lex_token_result> {
- constexpr std::array<int8_t, 256> hexdigits = {
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- +0, 1, 2, 3, 4, 5, 6, 7, 8, 9, -1, -1, -1, -1, -1, -1,
- -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, 10, 11, 12, 13, 14, 15, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1,
- -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1
- };
- auto it = view.begin();
- while (it != view.end() and std::isspace(*it)) ++it;
- if(it == view.end()) return std::nullopt;
- std::stringstream generated;
- if(*it != '"') return std::nullopt;
- std::string str;
- while(true) {
- ++it;
- if(it == view.end()) {
- errors.push_back(scripting::script_error{.location = location, .message = "Unterminated string"});
- return std::nullopt;
- }
- switch(*it) {
- case '\\':
- ++it;
- if(it == view.end()) {
- errors.push_back(scripting::script_error{.location = location, .message = "Unterminated string"});
- }
- switch(*it) {
- case '\\': generated << '\\'; break;
- case 'a': generated << '\a'; break;
- case 'b': generated << '\b'; break;
- case 'f': generated << '\f'; break;
- case 'n': generated << '\n'; break;
- case 'r': generated << '\r'; break;
- case 't': generated << '\t'; break;
- case 'v': generated << '\v'; break;
- case '\'': generated << '\''; break;
- case '"': generated << '"'; break;
- case '0': [[fallthrough]];
- case '1': [[fallthrough]];
- case '2': [[fallthrough]];
- case '3': [[fallthrough]];
- case '4': [[fallthrough]];
- case '5': [[fallthrough]];
- case '6': [[fallthrough]];
- case '7':
- {
- char c = uint8_t(*it - '0') * 8 * 8;
- if(uint8_t(*it - '0') > 8) {
- errors.push_back(scripting::script_error{.location = location, .message = "Bad octal value in string"});
- }
- ++it;
- if(it == view.end()) return std::nullopt;
- c += uint8_t(*it - '0') * 8;
- if(uint8_t(*it - '0') > 8) {
- errors.push_back(scripting::script_error{.location = location, .message = "Bad octal value in string"});
- }
- ++it;
- if(it == view.end()) return std::nullopt;
- c += uint8_t(*it - '0');
- if(uint8_t(*it - '0') > 8) {
- errors.push_back(scripting::script_error{.location = location, .message = "Bad octal value in string"});
- }
- generated << c;
- break; }
- case 'x':
- {
- ++it;
- if(it == view.end()) return std::nullopt;
- if(hexdigits[*it] < 0) return std::nullopt;
- char c = hexdigits[*it] << 4;
- ++it;
- if(it == view.end()) return std::nullopt;
- if(hexdigits[*it] < 0) return std::nullopt;
- c += hexdigits[*it];
- generated << c;
- break; }
- default:
- generated << *it;
- }
- break;
- case '"':
- str = generated.str();
- return lex_token_result {
- token{.location = location, .value = std::string(str)},
- std::string_view(++it, view.end())
- };
- default:
- generated << *it;
- break;
- }
- }
- };
- constexpr auto try_int32 = [](std::string_view view, std::shared_ptr<scripting::code_location>& location, std::vector<scripting::script_error>& errors) -> std::optional<lex_token_result> {
- int32_t i;
- auto v = std::from_chars(view.begin(), view.end(), i);
- if(v.ptr == view.begin()) return std::nullopt;
- auto rest = std::string_view(v.ptr, view.end());
- return lex_token_result{
- token{.location = std::move(location), .value = i},
- rest
- };
- };
- std::optional<lex_token_result> try_operator(std::string_view code, std::shared_ptr<scripting::code_location>& location, std::vector<scripting::script_error>& errors) {
- for(auto& [representation, type] : operators) {
- if(code.starts_with(representation)) {
- return lex_token_result{
- token{.location = location, .value = type},
- code.substr(representation.size())
- };
- }
- }
- return std::nullopt;
- }
- auto try_identifier(std::string_view view, std::shared_ptr<scripting::code_location>& location, std::vector<scripting::script_error>& errors) -> std::optional<lex_token_result> {
- constexpr auto starts_with_reserved = [](std::string_view v) -> bool {
- return std::ranges::any_of(reserved_character_sequences, [&](auto seq){
- return v.starts_with(seq);
- });
- };
-
- std::stringstream identifier_value;
-
- if(view.empty()) return std::nullopt;
- while(!view.empty() && !starts_with_reserved(view)) {
- if(auto maybe_rune = try_rune(view, location, errors); maybe_rune) {
- auto [rune, rest] = maybe_rune.value();
- if(rune.is_space()) {
- view = rest;
- break;
- }
- identifier_value << rune.str;
- view = rest;
- } else {
- errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 found when parsing identifier"});
- return std::nullopt;
- }
- }
-
- scripting::ast::identifier result {.location = location, .value = identifier_value.str()};
-
- if(result.value.empty()) return std::nullopt;
-
- return lex_token_result{.tok = token{.location = location, .value = result}, .rest = view};
- }
-
- std::vector<token> scripting::ast::lex(const std::string& code, std::vector<scripting::script_error>& errors) {
- std::vector<token> return_value;
- std::string_view current = code;
- std::shared_ptr<const std::string> last_line;
-
- while(not current.empty()) {
- for(;;) {
- if(current.empty()) break;
- auto location = get_loc(code, current, last_line);
- auto c = try_rune(current, location, errors);
- if(not c.has_value()) {
- errors.push_back(scripting::script_error{.location = location, .message = "Bad UTF-8 encoding detected while trimming space"});
- return return_value;
- } else {
- if(c.value().rune.is_space()) {
- current = c.value().rest;
- } else break;
- }
- }
-
- auto location = get_loc(code, current, last_line);
- last_line = location->line_contents;
- auto res = try_string(current, location, errors);
- if (!res) res = try_operator(current, location, errors);
- if (!res) res = try_int32(current, location, errors);
- if (!res) res = try_identifier(current, location, errors);
- if(res.has_value()) {
- current = res.value().rest;
- return_value.emplace_back(std::move(res.value().tok));
- } else {
- errors.push_back(scripting::script_error{.location = location, .message = "Unknown token"});
- return return_value;
- }
- }
-
- return return_value;
- }
-
|