From a45dbee7f1d489730e5077c6b4820210f5fa03f8 Mon Sep 17 00:00:00 2001 From: Ludovic 'Archivist' Lagouardette Date: Wed, 14 Apr 2021 20:26:45 +0200 Subject: [PATCH] Added a grimoire Lexer for GP Courtesy of Enalye for the assistance in understanding their gibberish --- include/gp/dynamic/compiler/lexer.hpp | 957 ++++++++++++++++++++++++++ 1 file changed, 957 insertions(+) create mode 100644 include/gp/dynamic/compiler/lexer.hpp diff --git a/include/gp/dynamic/compiler/lexer.hpp b/include/gp/dynamic/compiler/lexer.hpp new file mode 100644 index 0000000..10aa74d --- /dev/null +++ b/include/gp/dynamic/compiler/lexer.hpp @@ -0,0 +1,957 @@ +#pragma once + +#include "gp/containers/array.hpp" +#include "gp/containers/vector.hpp" +#include "gp/text/ascii.hpp" + +using string = gp::vector; + +#define FOREACH_LEXEME_TYPE \ + LEX(leftBracket) \ + LEX(rightBracket) \ + LEX(leftParenthesis) \ + LEX(rightParenthesis) \ + LEX(leftCurlyBrace) \ + LEX(rightCurlyBrace) \ + LEX(period) \ + LEX(semicolon) \ + LEX(colon) \ + LEX(doubleColon) \ + LEX(comma) \ + LEX(at) \ + LEX(pointer) \ + LEX(as) \ + LEX(try_) \ + LEX(catch_) \ + LEX(raise_) \ + LEX(defer) \ + LEX(assign) \ + LEX(addAssign) \ + LEX(substractAssign) \ + LEX(multiplyAssign) \ + LEX(divideAssign) \ + LEX(concatenateAssign) \ + LEX(remainderAssign) \ + LEX(powerAssign) \ + LEX(plus) \ + LEX(minus) \ + LEX(add) \ + LEX(substract) \ + LEX(multiply) \ + LEX(divide) \ + LEX(concatenate) \ + LEX(remainder) \ + LEX(power) \ + LEX(equal) \ + LEX(doubleEqual) \ + LEX(threeWayComparison) \ + LEX(notEqual) \ + LEX(greaterOrEqual) \ + LEX(greater) \ + LEX(lesserOrEqual) \ + LEX(lesser) \ + LEX(leftShift) \ + LEX(rightShift) \ + LEX(and_) \ + LEX(or_) \ + LEX(xor_) \ + LEX(not_) \ + LEX(increment) \ + LEX(decrement) \ + LEX(identifier) \ + LEX(integer) \ + LEX(float_) \ + LEX(boolean) \ + LEX(string_) \ + LEX(null_) \ + LEX(public_) \ + LEX(main_) \ + LEX(type_) \ + LEX(event_) \ + LEX(class_) \ + LEX(enum_) \ + LEX(template_) \ + LEX(new_) \ + LEX(copy) \ + LEX(send) \ + LEX(receive) \ + LEX(intType) \ + LEX(floatType) \ + LEX(boolType) \ + LEX(stringType) \ + LEX(arrayType) \ + LEX(functionType) \ + LEX(taskType) \ + LEX(chanType) \ + LEX(autoType) \ + LEX(if_) \ + LEX(unless) \ + LEX(else_) \ + LEX(switch_) \ + LEX(select) \ + LEX(case_) \ + LEX(while_) \ + LEX(do_) \ + LEX(until) \ + LEX(for_) \ + LEX(loop) \ + LEX(return_) \ + LEX(self) \ + LEX(kill) \ + LEX(killAll) \ + LEX(yield) \ + LEX(break_) \ + LEX(continue_) + +#define LEX(x) x, +enum class gr_lexeme_type { + FOREACH_LEXEME_TYPE +}; +#undef LEX + +#define LEX(x) case gr_lexeme_type::x: return #x; +constexpr inline const char* to_string(const gr_lexeme_type& value) { + switch(value) { + FOREACH_LEXEME_TYPE + } +} +#undef LEX + +#define LEX(x) +1 +constexpr inline const char* to_pretty_string(const gr_lexeme_type& value) { + constexpr gp::array names = { + "[", "]", "(", ")", "{", "}", ".", ";", ":", "::", ",", "@", "&", "as", + "try", "catch", "raise", "defer", "=", "+=", "-=", "*=", "/=", "~=", + "%=", "^=", "+", "-", "+", "-", "*", "/", "~", "%", "^", "==", "===", + "<=>", "!=", ">=", ">", "<=", "<", "<<", ">>", "and", "or", "xor", "not", "++", + "--", "identifier", "const_int", "const_float", "const_bool", + "const_str", "null", "pub", "main", "type", "event", "class", "enum", + "template", "new", "copy", "send", "receive", "int", "float", "bool", + "string", "array", "func", "task", "chan", "let", "if", "unless", + "else", "switch", "select", "case", "while", "do", "until", "for", "loop", + "return", "self", "kill", "killall", "yield", "break", "continue" + }; + return names[(uint64_t)value]; +} +#undef LEX + +struct gr_lexeme; + +class gr_lexer { +public: + using file_loader_t = gp::function(const gp::vector&)>; + gp::allocator& _allocator; + file_loader_t file_loader; +private: + gp::vector _files_to_import, _files_imported, _lines; + + string _file, _text; + uint64_t _line, _current, _position_of_line, _file_id; + + gp::vector _lexemes; + + char get(ssize_t offset = 0); + bool advance(bool start_from_current = false); + void scan_script(); + void scan_number(); + void scan_string(); + void scan_operator(); + void scan_word(); + void scan_file_path(); + void scan_use(); + string convert_path_to_import(string&); + +public: + + gr_lexer(gp::allocator& alloc, file_loader_t& loader) + : _allocator(alloc) + , file_loader(loader) + , _files_to_import(_allocator) + , _files_imported(_allocator) + , _lines(_allocator) + , _file(_allocator) + , _text(_allocator) + , _lexemes(_allocator) + {} + + const gp::vector& lexemes() { + return _lexemes; + } + + void scan_file(gp::vector& file_name); + + const string& get_line(const gr_lexeme&) const; + const string& get_file(const gr_lexeme&) const; + const string& get_file(const size_t&) const; +}; + +struct gr_lexeme { + const gr_lexer& lexer; + size_t _file_id; + size_t _line, _column, _text_length = 1; + + gr_lexeme_type type; + bool + is_literal, + is_operator, + is_keyword, + is_type; + int ivalue; + float fvalue; + bool bvalue; + string svalue; + + gr_lexeme(const gr_lexer& v) + : lexer(v) + , svalue(v._allocator) + {} + + const string& get_line() const { + return lexer.get_line(*this); + } + + const string& get_file() { + return lexer.get_file(*this); + } +}; + +inline char gr_lexer::get(ssize_t offset) { + const uint position = ssize_t(_current) + offset; + gp_config::assertion(!(position < 0 || position >= _text.size()), "Unexpected end of script"); + return _text[position]; +} + +inline const string& gr_lexer::get_line(const gr_lexeme& lex) const { + gp_config::assertion(!(lex._file_id >= _files_imported.size()), "Lexeme file id out of bounds"); + // TODO: Implement this + return _files_imported[lex._file_id]; +} + +inline const string& gr_lexer::get_file(const gr_lexeme& lex) const { + gp_config::assertion(!(lex._file_id >= _files_imported.size()), "Lexeme file id out of bounds"); + return _files_imported[lex._file_id]; +} + +inline const string& gr_lexer::get_file(const size_t& file_id) const { + gp_config::assertion(!(file_id >= _files_imported.size()), "File id out of bounds"); + return _files_imported[file_id]; +} + +inline bool gr_lexer::advance(bool start_from_current) { + if(!start_from_current) { + _current++; + } + + if(_current >= _text.size()) { + return false; + } + + char symbol = _text[_current]; + + whileLoop: while(symbol <= 0x20 || symbol == '/' || symbol == '#') { + if(_current >= _text.size()) { + return false; + } + + symbol = _text[_current]; + + if(symbol == '\n') { + _position_of_line = _current; + _line++; + } + else if(symbol == '#') + { + do { + if(_current >= _text.size()) return false; + _current++; + } while (_text[_current] != '\n'); + _position_of_line = _current; + _line++; + } + else if(symbol == '/') + { + if((_current + 1) >= _text.size()) { + return false; + } + + switch(_text[_current + 1]) { + case '/': { + do { + if(_current >= _text.size()) { + return false; + } + } while(_current < _text.size() && _text[_current] != '\n'); + _position_of_line = _current; + _line++; + }break; + case '*': { + for(;;) { + if((_current + 1) >= _text.size()) { + _current++; + return false; + } + + if(_text[_current] == '\n') { + _position_of_line = _current; + _line++; + } + + if(_text[_current] == '*' && _text[_current + 1] == '/') { + _current++; + break; + } + + _current++; + } + }break; + default: + // Goto honorable + goto whileLoop; + } + } + + _current++; + + if(_current >= _text.size()) { + return false; + } + + symbol = _text[_current]; + } + return true; +} + +inline void gr_lexer::scan_script() { + advance(true); + + constexpr static auto is_operator = [](char v) { + if(v == '!') return true; + if(v >= '#' && v <='&') return true; + if(v >= '(' && v <='-') return true; + if(v == '/') return true; + if(v >= ':' && v <='@') return true; + if(v >= '[' && v <='^') return true; + if(v >= '{' && v <='~') return true; + return false; + }; + + do { + if (_current >= _text.size()) + break; + auto c = get(); + if(is_digit(c)) scan_number(); + else if(c == '.') { + if (get(1) >= '0' && get(1) <= '9') + scan_number(); + else + scan_operator(); + } + else if(is_operator(c)) scan_operator(); + else if(c == '\"') scan_string(); + else scan_word(); + }while (advance()); +} + + inline void gr_lexer::scan_number(){ + gr_lexeme lex = gr_lexeme(*this); + lex.is_literal = true; + + bool isFloat; + string buffer(_allocator); + for (;;) { + char symbol = get(); + + if (symbol >= '0' && symbol <= '9') + buffer.push_back(symbol); + else if (symbol == '_') { + // Do nothing, only cosmetic (e.g. 1_000_000). + } + else if (symbol == '.') { + if (isFloat) + break; + isFloat = true; + buffer.push_back(symbol); + } + else if (symbol == 'f') { + isFloat = true; + break; + } + else { + if (_current) + _current--; + break; + } + + _current++; + + if (_current >= _text.size()) + break; + } + } + + inline void gr_lexer::scan_string(){ + gr_lexeme lex = gr_lexeme(*this); + lex.type = gr_lexeme_type::string_; + lex.is_literal = true; + + gp_config::assertion(get() != '\"',"Expected \'\"\' at the beginning of the string."); + _current++; + + string buffer(_allocator); + bool escape = false; + bool wasEscape = false; + for (;;) { + gp_config::assertion(_current >= _text.size(),"Missing \'\"\' character."); + char symbol = get(); + + if (symbol == '\n') { + _position_of_line = _current; + _line++; + } + else if (symbol == '\"' && (!wasEscape)) + break; + else if (symbol == '\\' && (!wasEscape)) { + escape = true; + } + + if (!escape) { + if (!wasEscape) { + buffer.push_back(symbol); + } + else { + if (symbol == 'n') + buffer.push_back('\n'); + else + buffer.push_back(symbol); + } + } + wasEscape = escape; + escape = false; + + _current++; + } + + lex._text_length = size_t(buffer.size() + 2u); + lex.svalue = buffer; + _lexemes.push_back(lex); + } + + inline void gr_lexer::scan_operator(){ + gr_lexeme lex = gr_lexeme(*this); + lex.is_operator = true; + + switch (get()) { + case '{': + lex.type = gr_lexeme_type::leftCurlyBrace; + break; + case '}': + lex.type = gr_lexeme_type::rightCurlyBrace; + break; + case '(': + lex.type = gr_lexeme_type::leftParenthesis; + break; + case ')': + lex.type = gr_lexeme_type::rightParenthesis; + break; + case '[': + lex.type = gr_lexeme_type::leftBracket; + break; + case ']': + lex.type = gr_lexeme_type::rightBracket; + break; + case '.': + lex.type = gr_lexeme_type::period; + break; + case ';': + lex.type = gr_lexeme_type::semicolon; + break; + case ':': + lex.type = gr_lexeme_type::colon; + if (_current + 1 >= _text.size()) + break; + if (get(1) == ':') { + lex.type = gr_lexeme_type::doubleColon; + lex._text_length = 2; + _current++; + } + break; + case ',': + lex.type = gr_lexeme_type::comma; + break; + case '^': + lex.type = gr_lexeme_type::power; + if (_current + 1 >= _text.size()) + break; + if (get(1) == '=') { + lex.type = gr_lexeme_type::powerAssign; + lex._text_length = 2; + _current++; + } + break; + case '@': + lex.type = gr_lexeme_type::at; + break; + case '&': + lex.type = gr_lexeme_type::pointer; + break; + case '~': + lex.type = gr_lexeme_type::concatenate; + if (_current + 1 >= _text.size()) + break; + if (get(1) == '=') { + lex.type = gr_lexeme_type::concatenateAssign; + lex._text_length = 2; + _current++; + } + break; + case '+': + lex.type = gr_lexeme_type::add; + if (_current + 1 >= _text.size()) + break; + switch (get(1)) { + case '=': + lex.type = gr_lexeme_type::addAssign; + lex._text_length = 2; + _current++; + break; + case '+': + lex.type = gr_lexeme_type::increment; + lex._text_length = 2; + _current++; + break; + default: + break; + } + break; + case '-': + lex.type = gr_lexeme_type::substract; + if (_current + 1 >= _text.size()) + break; + switch (get(1)) { + case '=': + lex.type = gr_lexeme_type::substractAssign; + lex._text_length = 2; + _current++; + break; + case '-': + lex.type = gr_lexeme_type::decrement; + lex._text_length = 2; + _current++; + break; + default: + break; + } + break; + case '*': + lex.type = gr_lexeme_type::multiply; + if (_current + 1 >= _text.size()) + break; + if (get(1) == '=') { + lex.type = gr_lexeme_type::multiplyAssign; + lex._text_length = 2; + _current++; + } + break; + case '/': + lex.type = gr_lexeme_type::divide; + if (_current + 1 >= _text.size()) + break; + if (get(1) == '=') { + lex.type = gr_lexeme_type::divideAssign; + lex._text_length = 2; + _current++; + } + break; + case '%': + lex.type = gr_lexeme_type::remainder; + if (_current + 1 >= _text.size()) + break; + if (get(1) == '=') { + lex.type = gr_lexeme_type::remainderAssign; + lex._text_length = 2; + _current++; + } + break; + case '=': + lex.type = gr_lexeme_type::assign; + if (_current + 1 >= _text.size()) + break; + if (get(1) == '=') { + lex.type = gr_lexeme_type::equal; + lex._text_length = 2; + _current++; + if (_current + 1 >= _text.size()) + break; + if (get(1) == '=') { + lex.type = gr_lexeme_type::doubleEqual; + lex._text_length = 3; + _current++; + } + } + break; + case '<': + lex.type = gr_lexeme_type::lesser; + if (_current + 1 >= _text.size()) + break; + if (get(1) == '=') { + lex.type = gr_lexeme_type::lesserOrEqual; + lex._text_length = 2; + _current++; + if (_current + 1 >= _text.size()) + break; + if (get(1) == '>') { + lex.type = gr_lexeme_type::threeWayComparison; + lex._text_length = 3; + _current++; + } + } + else if (get(1) == '-') { + lex.type = gr_lexeme_type::send; + lex._text_length = 2; + _current++; + } + else if (get(1) == '<') { + lex.type = gr_lexeme_type::leftShift; + lex._text_length = 2; + _current++; + } + break; + case '>': + lex.type = gr_lexeme_type::greater; + if (_current + 1 >= _text.size()) + break; + if (get(1) == '=') { + lex.type = gr_lexeme_type::greaterOrEqual; + lex._text_length = 2; + _current++; + } + else if (get(1) == '>') { + lex.type = gr_lexeme_type::rightShift; + lex._text_length = 2; + _current++; + } + break; + case '!': + lex.type = gr_lexeme_type::not_; + if (_current + 1 >= _text.size()) + break; + if (get(1) == '=') { + lex.type = gr_lexeme_type::notEqual; + lex._text_length = 2; + _current++; + } + break; + default: + gp_config::assertion(false, "GrLexer: invalid operator"); + } + + _lexemes.push_back(lex); + } + + namespace _hidden { + bool operator==(const gp::vector& lhs, const char* rhs) { + for(size_t index = 0; index < lhs.size() && rhs[index] != 0; index++){ + if(lhs[index] != rhs[index]) return false; + } + return true; + } + } + + inline void gr_lexer::scan_word(){ + gr_lexeme lex = gr_lexeme(*this); + lex.is_keyword = true; + + string symbol_buffer(_allocator); + for (;;) { + if (_current >= _text.size()) + break; + + char symbol = get(); + if (symbol == '!' || symbol == '?') { + symbol_buffer.push_back(symbol); + _current++; + break; + } + if (symbol <= '&' || (symbol >= '(' && symbol <= '/') || (symbol >= ':' + && symbol <= '@') || (symbol >= '[' && symbol <= '^') + || (symbol >= '{' && symbol <= 0x7F)) + break; + + symbol_buffer.push_back(symbol); + _current++; + } + _current--; + + lex._text_length = symbol_buffer.size(); + + using namespace _hidden; + + if(symbol_buffer == "use"){ + scan_use(); + return; + } + else if(symbol_buffer == "pub"){ + lex.type = gr_lexeme_type::public_; + } + else if(symbol_buffer == "main"){ + lex.type = gr_lexeme_type::main_; + } + else if(symbol_buffer == "type"){ + lex.type = gr_lexeme_type::type_; + } + else if(symbol_buffer == "event"){ + lex.type = gr_lexeme_type::event_; + } + else if(symbol_buffer == "class"){ + lex.type = gr_lexeme_type::class_; + } + else if(symbol_buffer == "enum"){ + lex.type = gr_lexeme_type::enum_; + } + else if(symbol_buffer == "template"){ + lex.type = gr_lexeme_type::template_; + } + else if(symbol_buffer == "if"){ + lex.type = gr_lexeme_type::if_; + } + else if(symbol_buffer == "unless"){ + lex.type = gr_lexeme_type::unless; + } + else if(symbol_buffer == "else"){ + lex.type = gr_lexeme_type::else_; + } + else if(symbol_buffer == "switch"){ + lex.type = gr_lexeme_type::switch_; + } + else if(symbol_buffer == "select"){ + lex.type = gr_lexeme_type::select; + } + else if(symbol_buffer == "case"){ + lex.type = gr_lexeme_type::case_; + } + else if(symbol_buffer == "while"){ + lex.type = gr_lexeme_type::while_; + } + else if(symbol_buffer == "do"){ + lex.type = gr_lexeme_type::do_; + } + else if(symbol_buffer == "until"){ + lex.type = gr_lexeme_type::until; + } + else if(symbol_buffer == "for"){ + lex.type = gr_lexeme_type::for_; + } + else if(symbol_buffer == "loop"){ + lex.type = gr_lexeme_type::loop; + } + else if(symbol_buffer == "return"){ + lex.type = gr_lexeme_type::return_; + } + else if(symbol_buffer == "self"){ + lex.type = gr_lexeme_type::self; + } + else if(symbol_buffer == "kill"){ + lex.type = gr_lexeme_type::kill; + } + else if(symbol_buffer == "killall"){ + lex.type = gr_lexeme_type::killAll; + } + else if(symbol_buffer == "yield"){ + lex.type = gr_lexeme_type::yield; + } + else if(symbol_buffer == "break"){ + lex.type = gr_lexeme_type::break_; + } + else if(symbol_buffer == "continue"){ + lex.type = gr_lexeme_type::continue_; + } + else if(symbol_buffer == "as"){ + lex.type = gr_lexeme_type::as; + } + else if(symbol_buffer == "try"){ + lex.type = gr_lexeme_type::try_; + } + else if(symbol_buffer == "catch"){ + lex.type = gr_lexeme_type::catch_; + } + else if(symbol_buffer == "raise"){ + lex.type = gr_lexeme_type::raise_; + } + else if(symbol_buffer == "defer"){ + lex.type = gr_lexeme_type::defer; + } + else if(symbol_buffer == "task"){ + lex.type = gr_lexeme_type::taskType; + lex.is_type = true; + } + else if(symbol_buffer == "func"){ + lex.type = gr_lexeme_type::functionType; + lex.is_type = true; + } + else if(symbol_buffer == "int"){ + lex.type = gr_lexeme_type::intType; + lex.is_type = true; + } + else if(symbol_buffer == "float"){ + lex.type = gr_lexeme_type::floatType; + lex.is_type = true; + } + else if(symbol_buffer == "bool"){ + lex.type = gr_lexeme_type::boolType; + lex.is_type = true; + } + else if(symbol_buffer == "string"){ + lex.type = gr_lexeme_type::stringType; + lex.is_type = true; + } + else if(symbol_buffer == "array"){ + lex.type = gr_lexeme_type::arrayType; + lex.is_type = true; + } + else if(symbol_buffer == "chan"){ + lex.type = gr_lexeme_type::chanType; + lex.is_type = true; + } + else if(symbol_buffer == "new"){ + lex.type = gr_lexeme_type::new_; + lex.is_type = false; + } + else if(symbol_buffer == "let"){ + lex.type = gr_lexeme_type::autoType; + lex.is_type = false; + } + else if(symbol_buffer == "true"){ + lex.type = gr_lexeme_type::boolean; + lex.is_keyword = false; + lex.is_literal = true; + lex.bvalue = true; + } + else if(symbol_buffer == "false"){ + lex.type = gr_lexeme_type::boolean; + lex.is_keyword = false; + lex.is_literal = true; + lex.bvalue = false; + } + else if(symbol_buffer == "null"){ + lex.type = gr_lexeme_type::null_; + lex.is_keyword = false; + lex.is_literal = true; + } + else if(symbol_buffer == "not"){ + lex.type = gr_lexeme_type::not_; + lex.is_keyword = false; + lex.is_operator = true; + } + else if(symbol_buffer == "and"){ + lex.type = gr_lexeme_type::and_; + lex.is_keyword = false; + lex.is_operator = true; + } + else if(symbol_buffer == "or"){ + lex.type = gr_lexeme_type::or_; + lex.is_keyword = false; + lex.is_operator = true; + } + else if(symbol_buffer == "xor"){ + lex.type = gr_lexeme_type::xor_; + lex.is_keyword = false; + lex.is_operator = true; + } else { + lex.is_keyword = false; + lex.type = gr_lexeme_type::identifier; + lex.svalue = symbol_buffer; + } + + _lexemes.push_back(lex); + } + + inline string gr_lexer::convert_path_to_import(string& path) { + return path; + } + + inline void gr_lexer::scan_file(gp::vector& file_name){ + _files_to_import.push_back(file_name); + + while (_files_to_import.size()) { + _file = _files_to_import[_files_to_import.size()-1]; + _files_imported.push_back(_file); + _text = file_loader(file_name); + _files_to_import.pop_back(); + + _line = 0u; + _current = 0u; + _lines = gp::vector(_allocator); + + gp::vector tmp(_allocator); + for(char c : _text) { + if(c == '\n') { + _lines.push_back(tmp); + tmp = gp::vector(_allocator); + } else { + tmp.push_back(c); + } + } + + scan_script(); + + _file_id++; + } + } + + inline void gr_lexer::scan_file_path(){ + gp_config::assertion(get() != '\"', "Expected \'\"\' at the beginning of the import."); + _current++; + + string buffer(_allocator); + for (;;) { + gp_config::assertion(_current >= _text.size(), "Missing \'\"\' character."); + char symbol = get(); + if (symbol == '\n') { + _position_of_line = _current; + _line++; + } + else if (symbol == '\"') + break; + + buffer.push_back(symbol); + _current++; + } + buffer = convert_path_to_import(buffer); + + for(auto& file : _files_imported) { + if(file == buffer) return; + } + for(auto& file : _files_to_import) { + if(file == buffer) return; + } + + _files_to_import.push_back(buffer); + } + + inline void gr_lexer::scan_use(){ + advance(); + + // Multiple files import. + if (get() == '{') { + advance(); + bool isFirst = true; + for (;;) { + if (isFirst) + isFirst = false; + else if (get() == '\"') + advance(); + else + gp_config::assertion(false, "Missing \'}\' after import list."); + // EOF + gp_config::assertion(_current >= _text.size(), "Missing \'}\' after import list."); + // End of the import list. + if (get() == '}') + break; + // Scan + scan_file_path(); + } + } + else { + scan_file_path(); + } + }