Explorar el Código

Added a grimoire Lexer for GP

Courtesy of Enalye for the assistance in understanding their gibberish
channel
Ludovic 'Archivist' Lagouardette hace 3 años
padre
commit
a45dbee7f1
Se han modificado 1 ficheros con 957 adiciones y 0 borrados
  1. +957
    -0
      include/gp/dynamic/compiler/lexer.hpp

+ 957
- 0
include/gp/dynamic/compiler/lexer.hpp Ver fichero

@ -0,0 +1,957 @@
#pragma once
#include "gp/containers/array.hpp"
#include "gp/containers/vector.hpp"
#include "gp/text/ascii.hpp"
using string = gp::vector<char>;
// X-macro listing every lexeme kind, in enumeration order.
// Usage: #define LEX(x) ..., expand FOREACH_LEXEME_TYPE, then #undef LEX.
// NOTE: to_pretty_string() holds a table written in this exact order; any
// insertion, removal or reordering here must be mirrored there.
#define FOREACH_LEXEME_TYPE \
    LEX(leftBracket) \
    LEX(rightBracket) \
    LEX(leftParenthesis) \
    LEX(rightParenthesis) \
    LEX(leftCurlyBrace) \
    LEX(rightCurlyBrace) \
    LEX(period) \
    LEX(semicolon) \
    LEX(colon) \
    LEX(doubleColon) \
    LEX(comma) \
    LEX(at) \
    LEX(pointer) \
    LEX(as) \
    LEX(try_) \
    LEX(catch_) \
    LEX(raise_) \
    LEX(defer) \
    LEX(assign) \
    LEX(addAssign) \
    LEX(substractAssign) \
    LEX(multiplyAssign) \
    LEX(divideAssign) \
    LEX(concatenateAssign) \
    LEX(remainderAssign) \
    LEX(powerAssign) \
    LEX(plus) \
    LEX(minus) \
    LEX(add) \
    LEX(substract) \
    LEX(multiply) \
    LEX(divide) \
    LEX(concatenate) \
    LEX(remainder) \
    LEX(power) \
    LEX(equal) \
    LEX(doubleEqual) \
    LEX(threeWayComparison) \
    LEX(notEqual) \
    LEX(greaterOrEqual) \
    LEX(greater) \
    LEX(lesserOrEqual) \
    LEX(lesser) \
    LEX(leftShift) \
    LEX(rightShift) \
    LEX(and_) \
    LEX(or_) \
    LEX(xor_) \
    LEX(not_) \
    LEX(increment) \
    LEX(decrement) \
    LEX(identifier) \
    LEX(integer) \
    LEX(float_) \
    LEX(boolean) \
    LEX(string_) \
    LEX(null_) \
    LEX(public_) \
    LEX(main_) \
    LEX(type_) \
    LEX(event_) \
    LEX(class_) \
    LEX(enum_) \
    LEX(template_) \
    LEX(new_) \
    LEX(copy) \
    LEX(send) \
    LEX(receive) \
    LEX(intType) \
    LEX(floatType) \
    LEX(boolType) \
    LEX(stringType) \
    LEX(arrayType) \
    LEX(functionType) \
    LEX(taskType) \
    LEX(chanType) \
    LEX(autoType) \
    LEX(if_) \
    LEX(unless) \
    LEX(else_) \
    LEX(switch_) \
    LEX(select) \
    LEX(case_) \
    LEX(while_) \
    LEX(do_) \
    LEX(until) \
    LEX(for_) \
    LEX(loop) \
    LEX(return_) \
    LEX(self) \
    LEX(kill) \
    LEX(killAll) \
    LEX(yield) \
    LEX(break_) \
    LEX(continue_)

// One enumerator per LEX(...) entry above, numbered from 0 in list order.
#define LEX(x) x,
enum class gr_lexeme_type {
    FOREACH_LEXEME_TYPE
};
#undef LEX

#define LEX(x) case gr_lexeme_type::x: return #x;
/// Returns the enumerator's own identifier as a C string
/// (e.g. gr_lexeme_type::killAll -> "killAll").
/// A value that is not one of the enumerators yields "<invalid lexeme type>".
constexpr inline const char* to_string(const gr_lexeme_type& value) {
    switch(value) {
        FOREACH_LEXEME_TYPE
    }
    // Without this return, control would flow off the end of a
    // value-returning function for an out-of-range value, which is undefined
    // behaviour.
    return "<invalid lexeme type>";
}
#undef LEX
#define LEX(x) +1
constexpr inline const char* to_pretty_string(const gr_lexeme_type& value) {
constexpr gp::array<const char*, 0 + FOREACH_LEXEME_TYPE> names = {
"[", "]", "(", ")", "{", "}", ".", ";", ":", "::", ",", "@", "&", "as",
"try", "catch", "raise", "defer", "=", "+=", "-=", "*=", "/=", "~=",
"%=", "^=", "+", "-", "+", "-", "*", "/", "~", "%", "^", "==", "===",
"<=>", "!=", ">=", ">", "<=", "<", "<<", ">>", "and", "or", "xor", "not", "++",
"--", "identifier", "const_int", "const_float", "const_bool",
"const_str", "null", "pub", "main", "type", "event", "class", "enum",
"template", "new", "copy", "send", "receive", "int", "float", "bool",
"string", "array", "func", "task", "chan", "let", "if", "unless",
"else", "switch", "select", "case", "while", "do", "until", "for", "loop",
"return", "self", "kill", "killall", "yield", "break", "continue"
};
return names[(uint64_t)value];
}
#undef LEX
struct gr_lexeme;

/// Lexer that turns script text into a flat list of gr_lexeme tokens.
///
/// Script contents are obtained through `file_loader`, which is called with a
/// file name and returns that file's characters. Files named by `use`
/// directives are queued in `_files_to_import` and scanned by scan_file().
class gr_lexer {
    // gr_lexeme records the scan position (file id, line, column) held in the
    // private cursor members at the moment it is constructed.
    friend struct gr_lexeme;
public:
    using file_loader_t = gp::function<gp::vector<char>(const gp::vector<char>&)>;
    /// Allocator handed to every container owned by the lexer and its lexemes.
    gp::allocator& _allocator;
    /// Callback mapping a file name to the file's contents.
    file_loader_t file_loader;
private:
    gp::vector<string> _files_to_import, _files_imported, _lines;
    string _file, _text;
    // Scan cursor. These are zero-initialised because _file_id is only ever
    // incremented and the others are read before scan_file() is guaranteed
    // to have assigned all of them.
    uint64_t _line = 0, _current = 0, _position_of_line = 0, _file_id = 0;
    gp::vector<gr_lexeme> _lexemes;

    char get(ssize_t offset = 0);
    bool advance(bool start_from_current = false);
    void scan_script();
    void scan_number();
    void scan_string();
    void scan_operator();
    void scan_word();
    void scan_file_path();
    void scan_use();
    string convert_path_to_import(string&);
public:
    /// @param alloc  allocator used for all internal containers
    /// @param loader callback returning the contents of a named file (copied)
    gr_lexer(gp::allocator& alloc, file_loader_t& loader)
    : _allocator(alloc)
    , file_loader(loader)
    , _files_to_import(_allocator)
    , _files_imported(_allocator)
    , _lines(_allocator)
    , _file(_allocator)
    , _text(_allocator)
    , _lexemes(_allocator)
    {}

    /// Lexemes produced so far, in scan order.
    const gp::vector<gr_lexeme>& lexemes() const {
        return _lexemes;
    }

    /// Scans `file_name` and every file it (transitively) imports.
    void scan_file(gp::vector<char>& file_name);
    const string& get_line(const gr_lexeme&) const;
    const string& get_file(const gr_lexeme&) const;
    const string& get_file(const size_t&) const;
};

/// One token produced by gr_lexer, together with where it was found.
struct gr_lexeme {
    /// Lexer that produced this lexeme; must outlive it.
    const gr_lexer& lexer;
    /// Index of the source file in the lexer's list of imported files.
    size_t _file_id = 0;
    /// Line counter and offset from the last line break at construction, and
    /// the length of the token text.
    size_t _line = 0, _column = 0, _text_length = 1;
    gr_lexeme_type type{};
    // Category flags. They default to false so that a scanner only has to
    // raise the ones that apply and copies never read indeterminate values.
    bool
        is_literal = false,
        is_operator = false,
        is_keyword = false,
        is_type = false;
    // Literal payloads; which one is meaningful depends on `type`.
    int ivalue = 0;
    float fvalue = 0.0f;
    bool bvalue = false;
    string svalue;

    /// Creates a lexeme located at the lexer's current scan position.
    gr_lexeme(const gr_lexer& v)
    : lexer(v)
    , _file_id(static_cast<size_t>(v._file_id))
    , _line(static_cast<size_t>(v._line))
    // Guard the subtraction: a line marker beyond the cursor would otherwise
    // wrap around to a huge column.
    , _column(static_cast<size_t>(
        v._current >= v._position_of_line ? v._current - v._position_of_line : v._current))
    , svalue(v._allocator)
    {}

    const string& get_line() const {
        return lexer.get_line(*this);
    }

    const string& get_file() const {
        return lexer.get_file(*this);
    }
};
/// Returns the character `offset` positions away from the cursor.
/// Asserts ("Unexpected end of script") when that position lies before the
/// start or past the end of the text.
inline char gr_lexer::get(ssize_t offset) {
    // Signed and full width: an unsigned 32-bit index can never compare below
    // zero and would truncate a 64-bit cursor.
    const ssize_t position = static_cast<ssize_t>(_current) + offset;
    const bool in_bounds = position >= 0
        && static_cast<uint64_t>(position) < _text.size();
    gp_config::assertion(in_bounds, "Unexpected end of script");
    return _text[static_cast<uint64_t>(position)];
}
/// Placeholder: asserts that the lexeme's file id is valid and currently
/// returns the entry of the imported-file list for that id, not the text of
/// a source line.
inline const string& gr_lexer::get_line(const gr_lexeme& lex) const {
    const bool id_is_known = lex._file_id < _files_imported.size();
    gp_config::assertion(id_is_known, "Lexeme file id out of bounds");
    // TODO: Implement this
    return _files_imported[lex._file_id];
}
/// Returns the imported-file entry the lexeme's file id refers to.
/// Asserts when the id is not a valid index into the imported-file list.
inline const string& gr_lexer::get_file(const gr_lexeme& lex) const {
    const bool id_is_known = lex._file_id < _files_imported.size();
    gp_config::assertion(id_is_known, "Lexeme file id out of bounds");
    return _files_imported[lex._file_id];
}
/// Returns the imported-file entry stored at index `file_id`.
/// Asserts when `file_id` is not a valid index into the imported-file list.
inline const string& gr_lexer::get_file(const size_t& file_id) const {
    const bool id_is_known = file_id < _files_imported.size();
    gp_config::assertion(id_is_known, "File id out of bounds");
    return _files_imported[file_id];
}
/// Moves the cursor to the next significant character, skipping characters
/// up to 0x20 (whitespace and control characters), `#` and `//` line
/// comments and `/* ... */` block comments, and updating the line bookkeeping
/// on every line break.
///
/// @param start_from_current when false the cursor first steps past the
///        character it is on; when true that character is examined as is.
/// @return true when the cursor rests on a character to lex, false when the
///         end of the text was reached first.
inline bool gr_lexer::advance(bool start_from_current) {
    if(!start_from_current) {
        _current++;
    }
    while(_current < _text.size()) {
        const char symbol = _text[_current];
        if(symbol == '\n') {
            _position_of_line = _current;
            _line++;
        }
        else if(symbol == '#') {
            // Line comment: run to the line break. The bound is checked after
            // stepping so the text is never read one past its end.
            do {
                _current++;
                if(_current >= _text.size()) return false;
            } while(_text[_current] != '\n');
            _position_of_line = _current;
            _line++;
        }
        else if(symbol == '/') {
            if(_current + 1 >= _text.size()) {
                // A '/' closing the text cannot open a comment: it is a token.
                return true;
            }
            const char next = _text[_current + 1];
            if(next == '/') {
                // Line comment: the cursor has to move on every iteration,
                // otherwise it stays on the leading '/' forever.
                do {
                    _current++;
                } while(_current < _text.size() && _text[_current] != '\n');
                if(_current >= _text.size()) return false;
                _position_of_line = _current;
                _line++;
            }
            else if(next == '*') {
                // Step over the opener so that "/*/" is not mistaken for a
                // complete comment.
                _current += 2;
                for(;;) {
                    if((_current + 1) >= _text.size()) {
                        // Unterminated block comment.
                        _current++;
                        return false;
                    }
                    if(_text[_current] == '\n') {
                        _position_of_line = _current;
                        _line++;
                    }
                    if(_text[_current] == '*' && _text[_current + 1] == '/') {
                        // Land on the closing '/'; the step below passes it.
                        _current++;
                        break;
                    }
                    _current++;
                }
            }
            else {
                // Plain '/': a division operator, i.e. a significant token.
                return true;
            }
        }
        else if(symbol > 0x20) {
            return true;
        }
        _current++;
    }
    return false;
}
/// Lexes the whole of `_text`, dispatching on the first character of each
/// token to the number, operator, string or word scanner.
inline void gr_lexer::scan_script() {
    // Position the cursor on the first significant character, if any.
    advance(true);
    // Characters handed to scan_operator().
    constexpr static auto is_operator = [](char v) {
        if(v == '!') return true;
        if(v >= '#' && v <='&') return true;
        if(v >= '(' && v <='-') return true;
        if(v == '/') return true;
        if(v >= ':' && v <='@') return true;
        if(v >= '[' && v <='^') return true;
        if(v >= '{' && v <='~') return true;
        return false;
    };
    do {
        if (_current >= _text.size())
            break;
        auto c = get();
        if(is_digit(c)) scan_number();
        else if(c == '.') {
            // A '.' starts a number only when a digit follows. The bound is
            // checked first, as scan_operator() does for its look-ahead, so a
            // '.' ending the text is lexed as a period instead of asserting.
            const bool starts_fraction = _current + 1 < _text.size()
                && get(1) >= '0' && get(1) <= '9';
            if (starts_fraction)
                scan_number();
            else
                scan_operator();
        }
        else if(is_operator(c)) scan_operator();
        else if(c == '\"') scan_string();
        else scan_word();
    }while (advance());
}
inline void gr_lexer::scan_number(){
gr_lexeme lex = gr_lexeme(*this);
lex.is_literal = true;
bool isFloat;
string buffer(_allocator);
for (;;) {
char symbol = get();
if (symbol >= '0' && symbol <= '9')
buffer.push_back(symbol);
else if (symbol == '_') {
// Do nothing, only cosmetic (e.g. 1_000_000).
}
else if (symbol == '.') {
if (isFloat)
break;
isFloat = true;
buffer.push_back(symbol);
}
else if (symbol == 'f') {
isFloat = true;
break;
}
else {
if (_current)
_current--;
break;
}
_current++;
if (_current >= _text.size())
break;
}
}
/// Scans a double-quoted string literal starting at the cursor and appends a
/// string_ lexeme whose `svalue` holds the decoded contents.
///
/// A backslash escapes the next character: "\n" becomes a line break, any
/// other escaped character is kept verbatim. On return the cursor rests on
/// the closing quote.
inline void gr_lexer::scan_string(){
    gr_lexeme lex = gr_lexeme(*this);
    lex.type = gr_lexeme_type::string_;
    lex.is_literal = true;
    // gp_config::assertion() takes the condition that must HOLD.
    gp_config::assertion(get() == '\"',"Expected \'\"\' at the beginning of the string.");
    _current++;
    string buffer(_allocator);
    bool escape = false;      // the current character is an unescaped backslash
    bool wasEscape = false;   // the previous character was an unescaped backslash
    for (;;) {
        // Running out of text before the closing quote is an error.
        gp_config::assertion(_current < _text.size(),"Missing \'\"\' character.");
        const char symbol = get();
        if (symbol == '\n') {
            _position_of_line = _current;
            _line++;
        }
        else if (symbol == '\"' && (!wasEscape))
            break;
        else if (symbol == '\\' && (!wasEscape)) {
            escape = true;
        }
        if (!escape) {
            // Only "\n" is translated; other escaped characters pass through.
            buffer.push_back((wasEscape && symbol == 'n') ? '\n' : symbol);
        }
        wasEscape = escape;
        escape = false;
        _current++;
    }
    // Decoded length plus the two quotes.
    lex._text_length = size_t(buffer.size() + 2u);
    lex.svalue = buffer;
    _lexemes.push_back(lex);
}
inline void gr_lexer::scan_operator(){
gr_lexeme lex = gr_lexeme(*this);
lex.is_operator = true;
switch (get()) {
case '{':
lex.type = gr_lexeme_type::leftCurlyBrace;
break;
case '}':
lex.type = gr_lexeme_type::rightCurlyBrace;
break;
case '(':
lex.type = gr_lexeme_type::leftParenthesis;
break;
case ')':
lex.type = gr_lexeme_type::rightParenthesis;
break;
case '[':
lex.type = gr_lexeme_type::leftBracket;
break;
case ']':
lex.type = gr_lexeme_type::rightBracket;
break;
case '.':
lex.type = gr_lexeme_type::period;
break;
case ';':
lex.type = gr_lexeme_type::semicolon;
break;
case ':':
lex.type = gr_lexeme_type::colon;
if (_current + 1 >= _text.size())
break;
if (get(1) == ':') {
lex.type = gr_lexeme_type::doubleColon;
lex._text_length = 2;
_current++;
}
break;
case ',':
lex.type = gr_lexeme_type::comma;
break;
case '^':
lex.type = gr_lexeme_type::power;
if (_current + 1 >= _text.size())
break;
if (get(1) == '=') {
lex.type = gr_lexeme_type::powerAssign;
lex._text_length = 2;
_current++;
}
break;
case '@':
lex.type = gr_lexeme_type::at;
break;
case '&':
lex.type = gr_lexeme_type::pointer;
break;
case '~':
lex.type = gr_lexeme_type::concatenate;
if (_current + 1 >= _text.size())
break;
if (get(1) == '=') {
lex.type = gr_lexeme_type::concatenateAssign;
lex._text_length = 2;
_current++;
}
break;
case '+':
lex.type = gr_lexeme_type::add;
if (_current + 1 >= _text.size())
break;
switch (get(1)) {
case '=':
lex.type = gr_lexeme_type::addAssign;
lex._text_length = 2;
_current++;
break;
case '+':
lex.type = gr_lexeme_type::increment;
lex._text_length = 2;
_current++;
break;
default:
break;
}
break;
case '-':
lex.type = gr_lexeme_type::substract;
if (_current + 1 >= _text.size())
break;
switch (get(1)) {
case '=':
lex.type = gr_lexeme_type::substractAssign;
lex._text_length = 2;
_current++;
break;
case '-':
lex.type = gr_lexeme_type::decrement;
lex._text_length = 2;
_current++;
break;
default:
break;
}
break;
case '*':
lex.type = gr_lexeme_type::multiply;
if (_current + 1 >= _text.size())
break;
if (get(1) == '=') {
lex.type = gr_lexeme_type::multiplyAssign;
lex._text_length = 2;
_current++;
}
break;
case '/':
lex.type = gr_lexeme_type::divide;
if (_current + 1 >= _text.size())
break;
if (get(1) == '=') {
lex.type = gr_lexeme_type::divideAssign;
lex._text_length = 2;
_current++;
}
break;
case '%':
lex.type = gr_lexeme_type::remainder;
if (_current + 1 >= _text.size())
break;
if (get(1) == '=') {
lex.type = gr_lexeme_type::remainderAssign;
lex._text_length = 2;
_current++;
}
break;
case '=':
lex.type = gr_lexeme_type::assign;
if (_current + 1 >= _text.size())
break;
if (get(1) == '=') {
lex.type = gr_lexeme_type::equal;
lex._text_length = 2;
_current++;
if (_current + 1 >= _text.size())
break;
if (get(1) == '=') {
lex.type = gr_lexeme_type::doubleEqual;
lex._text_length = 3;
_current++;
}
}
break;
case '<':
lex.type = gr_lexeme_type::lesser;
if (_current + 1 >= _text.size())
break;
if (get(1) == '=') {
lex.type = gr_lexeme_type::lesserOrEqual;
lex._text_length = 2;
_current++;
if (_current + 1 >= _text.size())
break;
if (get(1) == '>') {
lex.type = gr_lexeme_type::threeWayComparison;
lex._text_length = 3;
_current++;
}
}
else if (get(1) == '-') {
lex.type = gr_lexeme_type::send;
lex._text_length = 2;
_current++;
}
else if (get(1) == '<') {
lex.type = gr_lexeme_type::leftShift;
lex._text_length = 2;
_current++;
}
break;
case '>':
lex.type = gr_lexeme_type::greater;
if (_current + 1 >= _text.size())
break;
if (get(1) == '=') {
lex.type = gr_lexeme_type::greaterOrEqual;
lex._text_length = 2;
_current++;
}
else if (get(1) == '>') {
lex.type = gr_lexeme_type::rightShift;
lex._text_length = 2;
_current++;
}
break;
case '!':
lex.type = gr_lexeme_type::not_;
if (_current + 1 >= _text.size())
break;
if (get(1) == '=') {
lex.type = gr_lexeme_type::notEqual;
lex._text_length = 2;
_current++;
}
break;
default:
gp_config::assertion(false, "GrLexer: invalid operator");
}
_lexemes.push_back(lex);
}
namespace _hidden {
    /// Compares a character vector with a NUL-terminated string.
    /// True only when both hold exactly the same characters: a mere common
    /// prefix (e.g. "user" against "use", or an empty vector against
    /// anything non-empty) is not a match.
    /// Declared inline because this header may be included from several
    /// translation units.
    inline bool operator==(const gp::vector<char>& lhs, const char* rhs) {
        size_t index = 0;
        for(; index < lhs.size(); index++){
            // rhs ended first, or the characters differ.
            if(rhs[index] == 0 || lhs[index] != rhs[index]) return false;
        }
        // lhs is exhausted; equal only if rhs ends here as well.
        return rhs[index] == 0;
    }
}
inline void gr_lexer::scan_word(){
gr_lexeme lex = gr_lexeme(*this);
lex.is_keyword = true;
string symbol_buffer(_allocator);
for (;;) {
if (_current >= _text.size())
break;
char symbol = get();
if (symbol == '!' || symbol == '?') {
symbol_buffer.push_back(symbol);
_current++;
break;
}
if (symbol <= '&' || (symbol >= '(' && symbol <= '/') || (symbol >= ':'
&& symbol <= '@') || (symbol >= '[' && symbol <= '^')
|| (symbol >= '{' && symbol <= 0x7F))
break;
symbol_buffer.push_back(symbol);
_current++;
}
_current--;
lex._text_length = symbol_buffer.size();
using namespace _hidden;
if(symbol_buffer == "use"){
scan_use();
return;
}
else if(symbol_buffer == "pub"){
lex.type = gr_lexeme_type::public_;
}
else if(symbol_buffer == "main"){
lex.type = gr_lexeme_type::main_;
}
else if(symbol_buffer == "type"){
lex.type = gr_lexeme_type::type_;
}
else if(symbol_buffer == "event"){
lex.type = gr_lexeme_type::event_;
}
else if(symbol_buffer == "class"){
lex.type = gr_lexeme_type::class_;
}
else if(symbol_buffer == "enum"){
lex.type = gr_lexeme_type::enum_;
}
else if(symbol_buffer == "template"){
lex.type = gr_lexeme_type::template_;
}
else if(symbol_buffer == "if"){
lex.type = gr_lexeme_type::if_;
}
else if(symbol_buffer == "unless"){
lex.type = gr_lexeme_type::unless;
}
else if(symbol_buffer == "else"){
lex.type = gr_lexeme_type::else_;
}
else if(symbol_buffer == "switch"){
lex.type = gr_lexeme_type::switch_;
}
else if(symbol_buffer == "select"){
lex.type = gr_lexeme_type::select;
}
else if(symbol_buffer == "case"){
lex.type = gr_lexeme_type::case_;
}
else if(symbol_buffer == "while"){
lex.type = gr_lexeme_type::while_;
}
else if(symbol_buffer == "do"){
lex.type = gr_lexeme_type::do_;
}
else if(symbol_buffer == "until"){
lex.type = gr_lexeme_type::until;
}
else if(symbol_buffer == "for"){
lex.type = gr_lexeme_type::for_;
}
else if(symbol_buffer == "loop"){
lex.type = gr_lexeme_type::loop;
}
else if(symbol_buffer == "return"){
lex.type = gr_lexeme_type::return_;
}
else if(symbol_buffer == "self"){
lex.type = gr_lexeme_type::self;
}
else if(symbol_buffer == "kill"){
lex.type = gr_lexeme_type::kill;
}
else if(symbol_buffer == "killall"){
lex.type = gr_lexeme_type::killAll;
}
else if(symbol_buffer == "yield"){
lex.type = gr_lexeme_type::yield;
}
else if(symbol_buffer == "break"){
lex.type = gr_lexeme_type::break_;
}
else if(symbol_buffer == "continue"){
lex.type = gr_lexeme_type::continue_;
}
else if(symbol_buffer == "as"){
lex.type = gr_lexeme_type::as;
}
else if(symbol_buffer == "try"){
lex.type = gr_lexeme_type::try_;
}
else if(symbol_buffer == "catch"){
lex.type = gr_lexeme_type::catch_;
}
else if(symbol_buffer == "raise"){
lex.type = gr_lexeme_type::raise_;
}
else if(symbol_buffer == "defer"){
lex.type = gr_lexeme_type::defer;
}
else if(symbol_buffer == "task"){
lex.type = gr_lexeme_type::taskType;
lex.is_type = true;
}
else if(symbol_buffer == "func"){
lex.type = gr_lexeme_type::functionType;
lex.is_type = true;
}
else if(symbol_buffer == "int"){
lex.type = gr_lexeme_type::intType;
lex.is_type = true;
}
else if(symbol_buffer == "float"){
lex.type = gr_lexeme_type::floatType;
lex.is_type = true;
}
else if(symbol_buffer == "bool"){
lex.type = gr_lexeme_type::boolType;
lex.is_type = true;
}
else if(symbol_buffer == "string"){
lex.type = gr_lexeme_type::stringType;
lex.is_type = true;
}
else if(symbol_buffer == "array"){
lex.type = gr_lexeme_type::arrayType;
lex.is_type = true;
}
else if(symbol_buffer == "chan"){
lex.type = gr_lexeme_type::chanType;
lex.is_type = true;
}
else if(symbol_buffer == "new"){
lex.type = gr_lexeme_type::new_;
lex.is_type = false;
}
else if(symbol_buffer == "let"){
lex.type = gr_lexeme_type::autoType;
lex.is_type = false;
}
else if(symbol_buffer == "true"){
lex.type = gr_lexeme_type::boolean;
lex.is_keyword = false;
lex.is_literal = true;
lex.bvalue = true;
}
else if(symbol_buffer == "false"){
lex.type = gr_lexeme_type::boolean;
lex.is_keyword = false;
lex.is_literal = true;
lex.bvalue = false;
}
else if(symbol_buffer == "null"){
lex.type = gr_lexeme_type::null_;
lex.is_keyword = false;
lex.is_literal = true;
}
else if(symbol_buffer == "not"){
lex.type = gr_lexeme_type::not_;
lex.is_keyword = false;
lex.is_operator = true;
}
else if(symbol_buffer == "and"){
lex.type = gr_lexeme_type::and_;
lex.is_keyword = false;
lex.is_operator = true;
}
else if(symbol_buffer == "or"){
lex.type = gr_lexeme_type::or_;
lex.is_keyword = false;
lex.is_operator = true;
}
else if(symbol_buffer == "xor"){
lex.type = gr_lexeme_type::xor_;
lex.is_keyword = false;
lex.is_operator = true;
} else {
lex.is_keyword = false;
lex.type = gr_lexeme_type::identifier;
lex.svalue = symbol_buffer;
}
_lexemes.push_back(lex);
}
/// Maps a path written in a `use` directive to the name under which the file
/// is imported. Currently the path is returned unchanged, as a copy.
inline string gr_lexer::convert_path_to_import(string& path) {
    string import_name = path;
    return import_name;
}
/// Scans `file_name` and then every file queued by `use` directives, until
/// the import queue is empty. Each file is loaded through `file_loader`,
/// appended to the list of imported files and lexed with scan_script().
inline void gr_lexer::scan_file(gp::vector<char>& file_name){
    _files_to_import.push_back(file_name);
    while (_files_to_import.size()) {
        // Take the most recently queued file.
        _file = _files_to_import[_files_to_import.size()-1];
        _files_imported.push_back(_file);
        // Load the file taken from the queue; loading `file_name` here would
        // lex the root script again for every import.
        _text = file_loader(_file);
        _files_to_import.pop_back();

        // Restart the cursor for the new text.
        _line = 0u;
        _current = 0u;
        _position_of_line = 0u;

        // Split the text into lines, without the line breaks.
        _lines = gp::vector<string>(_allocator);
        gp::vector<char> line(_allocator);
        for(char c : _text) {
            if(c == '\n') {
                _lines.push_back(line);
                line = gp::vector<char>(_allocator);
            } else {
                line.push_back(c);
            }
        }
        // Keep what follows the last line break (empty if the text ends
        // with one).
        _lines.push_back(line);

        scan_script();
        _file_id++;
    }
}
/// Scans a double-quoted path starting at the cursor and queues it for
/// import unless it has already been imported or queued.
/// On return the cursor rests on the closing quote.
inline void gr_lexer::scan_file_path(){
    // gp_config::assertion() takes the condition that must HOLD.
    gp_config::assertion(get() == '\"', "Expected \'\"\' at the beginning of the import.");
    _current++;
    string buffer(_allocator);
    for (;;) {
        // Running out of text before the closing quote is an error.
        gp_config::assertion(_current < _text.size(), "Missing \'\"\' character.");
        const char symbol = get();
        if (symbol == '\n') {
            _position_of_line = _current;
            _line++;
        }
        else if (symbol == '\"')
            break;
        buffer.push_back(symbol);
        _current++;
    }
    buffer = convert_path_to_import(buffer);
    // Skip files that were already imported or are already queued.
    for(auto& file : _files_imported) {
        if(file == buffer) return;
    }
    for(auto& file : _files_to_import) {
        if(file == buffer) return;
    }
    _files_to_import.push_back(buffer);
}
/// Handles a `use` directive. The cursor is on the last character of the
/// word "use"; what follows is either one quoted path or a `{ ... }` list of
/// quoted paths. Every path is passed to scan_file_path().
inline void gr_lexer::scan_use(){
    advance();
    // Multiple files import.
    if (get() == '{') {
        advance();
        bool isFirst = true;
        for (;;) {
            if (isFirst)
                isFirst = false;
            else if (get() == '\"')
                // scan_file_path() stopped on the closing quote: step past it.
                advance();
            else
                gp_config::assertion(false, "Missing \'}\' after import list.");
            // EOF: the text must not end before the list is closed.
            gp_config::assertion(_current < _text.size(), "Missing \'}\' after import list.");
            // End of the import list.
            if (get() == '}')
                break;
            // Scan
            scan_file_path();
        }
    }
    else {
        scan_file_path();
    }
}

Cargando…
Cancelar
Guardar