From 64c3850df41acad1454ec0b73d56e0fb5caff1cf Mon Sep 17 00:00:00 2001 From: Ludovic 'Archivist' Lagouardette Date: Mon, 23 Jan 2023 07:47:13 +0100 Subject: [PATCH] Initial commit --- .idea/misc.xml | 4 + .idea/modules.xml | 8 + .idea/sugar.iml | 2 + .idea/vcs.xml | 6 + CMakeLists.txt | 10 ++ include/molasses/lexer.h | 17 ++ include/molasses/parser_primitives.h | 144 ++++++++++++++++ prototypes/molasses/basic_file.mol | 7 + prototypes/molasses/first_rountrip.mol | 17 ++ src/main.cpp | 50 ++++++ src/molasses/lexer.cpp | 110 ++++++++++++ src/molasses/parser_primitives.cpp | 228 +++++++++++++++++++++++++ 12 files changed, 603 insertions(+) create mode 100644 .idea/misc.xml create mode 100644 .idea/modules.xml create mode 100644 .idea/sugar.iml create mode 100644 .idea/vcs.xml create mode 100644 CMakeLists.txt create mode 100644 include/molasses/lexer.h create mode 100644 include/molasses/parser_primitives.h create mode 100644 prototypes/molasses/basic_file.mol create mode 100644 prototypes/molasses/first_rountrip.mol create mode 100644 src/main.cpp create mode 100644 src/molasses/lexer.cpp create mode 100644 src/molasses/parser_primitives.cpp diff --git a/.idea/misc.xml b/.idea/misc.xml new file mode 100644 index 0000000..79b3c94 --- /dev/null +++ b/.idea/misc.xml @@ -0,0 +1,4 @@ + + + + \ No newline at end of file diff --git a/.idea/modules.xml b/.idea/modules.xml new file mode 100644 index 0000000..427b63b --- /dev/null +++ b/.idea/modules.xml @@ -0,0 +1,8 @@ + + + + + + + + \ No newline at end of file diff --git a/.idea/sugar.iml b/.idea/sugar.iml new file mode 100644 index 0000000..f08604b --- /dev/null +++ b/.idea/sugar.iml @@ -0,0 +1,2 @@ + + \ No newline at end of file diff --git a/.idea/vcs.xml b/.idea/vcs.xml new file mode 100644 index 0000000..94a25f7 --- /dev/null +++ b/.idea/vcs.xml @@ -0,0 +1,6 @@ + + + + + + \ No newline at end of file diff --git a/CMakeLists.txt b/CMakeLists.txt new file mode 100644 index 0000000..ebdfb74 --- /dev/null +++ 
b/CMakeLists.txt
@@ -0,0 +1,10 @@
+cmake_minimum_required(VERSION 3.24)
+project(sugar)
+
+find_package(cppfront REQUIRED)
+
+set(CMAKE_CXX_STANDARD 20)
+
+include_directories(include)
+
+add_executable(sugar src/main.cpp src/molasses/lexer.cpp include/molasses/lexer.h src/molasses/parser_primitives.cpp include/molasses/parser_primitives.h)
diff --git a/include/molasses/lexer.h b/include/molasses/lexer.h
new file mode 100644
index 0000000..b08e052
--- /dev/null
+++ b/include/molasses/lexer.h
@@ -0,0 +1,24 @@
+#pragma once
+#include <map>
+#include <string>
+#include <vector>
+
+namespace molasses {
+    // We will always want symbols to be convertible to int for dictionary lookups
+    using symbol = int;
+
+    // Output of the lexer: a stream of symbols and the table giving the text
+    // of the token each symbol stands for.
+    struct lexed_output {
+        std::map<symbol, std::string> dictionary;
+        std::vector<symbol> symbols;
+    };
+
+    // Splits the input on whitespace; every distinct token text gets its own
+    // symbol, numbered from 1 in order of first appearance.
+    lexed_output lex(const std::string &);
+
+    // Appends rhs to lhs. Symbols of lhs keep their value, symbols of rhs are
+    // renumbered so that a given token text maps to a single symbol.
+    lexed_output concatenate(const lexed_output& lhs, const lexed_output& rhs);
+}
\ No newline at end of file
diff --git a/include/molasses/parser_primitives.h b/include/molasses/parser_primitives.h
new file mode 100644
index 0000000..a37756c
--- /dev/null
+++ b/include/molasses/parser_primitives.h
@@ -0,0 +1,164 @@
+#pragma once
+#include <compare>
+#include <cstddef>
+#include <cstdint>
+#include <memory>
+#include <optional>
+#include <stdexcept>
+#include <string>
+#include <utility>
+#include <vector>
+
+#include "molasses/lexer.h"
+
+namespace molasses {
+    // A type of the language, described by its name and its size in bytes
+    struct type {
+        // Derived types are owned through pointers to this base
+        virtual ~type() = default;
+        [[nodiscard]] virtual std::string name() const = 0;
+        [[nodiscard]] virtual size_t byte_size() const = 0;
+    };
+
+    // Types compare by name
+    inline auto operator<=>(const type& lhs, const type& rhs) {
+        return lhs.name() <=> rhs.name();
+    }
+
+    // A type whose name and size are fixed at construction
+    struct primitive_type : public type {
+        std::string _name;
+        size_t _byte_size;
+
+        primitive_type(std::string name, size_t byte_size)
+        : _name(std::move(name))
+        , _byte_size(byte_size)
+        {}
+
+        [[nodiscard]] std::string name() const final {
+            return _name;
+        }
+        [[nodiscard]] size_t byte_size() const final {
+            return _byte_size;
+        }
+    };
+
+    struct parser_context;
+
+    // Anything that consumes typed values from the stack and pushes others
+    struct operation {
+        // Derived operations are owned through pointers to this base
+        virtual ~operation() = default;
+        [[nodiscard]] virtual std::string name() const = 0;
+        // Names of the consumed types, the last one being the top of the stack
+        [[nodiscard]] virtual std::vector<std::string> argument_types() const = 0;
+        // Names of the types pushed once the arguments have been consumed
+        [[nodiscard]] virtual std::vector<std::string> return_types() const = 0;
+        // Instructions emitted for this operation
+        [[nodiscard]] virtual std::vector<std::string> generate(const parser_context&) const = 0;
+    };
+
+    // An operation that emits a stored list of instructions
+    struct primitive_operation : public operation {
+        std::string _name;
+        std::vector<std::string> _args;
+        std::vector<std::string> _rets;
+        std::vector<std::string> _instructions;
+
+        primitive_operation(std::string name, std::vector<std::string> args, std::vector<std::string> rets)
+        : _name(std::move(name))
+        , _args(std::move(args))
+        , _rets(std::move(rets))
+        {}
+
+        [[nodiscard]] std::string name() const final {
+            return _name;
+        }
+        [[nodiscard]] std::vector<std::string> argument_types() const final {
+            return _args;
+        }
+        [[nodiscard]] std::vector<std::string> return_types() const final {
+            return _rets;
+        }
+        [[nodiscard]] std::vector<std::string> generate(const parser_context&) const final {
+            return _instructions;
+        }
+    };
+
+    // An operation declared in source code; _body is its symbol stream
+    struct procedure_operation : public operation {
+        std::string _name;
+        std::vector<std::string> _args;
+        std::vector<std::string> _rets;
+        std::vector<symbol> _body;
+
+        procedure_operation(std::string name, std::vector<std::string> args, std::vector<std::string> rets, std::vector<symbol> body)
+        : _name(std::move(name))
+        , _args(std::move(args))
+        , _rets(std::move(rets))
+        , _body(std::move(body))
+        {}
+
+        [[nodiscard]] std::string name() const final {
+            return _name;
+        }
+        [[nodiscard]] std::vector<std::string> argument_types() const final {
+            return _args;
+        }
+        [[nodiscard]] std::vector<std::string> return_types() const final {
+            return _rets;
+        }
+        [[nodiscard]] std::vector<std::string> generate(const parser_context&) const final;
+    };
+
+    // Operations compare by name
+    inline auto operator<=>(const operation& lhs, const operation& rhs) {
+        return lhs.name() <=> rhs.name();
+    }
+
+    struct TypeInputError : std::runtime_error {
+        TypeInputError() : std::runtime_error("Bad type provided") {}
+        // TODO: Better error message
+    };
+    struct ValueMissingError : std::runtime_error {
+        ValueMissingError() : std::runtime_error("Expected value, none provided") {}
+        // TODO: Better error message
+    };
+    struct ProcedureStackError : std::runtime_error {
+        ProcedureStackError() : std::runtime_error("Expected the stack to look like the return stack upon completion") {}
+        // TODO: Better error message
+    };
+    struct UnexpectedTokenError : std::runtime_error {
+        UnexpectedTokenError() : std::runtime_error("An unexpected token has been encountered") {}
+        // TODO: Better error message
+    };
+    struct ExpectingTokenError : std::runtime_error {
+        ExpectingTokenError() : std::runtime_error("An expected token has not been encountered before the end of the input") {}
+        // TODO: Better error message
+    };
+
+    // Applies next_op to a type stack: pops its argument types (throwing
+    // TypeInputError on a mismatch, ValueMissingError when the stack runs
+    // out) and pushes its return types.
+    std::vector<std::string> operator>>(std::vector<std::string> current_stack, const operation& next_op);
+
+    // Attempts to read str as a base 10 int32_t; an empty optional signals failure
+    std::optional<int32_t> try_parse_int32(const std::string& str);
+
+    // Types and operations known while parsing and type checking
+    struct parser_context {
+        std::vector<std::shared_ptr<type>> types;
+        std::vector<std::shared_ptr<operation>> operations;
+
+        // Both lookups return an empty pointer when the name is unknown
+        [[nodiscard]] std::shared_ptr<type> lookup_type(const std::string&) const;
+        [[nodiscard]] std::shared_ptr<operation> lookup_operation(const std::string&) const;
+    };
+
+    parser_context parse(parser_context, const lexed_output&);
+
+    parser_context register_integers(parser_context);
+
+    bool type_check(const parser_context&, const lexed_output&, const std::vector<symbol>&, std::vector<std::string> execution_input, const std::vector<std::string>& execution_output);
+}
+
diff --git a/prototypes/molasses/basic_file.mol b/prototypes/molasses/basic_file.mol
new file mode 100644
index 0000000..ff4ed3d
--- /dev/null
+++ b/prototypes/molasses/basic_file.mol
@@ -0,0 +1,7 @@
+__PROC__ procedure_name
+i16 i8
+__--__
+i32
+__DO__
+i32 __CAST__ __SWAP__ i32 __CAST__ *
+__END__
\ No newline at end of file
diff --git a/prototypes/molasses/first_rountrip.mol b/prototypes/molasses/first_rountrip.mol
new file mode 100644
index 0000000..42fb50e
--- /dev/null
+++ b/prototypes/molasses/first_rountrip.mol
@@ -0,0 +1,17 @@
+__PROC__ write
+i64 i8 ptr i64
+__--__
+i32
+__DO__
+    __LET__ size, ptr, fd
+    size ptr __CAST_I64__ fd 1_i64 __SYSCALL4__
+    __END_LET__
+__END__
+
+proc __DEREF_I64_PTR__
+i64 ptr
+--
+i64
+DO
+
+END
\ No newline at end of file
diff --git
a/src/main.cpp b/src/main.cpp
new file mode 100644
index 0000000..1779e2b
--- /dev/null
+++ b/src/main.cpp
@@ -0,0 +1,63 @@
+#include "molasses/lexer.h"
+#include "molasses/parser_primitives.h"
+#include <exception>
+#include <iostream>
+#include <memory>
+#include <string>
+#include <vector>
+
+// Parses a hard-coded `sum` procedure with a context that knows the integer
+// types and the `+` operation. Exits with 1 and a message when parsing throws.
+int main() {
+    /*
+    molasses::lexed_output initial;
+
+    initial.dictionary[1] = "+";
+    {
+        auto v = molasses::lex("hello hello potato 128 hello 128 +");
+        auto v2 = molasses::lex("salad hello potato 129 hello 128");
+        for (auto symbol: v.symbols) {
+            std::cout << "v: " << symbol << " - " << v.dictionary.at(symbol) << "\n";
+        }
+        std::cout << "\n";
+        for (auto symbol: v2.symbols) {
+            std::cout << "v2: " << symbol << " - " << v2.dictionary.at(symbol) << "\n";
+        }
+
+        auto v_merged = molasses::concatenate(initial, molasses::concatenate(v, v2));
+
+        std::cout << "\n";
+        for (auto symbol: v_merged.symbols) {
+            std::cout << "v_merged: " << symbol << " - " << v_merged.dictionary.at(symbol) << "\n";
+        }
+    }
+
+    auto v = molasses::lex("1 2 +");
+    molasses::parser_context ctx;
+    ctx = molasses::register_integers(ctx);
+    ctx.operations.emplace_back(std::make_shared<molasses::primitive_operation>(std::string{"+"}, std::vector<std::string>({"i32", "i32"}), std::vector<std::string>({"i32"})));
+    if(molasses::type_check(ctx, v, v.symbols, {}, {"i32"})) {
+        std::cout << "Checks out\n";
+    }*/
+
+    auto lexed = molasses::lex("__PROC__ sum\n"
+                               "i32 i32\n"
+                               "__--__\n"
+                               "i32\n"
+                               "__DO__\n"
+                               "+\n"
+                               "__END__");
+
+    molasses::parser_context ctx;
+    ctx = molasses::register_integers(ctx);
+    ctx.operations.emplace_back(std::make_shared<molasses::primitive_operation>(std::string{"+"}, std::vector<std::string>({"i32", "i32"}), std::vector<std::string>({"i32"})));
+
+    try {
+        molasses::parse(ctx, lexed);
+    } catch(const std::exception& error) {
+        // parse() signals malformed or ill-typed input by throwing
+        std::cerr << error.what() << "\n";
+        return 1;
+    }
+    return 0;
+}
diff --git a/src/molasses/lexer.cpp b/src/molasses/lexer.cpp
new file mode 100644
index 0000000..5e9b790
--- /dev/null
+++ b/src/molasses/lexer.cpp
@@ -0,0 +1,117 @@
+#include "molasses/lexer.h"
+
+#include <algorithm>
+#include <cctype>
+#include <sstream>
+
+namespace molasses {
+    // Splits source on whitespace and turns every token into a symbol.
+    // Symbols start at 1 and are handed out in order of first appearance; a
+    // token seen again reuses the symbol it was given the first time.
+    lexed_output lex(const std::string & source) {
+        lexed_output output;
+        std::map<std::string, int> reverse_dictionary;
+        std::stringstream builder;
+        int token_counter = 1;
+
+        // Processes the current token into the output if it is not empty
+        // This should be called upon reaching the end of a token
+        const auto process_token = [&](const std::string& token) {
+            if(not token.empty()) {
+                symbol current_symbol;
+                if(
+                    auto it = reverse_dictionary.find(token);
+                    it == reverse_dictionary.end()
+                ) {
+                    reverse_dictionary[token] = token_counter;
+                    output.dictionary[token_counter] = token;
+                    current_symbol = token_counter;
+                    token_counter++;
+                } else {
+                    current_symbol = it->second;
+                }
+                output.symbols.push_back(current_symbol);
+                builder = std::stringstream();
+            }
+        };
+
+        for(auto& character : source) {
+            // std::isspace is only defined for values representable as
+            // unsigned char (and EOF), hence the cast of the plain char
+            if(std::isspace(static_cast<unsigned char>(character))) {
+                process_token(builder.str());
+            } else {
+                builder << character;
+            }
+        }
+        process_token(builder.str()); // process the last token if needed
+        return output;
+    }
+
+    using conversion_table = std::map<symbol, symbol>;
+
+    // Builds the lexed_output of lhs followed by rhs. Symbols of lhs are kept
+    // as they are; symbols of rhs are renumbered: text already known to lhs
+    // takes the symbol lhs uses, new text gets symbols above the largest one
+    // of lhs.
+    lexed_output concatenate(const lexed_output& lhs, const lexed_output& rhs) {
+
+        // primitive that flips keys and values of dictionaries
+        constexpr auto dictionary_reversal = [](auto& destination,const auto& source) {
+            for(auto& it : source) {
+                destination.insert_or_assign(it.second, it.first);
+            }
+        };
+
+        // primitive that merges a dictionary into a reversed one and returns a conversion table of symbols
+        // from the dictionary to the newly generated reverse dictionary
+        auto build_reverse_dictionary = [dictionary_reversal] (auto& reverse_dictionary, const auto& dictionary) -> conversion_table {
+            // Make the right dictionary into a reverse dictionary
+            std::map<std::string, int> right_reverse_dictionary;
+            dictionary_reversal(right_reverse_dictionary, dictionary);
+
+            // find the maximum token id in the left dictionary
+            int max_token = 0;
+            if(not reverse_dictionary.empty()) {
+                max_token = std::max_element(
+                    reverse_dictionary.begin(),
+                    reverse_dictionary.end(),
+                    [](const auto &lhs, const auto &rhs) -> bool {
+                        return lhs.second < rhs.second;
+                    }
+                )->second;
+            }
+
+            // make the conversions and update the reverse dictionary
+            conversion_table conversions;
+            for(auto& [key, value] : right_reverse_dictionary) {
+                if(auto match = reverse_dictionary.find(key); match != reverse_dictionary.end()) {
+                    conversions[value] = match->second;
+                } else {
+                    max_token+=1;
+                    conversions[value] = max_token;
+                    reverse_dictionary[key] = max_token;
+                }
+            }
+
+            return conversions;
+        };
+
+        std::map<std::string, int> reverse_dictionary;
+        dictionary_reversal(reverse_dictionary, lhs.dictionary);
+
+        auto conversions = build_reverse_dictionary(reverse_dictionary, rhs.dictionary);
+
+        // Start from the symbols of lhs, then append the converted ones of rhs
+        lexed_output output{.symbols = lhs.symbols};
+        output.symbols.reserve(lhs.symbols.size() + rhs.symbols.size());
+        for(const auto old_symbol : rhs.symbols) {
+            // operator[] on purpose: a symbol missing from rhs.dictionary
+            // converts to 0, as a value-initialized table entry
+            output.symbols.push_back(conversions[old_symbol]);
+        }
+
+        dictionary_reversal(output.dictionary, reverse_dictionary);
+        return output;
+    }
+}
\ No newline at end of file
diff --git a/src/molasses/parser_primitives.cpp b/src/molasses/parser_primitives.cpp
new file mode 100644
index 0000000..0eee99c
--- /dev/null
+++ b/src/molasses/parser_primitives.cpp
@@ -0,0 +1,260 @@
+#include <algorithm>
+#include <charconv>
+#include <iterator>
+#include <system_error>
+#include <utility>
+#include "molasses/parser_primitives.h"
+
+namespace molasses {
+    // Registers the signed and unsigned integer types of 1, 2, 4 and 8 bytes
+    parser_context register_integers(parser_context ctx) {
+        ctx.types.push_back(std::make_shared<primitive_type>("i8",1));
+        ctx.types.push_back(std::make_shared<primitive_type>("i16",2));
+        ctx.types.push_back(std::make_shared<primitive_type>("i32",4));
+        ctx.types.push_back(std::make_shared<primitive_type>("i64",8));
+        ctx.types.push_back(std::make_shared<primitive_type>("u8",1));
+        ctx.types.push_back(std::make_shared<primitive_type>("u16",2));
+        ctx.types.push_back(std::make_shared<primitive_type>("u32",4));
+        ctx.types.push_back(std::make_shared<primitive_type>("u64",8));
+
+        return ctx;
+    }
+
+    // Pops the argument types of next_op from the top of the stack, checking
+    // each against what the stack holds, then pushes its return types
+    std::vector<std::string> operator>>(std::vector<std::string> current_stack, const operation& next_op) {
+        {
+            auto args = next_op.argument_types();
+            while(not (args.empty() or current_stack.empty())) {
+                if(current_stack.back() != args.back()) {
+                    throw TypeInputError();
+                } else {
+                    args.pop_back();
+                    current_stack.pop_back();
+                }
+            }
+            // The stack ran out before every argument was matched
+            if(not args.empty()) {
+                throw ValueMissingError();
+            }
+        }
+        {
+            auto return_types = next_op.return_types();
+            std::move(return_types.begin(), return_types.end(), std::back_inserter(current_stack));
+        }
+        return current_stack;
+    }
+
+    // Reads str as a base 10 int32_t. The result is empty unless the whole
+    // string was consumed and the number fits.
+    std::optional<int32_t> try_parse_int32(const std::string& str) {
+        int32_t value{};
+        auto begin = str.data();
+        auto end = str.data()+str.size();
+        auto result = std::from_chars(begin, end, value, 10);
+        // TODO: Add other bases
+        // When from_chars fails it leaves value untouched, and ptr may still
+        // equal end (empty string, number out of range): ec has to be checked
+        if(result.ec == std::errc{} and result.ptr == end) {
+            return value;
+        }
+        return std::nullopt;
+    }
+
+    // Returns the first pointer of container whose pointee is called name, an
+    // empty pointer when there is none
+    template<typename container_type>
+    auto find_ptr_by_name_in_container(const container_type& container, const auto& name) -> typename container_type::value_type {
+        auto it = std::find_if(std::begin(container), std::end(container), [&](const auto& elem){
+            return elem->name() == name;
+        });
+        if(it != std::end(container)) {
+            return *it;
+        }
+        return {};
+    }
+
+    std::shared_ptr<type> parser_context::lookup_type(const std::string & name) const {
+        return find_ptr_by_name_in_container(types, name);
+    }
+
+    std::shared_ptr<operation> parser_context::lookup_operation(const std::string & name) const {
+        return find_ptr_by_name_in_container(operations, name);
+    }
+
+    // Runs consumed_stream over a stack of type names that starts out as
+    // execution_input: an integer literal pushes "i32", the name of a known
+    // operation applies it with operator>>, any other symbol is skipped.
+    // The check passes when the final stack equals execution_output.
+    bool type_check(
+        const parser_context& parser_state,
+        const lexed_output& lexer_state,
+        const std::vector<symbol>& consumed_stream,
+        std::vector<std::string> execution_input,
+        const std::vector<std::string>& execution_output
+    ) {
+        auto& type_stack = execution_input;
+
+        for(const auto& current : consumed_stream) {
+            const auto& symbol_text = lexer_state.dictionary.at(current);
+            if(auto is_int = try_parse_int32(symbol_text); is_int) {
+                type_stack.emplace_back("i32");
+            } else if(auto is_op = parser_state.lookup_operation(symbol_text); is_op) {
+                type_stack = std::move(type_stack) >> *is_op;
+            }
+        }
+
+        return type_stack == execution_output;
+    }
+
+    // Parses the procedure at the start of lexer_data, registers it in the
+    // context and type checks its body against its declared signature.
+    // Tokens that follow the first __END__ are not looked at yet.
+    parser_context parse(parser_context ctx, const lexed_output& lexer_data) {
+        enum op : int {
+            DO_KW = 1,
+            SEPARATOR_KW,
+            PROC_KW,
+            END_KW
+        };
+
+        // Merging the input into a dictionary that already holds the keywords
+        // gives them the symbol values of the enum above
+        lexed_output fake;
+        fake.dictionary[PROC_KW] = "__PROC__";
+        fake.dictionary[SEPARATOR_KW] = "__--__";
+        fake.dictionary[DO_KW] = "__DO__";
+        fake.dictionary[END_KW] = "__END__";
+
+        auto tokens = concatenate(fake, lexer_data);
+
+        std::vector<std::shared_ptr<procedure_operation>> parsed_procedures;
+
+        auto parse_proc = [&](auto it) -> std::pair<decltype(it), std::shared_ptr<procedure_operation>> {
+            // Throws when the stream ends where a token is still needed
+            const auto require_token = [&] {
+                if(it == tokens.symbols.end()) {
+                    throw ExpectingTokenError();
+                }
+            };
+            const auto advance = [&] {
+                ++it;
+                require_token();
+            };
+
+            // An empty stream must not be dereferenced
+            require_token();
+            if(*it != PROC_KW) {
+                throw UnexpectedTokenError();
+            }
+            advance();
+
+            std::string name = tokens.dictionary.at(*it);
+            advance();
+
+            // Process arguments list
+            std::vector<std::string> argument_types;
+            while(*it != SEPARATOR_KW) {
+                argument_types.emplace_back(tokens.dictionary.at(*it));
+                advance();
+            }
+            advance();
+
+            // Process return types list
+            std::vector<std::string> return_types;
+            while(*it != DO_KW) {
+                return_types.emplace_back(tokens.dictionary.at(*it));
+                advance();
+            }
+            advance();
+
+            // Process the body
+            std::vector<symbol> body;
+            while(*it != END_KW) {
+                body.emplace_back(*it);
+                advance();
+            }
+            // __END__ may be the last token: step over it without checking
+            ++it;
+
+            return std::make_pair(it, std::make_shared<procedure_operation>(std::move(name), std::move(argument_types), std::move(return_types), std::move(body)));
+        };
+
+        auto procedure = parse_proc(tokens.symbols.begin()).second;
+        ctx.operations.push_back(procedure);
+        parsed_procedures.emplace_back(std::move(procedure));
+
+        for(auto& proc : parsed_procedures) {
+            if(not type_check(ctx, tokens, proc->_body, proc->_args, proc->_rets)) {
+                throw ProcedureStackError();
+            }
+        }
+
+        return ctx;
+    }
+
+    // Listing that reserves storage for stack_instruction and defines
+    // initialize_callstack. Every entry ends with its own line break.
+    std::vector<std::string> initialize_stack() {
+        return {
+            ".bss\n",// TODO: make threadlocal
+            "stack_instruction:\n",
+            " .quad 0\n",
+            ".text\n",
+            "initialize_callstack:\n",
+            " movq $9, %rax\n",
+            " movq $0, %rdi\n",
+            " movq $8192, %rsi\n",
+            " movq $3, %rdx\n",
+            " movq $34, %r10\n",
+            " movq $-1, %r8\n",
+            " movq $0, %r9\n",
+            " syscall\n",
+            " movq %rax, (stack_instruction)\n",
+            " retq\n",
+        };
+    }
+
+    // Listing for a call to target; each call gets a fresh return label
+    std::vector<std::string> generate_call(std::string target) {
+        static uint64_t label_count= 0;
+        return {
+            "movq return_label_n"+std::to_string(label_count)+", (stack_instruction)\n",
+            "addq $8, stack_instruction\n",
+            "jmp "+target+"\n",
+            "return_label_n"+std::to_string(label_count++)+":\n"
+        };
+    }
+
+    // Emits the label of the procedure followed by the sequence returning to
+    // the caller; the body itself is not generated yet.
+    // Throws TypeInputError when the signature names a type unknown to ctx.
+    std::vector<std::string> procedure_operation::generate(const parser_context& ctx) const {
+        // Sums the byte sizes of the named types
+        const auto stack_bytes = [&](const std::vector<std::string>& type_names) {
+            size_t total = 0;
+            for(const auto& elem : type_names) {
+                const auto found = ctx.lookup_type(elem);
+                if(not found) {
+                    throw TypeInputError();
+                }
+                total += found->byte_size();
+            }
+            return total;
+        };
+        // Not used by the generated code so far
+        [[maybe_unused]] const size_t initial_stack = stack_bytes(argument_types());
+        [[maybe_unused]] const size_t final_stack = stack_bytes(return_types());
+
+        std::vector<std::string> ops;
+        ops.emplace_back(name()+":\n");
+
+        // Return to caller
+        ops.emplace_back(" addq $-8, stack_instruction\n");
+        ops.emplace_back(" movq (stack_instruction), %rax\n");
+        ops.emplace_back(" pushq %rax\n");
+        ops.emplace_back(" retq\n");
+
+        return ops;
+    }
+}