|
|
- #include "molasses/lexer.h"
-
- #include <algorithm>
- #include <sstream>
- #include <iostream>
-
- namespace molasses {
- lexed_output lex(const std::string& file_name, const std::string & source) {
- lexed_output output;
- std::map<std::string, int> reverse_dictionary;
- std::stringstream builder;
- int token_counter = 1;
- int line = 1;
- int column = 0;
-
- enum class state_machine_t {
- normal,
- string,
- string_escape,
- string_end,
- };
-
- state_machine_t state = state_machine_t::normal;
-
- // Processes the current token into the output if it is not empty
- // This should be called upon reaching the end of a token
- const auto process_token = [&](const std::string& token) {
- if(not token.empty()) {
- symbol current_symbol;
- if(
- auto it = reverse_dictionary.find(token);
- it == reverse_dictionary.end()
- ) {
- reverse_dictionary[token] = token_counter;
- output.dictionary[token_counter] = token;
- current_symbol = {token_counter, file_name, line, column, state == state_machine_t::string_end};
- token_counter++;
- } else {
- current_symbol = {it->second, file_name, line, column, state == state_machine_t::string_end};
- }
- output.symbols.push_back(current_symbol);
- builder = std::stringstream();
- }
- };
-
- for(auto& character : source) {
- if(character == '\n') {
- line++;
- column = 0;
- }
- column++;
- if(state == state_machine_t::string_escape) {
- switch(character) {
- case 'n': builder << '\n'; break;
- case 't': builder << '\t'; break;
- case '\\': [[fallthrough]];
- default:
- builder << character;
- }
- state = state_machine_t::string;
- continue;
- }
- if(character == '\"') {
- if(builder.str().empty() && state == state_machine_t::normal) {
- state = state_machine_t::string;
- continue;
- } else if (state == state_machine_t::string) {
- state = state_machine_t::string_end;
- continue;
- }
- } else if(character == '\\' && state == state_machine_t::string) {
- state = state_machine_t::string_escape;
- continue;
- }
- if(std::isspace(character)) {
- if(state == state_machine_t::normal or state == state_machine_t::string_end) {
- process_token(builder.str());
- state = state_machine_t::normal;
- } else {
- builder << character;
- }
- } else {
- if(state == state_machine_t::string_end) {
- std::stringstream quoted;
- quoted << "\"" << builder.str() << "\"";
- builder.swap(quoted);
- state = state_machine_t::normal;
- }
- builder << character;
- }
- }
- process_token(builder.str()); // process the last token if needed
- return output;
- }
-
- using conversion_table = std::map<int, int>;
-
- lexed_output concatenate(const lexed_output& lhs, const lexed_output& rhs) {
-
- // primitive that flips keys and values of dictionaries
- constexpr auto dictionary_reversal = [](auto& destination,const auto& source) {
- for(auto& it : source) {
- destination.insert_or_assign(it.second, it.first);
- }
- };
-
- // primitive that merges a dictionary into a reversed one and returns a conversion table of symbols
- // from the dictionary to the newly generated reverse dictionary
- auto build_reverse_dictionary = [dictionary_reversal] (auto& reverse_dictionary, auto dictionary) -> conversion_table {
- // Make the right dictionary into a reverse dictionary
- std::map<std::string, int> right_reverse_dictionary;
- dictionary_reversal(right_reverse_dictionary, dictionary);
-
- // find the maximum token id in the left dictionary
- int max_token = 0;
- if(not reverse_dictionary.empty()) {
- max_token = std::max_element(
- reverse_dictionary.begin(),
- reverse_dictionary.end(),
- [](const auto &lhs, const auto &rhs) -> bool {
- return lhs.second < rhs.second;
- }
- )->second;
- }
-
- // make the conversions and update the reverse dictionary
- conversion_table conversions;
- for(auto& [key, value] : right_reverse_dictionary) {
- if(auto match = reverse_dictionary.find(key); match != reverse_dictionary.end()) {
- conversions[value] = match->second;
- } else {
- max_token+=1;
- conversions[value] = max_token;
- reverse_dictionary[key] = max_token;
- }
- }
-
- return conversions;
- };
-
- std::map<std::string, int> reverse_dictionary;
- dictionary_reversal(reverse_dictionary, lhs.dictionary);
-
- auto conversions = build_reverse_dictionary(reverse_dictionary, rhs.dictionary);
- auto symbol_stream = rhs.symbols;
- lexed_output output{.symbols = lhs.symbols};
-
- for(auto& old_symbol : symbol_stream) {
- //This diagnostic is pretty lousy, but that is what happens when keys are taken by reference
- #pragma clang diagnostic push
- #pragma ide diagnostic ignored "LocalValueEscapesScope"
- old_symbol.id = conversions[old_symbol];
- #pragma clang diagnostic pop
- }
-
- dictionary_reversal(output.dictionary, reverse_dictionary);
- std::copy(symbol_stream.begin(), symbol_stream.end(), std::back_inserter(output.symbols));
- return output;
- }
- }
|