#include "molasses/lexer.h"

#include <algorithm>
#include <cctype>
#include <iostream>
#include <sstream>
|
|
|
|
namespace molasses {
|
|
/**
 * Splits `source` into whitespace-separated tokens and interns every distinct
 * token text under an integer id.
 *
 * Lexing rules:
 *  - Tokens are delimited by any character for which std::isspace is true.
 *  - A `"` seen while no token is being accumulated opens a string literal.
 *    Inside it whitespace is kept, `\n` and `\t` are translated to a newline
 *    and a tab, and any other escaped character is taken literally.
 *  - A closing `"` followed by whitespace (or end of input) yields a symbol
 *    whose last initializer (the string flag) is true.
 *  - A closing `"` followed by a non-whitespace character demotes the literal:
 *    its text is wrapped in quotes again and lexing continues as an ordinary
 *    token.
 *
 * Each symbol records `file_name` and the 1-based line/column of the character
 * that terminated the token (or of the last character when the token ends at
 * end of input).
 *
 * @param file_name name stored in every produced symbol
 * @param source    text to tokenize
 * @return the symbols in source order plus the id -> text dictionary
 */
lexed_output lex(const std::string& file_name, const std::string & source) {
    lexed_output output;
    std::map<std::string, int> reverse_dictionary; // token text -> id
    std::string builder;                           // text of the token being accumulated
    int token_counter = 1;                         // next id to hand out
    int line = 1;
    int column = 0;
    // The line counter is advanced one character late so that a newline (and the
    // token it terminates) is still attributed to the line it ends.
    bool previous_was_newline = false;

    enum class state_machine_t {
        normal,
        string,
        string_escape,
        string_end,
    };

    state_machine_t state = state_machine_t::normal;

    // Emits the accumulated token into the output if it is not empty.
    // This should be called upon reaching the end of a token.
    // NOTE(review): an empty string literal ("") leaves the builder empty and is
    // therefore dropped without producing a symbol - confirm this is intended.
    const auto process_token = [&]() {
        if(builder.empty()) {
            return;
        }

        int id = 0;
        if(auto it = reverse_dictionary.find(builder); it == reverse_dictionary.end()) {
            // First occurrence of this text: allocate a fresh id
            id = token_counter++;
            reverse_dictionary[builder] = id;
            output.dictionary[id] = builder;
        } else {
            id = it->second;
        }

        symbol current_symbol;
        current_symbol = {id, file_name, line, column, state == state_machine_t::string_end};
        output.symbols.push_back(current_symbol);
        builder.clear();
    };

    for(const char character : source) {
        if(previous_was_newline) {
            line++;
            column = 0;
        }
        previous_was_newline = (character == '\n');
        column++;

        if(state == state_machine_t::string_escape) {
            switch(character) {
                case 'n': builder += '\n'; break;
                case 't': builder += '\t'; break;
                case '\\': [[fallthrough]];
                default:
                    builder += character;
            }
            // Only the single character after the backslash is escaped
            state = state_machine_t::string;
            continue;
        }

        if(character == '\"') {
            if(builder.empty() && state == state_machine_t::normal) {
                state = state_machine_t::string;
                continue;
            } else if(state == state_machine_t::string) {
                state = state_machine_t::string_end;
                continue;
            }
            // Any other quote is an ordinary character handled below
        } else if(character == '\\' && state == state_machine_t::string) {
            state = state_machine_t::string_escape;
            continue;
        }

        // The cast avoids undefined behaviour for bytes with a negative char value
        if(std::isspace(static_cast<unsigned char>(character))) {
            if(state == state_machine_t::normal or state == state_machine_t::string_end) {
                process_token();
                state = state_machine_t::normal;
            } else {
                // Whitespace inside a string literal is part of the literal
                builder += character;
            }
        } else {
            if(state == state_machine_t::string_end) {
                // Something is glued to the closing quote: this was not a string
                // literal after all, so restore the quotes and keep going.
                builder = '"' + builder + '"';
                state = state_machine_t::normal;
            }
            builder += character;
        }
    }

    process_token(); // process the last token if needed
    return output;
}
|
|
|
|
// Maps a token id taken from one dictionary to the id the same token has in a merged dictionary
using conversion_table = std::map<int, int>;
|
|
|
|
/**
 * Joins two lexed outputs into one.
 *
 * The result holds the symbols of `lhs` followed by the symbols of `rhs`.
 * Tokens already known to `lhs` keep their ids; tokens that only appear in
 * `rhs.dictionary` receive new ids above the largest id of `lhs`, handed out
 * in the iteration order of a std::map keyed by token text. The ids of the
 * `rhs` symbols are rewritten to match the merged dictionary.
 *
 * @param lhs output whose ids are preserved
 * @param rhs output whose ids are remapped
 * @return the merged symbols and dictionary
 */
lexed_output concatenate(const lexed_output& lhs, const lexed_output& rhs) {

    // Copies every (key, value) entry of `origin` into `target` as (value, key)
    constexpr auto flip_into = [](auto& target, const auto& origin) {
        for(const auto& [key, value] : origin) {
            target.insert_or_assign(value, key);
        }
    };

    // token text -> id in the merged output, seeded with the left-hand side
    std::map<std::string, int> merged_ids;
    flip_into(merged_ids, lhs.dictionary);

    // token text -> id as used by the right-hand side
    std::map<std::string, int> incoming_ids;
    flip_into(incoming_ids, rhs.dictionary);

    // Largest id used by the left-hand side; stays 0 when it has no tokens
    int highest_id = 0;
    const auto top = std::max_element(
        merged_ids.begin(),
        merged_ids.end(),
        [](const auto& a, const auto& b) -> bool {
            return a.second < b.second;
        }
    );
    if(top != merged_ids.end()) {
        highest_id = top->second;
    }

    // Work out where every right-hand id lands, growing the merged table as needed
    conversion_table remap;
    for(const auto& [text, rhs_id] : incoming_ids) {
        const auto known = merged_ids.find(text);
        if(known == merged_ids.end()) {
            ++highest_id;
            merged_ids[text] = highest_id;
            remap[rhs_id] = highest_id;
        } else {
            remap[rhs_id] = known->second;
        }
    }

    lexed_output output{.symbols = lhs.symbols};
    flip_into(output.dictionary, merged_ids);

    for(auto translated : rhs.symbols) {
        // operator[] default-inserts 0 for an id that is absent from rhs.dictionary
        //This diagnostic is pretty lousy, but that is what happens when keys are taken by reference
#pragma clang diagnostic push
#pragma ide diagnostic ignored "LocalValueEscapesScope"
        translated.id = remap[translated];
#pragma clang diagnostic pop
        output.symbols.push_back(translated);
    }

    return output;
}
|
|
}
|