Du kan inte välja fler än 25 ämnen. Ämnen måste starta med en bokstav eller siffra, kan innehålla bindestreck ('-') och vara max 35 tecken långa.

159 rader
5.5 KiB

1 år sedan
1 år sedan
1 år sedan
1 år sedan
1 år sedan
1 år sedan
1 år sedan
  1. #include "molasses/lexer.h"
  2. #include <algorithm>
  3. #include <sstream>
  4. #include <iostream>
  5. namespace molasses {
  6. lexed_output lex(const std::string& file_name, const std::string & source) {
  7. lexed_output output;
  8. std::map<std::string, int> reverse_dictionary;
  9. std::stringstream builder;
  10. int token_counter = 1;
  11. int line = 1;
  12. int column = 0;
  13. enum class state_machine_t {
  14. normal,
  15. string,
  16. string_escape,
  17. string_end,
  18. };
  19. state_machine_t state = state_machine_t::normal;
  20. // Processes the current token into the output if it is not empty
  21. // This should be called upon reaching the end of a token
  22. const auto process_token = [&](const std::string& token) {
  23. if(not token.empty()) {
  24. symbol current_symbol;
  25. if(
  26. auto it = reverse_dictionary.find(token);
  27. it == reverse_dictionary.end()
  28. ) {
  29. reverse_dictionary[token] = token_counter;
  30. output.dictionary[token_counter] = token;
  31. current_symbol = {token_counter, file_name, line, column, state == state_machine_t::string_end};
  32. token_counter++;
  33. } else {
  34. current_symbol = {it->second, file_name, line, column, state == state_machine_t::string_end};
  35. }
  36. output.symbols.push_back(current_symbol);
  37. builder = std::stringstream();
  38. }
  39. };
  40. for(auto& character : source) {
  41. column++;
  42. if(state == state_machine_t::string_escape) {
  43. switch(character) {
  44. case 'n': builder << '\n'; break;
  45. case 't': builder << '\t'; break;
  46. case '\\': [[fallthrough]];
  47. default:
  48. builder << character;
  49. }
  50. state = state_machine_t::string;
  51. continue;
  52. }
  53. if(character == '\"') {
  54. if(builder.str().empty() && state == state_machine_t::normal) {
  55. state = state_machine_t::string;
  56. continue;
  57. } else if (state == state_machine_t::string) {
  58. state = state_machine_t::string_end;
  59. continue;
  60. }
  61. } else if(character == '\\' && state == state_machine_t::string) {
  62. state = state_machine_t::string_escape;
  63. continue;
  64. }
  65. if(std::isspace(character)) {
  66. if(state == state_machine_t::normal or state == state_machine_t::string_end) {
  67. process_token(builder.str());
  68. state = state_machine_t::normal;
  69. } else {
  70. builder << character;
  71. }
  72. if(character == '\n') {
  73. line++;
  74. column = 0;
  75. }
  76. } else {
  77. if(state == state_machine_t::string_end) {
  78. std::stringstream quoted;
  79. quoted << "\"" << builder.str() << "\"";
  80. builder.swap(quoted);
  81. state = state_machine_t::normal;
  82. }
  83. builder << character;
  84. }
  85. }
  86. process_token(builder.str()); // process the last token if needed
  87. return output;
  88. }
  89. using conversion_table = std::map<int, int>;
  90. lexed_output concatenate(const lexed_output& lhs, const lexed_output& rhs) {
  91. // primitive that flips keys and values of dictionaries
  92. constexpr auto dictionary_reversal = [](auto& destination,const auto& source) {
  93. for(auto& it : source) {
  94. destination.insert_or_assign(it.second, it.first);
  95. }
  96. };
  97. // primitive that merges a dictionary into a reversed one and returns a conversion table of symbols
  98. // from the dictionary to the newly generated reverse dictionary
  99. auto build_reverse_dictionary = [dictionary_reversal] (auto& reverse_dictionary, auto dictionary) -> conversion_table {
  100. // Make the right dictionary into a reverse dictionary
  101. std::map<std::string, int> right_reverse_dictionary;
  102. dictionary_reversal(right_reverse_dictionary, dictionary);
  103. // find the maximum token id in the left dictionary
  104. int max_token = 0;
  105. if(not reverse_dictionary.empty()) {
  106. max_token = std::max_element(
  107. reverse_dictionary.begin(),
  108. reverse_dictionary.end(),
  109. [](const auto &lhs, const auto &rhs) -> bool {
  110. return lhs.second < rhs.second;
  111. }
  112. )->second;
  113. }
  114. // make the conversions and update the reverse dictionary
  115. conversion_table conversions;
  116. for(auto& [key, value] : right_reverse_dictionary) {
  117. if(auto match = reverse_dictionary.find(key); match != reverse_dictionary.end()) {
  118. conversions[value] = match->second;
  119. } else {
  120. max_token+=1;
  121. conversions[value] = max_token;
  122. reverse_dictionary[key] = max_token;
  123. }
  124. }
  125. return conversions;
  126. };
  127. std::map<std::string, int> reverse_dictionary;
  128. dictionary_reversal(reverse_dictionary, lhs.dictionary);
  129. auto conversions = build_reverse_dictionary(reverse_dictionary, rhs.dictionary);
  130. auto symbol_stream = rhs.symbols;
  131. lexed_output output{.symbols = lhs.symbols};
  132. for(auto& old_symbol : symbol_stream) {
  133. //This diagnostic is pretty lousy, but that is what happens when keys are taken by reference
  134. #pragma clang diagnostic push
  135. #pragma ide diagnostic ignored "LocalValueEscapesScope"
  136. old_symbol.id = conversions[old_symbol];
  137. #pragma clang diagnostic pop
  138. }
  139. dictionary_reversal(output.dictionary, reverse_dictionary);
  140. std::copy(symbol_stream.begin(), symbol_stream.end(), std::back_inserter(output.symbols));
  141. return output;
  142. }
  143. }